diff options
237 files changed, 19661 insertions, 9085 deletions
diff --git a/Android.bp b/Android.bp index 8708fa18b..c63961b5a 100644 --- a/Android.bp +++ b/Android.bp @@ -109,6 +109,7 @@ libvpx_arm_neon_c_srcs = [ "libvpx/vp9/decoder/vp9_dsubexp.c", "libvpx/vp9/decoder/vp9_job_queue.c", "libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c", + "libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c", "libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c", "libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c", "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c", @@ -153,9 +154,9 @@ libvpx_arm_neon_c_srcs = [ "libvpx/vpx_dsp/arm/avg_pred_neon.c", "libvpx/vpx_dsp/arm/fdct16x16_neon.c", "libvpx/vpx_dsp/arm/fdct32x32_neon.c", - "libvpx/vpx_dsp/arm/fdct_neon.c", + "libvpx/vpx_dsp/arm/fdct4x4_neon.c", + "libvpx/vpx_dsp/arm/fdct8x8_neon.c", "libvpx/vpx_dsp/arm/fdct_partial_neon.c", - "libvpx/vpx_dsp/arm/fwd_txfm_neon.c", "libvpx/vpx_dsp/arm/hadamard_neon.c", "libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c", "libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c", @@ -166,6 +167,9 @@ libvpx_arm_neon_c_srcs = [ "libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c", "libvpx/vpx_dsp/arm/highbd_intrapred_neon.c", "libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c", + "libvpx/vpx_dsp/arm/highbd_quantize_neon.c", + "libvpx/vpx_dsp/arm/highbd_sad_neon.c", + "libvpx/vpx_dsp/arm/highbd_variance_neon.c", "libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c", "libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c", "libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c", @@ -349,6 +353,7 @@ libvpx_arm64_c_srcs = [ "libvpx/vp9/decoder/vp9_dsubexp.c", "libvpx/vp9/decoder/vp9_job_queue.c", "libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c", + "libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c", "libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c", "libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c", "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c", @@ -393,9 +398,9 @@ libvpx_arm64_c_srcs = [ "libvpx/vpx_dsp/arm/avg_pred_neon.c", "libvpx/vpx_dsp/arm/fdct16x16_neon.c", 
"libvpx/vpx_dsp/arm/fdct32x32_neon.c", - "libvpx/vpx_dsp/arm/fdct_neon.c", + "libvpx/vpx_dsp/arm/fdct4x4_neon.c", + "libvpx/vpx_dsp/arm/fdct8x8_neon.c", "libvpx/vpx_dsp/arm/fdct_partial_neon.c", - "libvpx/vpx_dsp/arm/fwd_txfm_neon.c", "libvpx/vpx_dsp/arm/hadamard_neon.c", "libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c", "libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c", @@ -406,6 +411,9 @@ libvpx_arm64_c_srcs = [ "libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c", "libvpx/vpx_dsp/arm/highbd_intrapred_neon.c", "libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c", + "libvpx/vpx_dsp/arm/highbd_quantize_neon.c", + "libvpx/vpx_dsp/arm/highbd_sad_neon.c", + "libvpx/vpx_dsp/arm/highbd_variance_neon.c", "libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c", "libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c", "libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c", @@ -746,6 +754,7 @@ libvpx_x86_c_srcs = [ "libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c", "libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c", "libvpx/vp9/encoder/x86/vp9_quantize_sse2.c", + "libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c", "libvpx/vp9/vp9_cx_iface.c", "libvpx/vp9/vp9_dx_iface.c", "libvpx/vp9/vp9_iface_common.c", @@ -981,6 +990,7 @@ libvpx_x86_64_c_srcs = [ "libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c", "libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c", "libvpx/vp9/encoder/x86/vp9_quantize_sse2.c", + "libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c", "libvpx/vp9/vp9_cx_iface.c", "libvpx/vp9/vp9_dx_iface.c", "libvpx/vp9/vp9_iface_common.c", @@ -1062,7 +1072,6 @@ libvpx_x86_64_asm_srcs = [ "libvpx/vp8/encoder/x86/fwalsh_sse2.asm", "libvpx/vp9/encoder/x86/vp9_dct_sse2.asm", "libvpx/vp9/encoder/x86/vp9_error_sse2.asm", - "libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm", "libvpx/vpx_dsp/x86/add_noise_sse2.asm", "libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm", "libvpx/vpx_dsp/x86/deblock_sse2.asm", diff --git a/README.android b/README.android index 38780acec..4fb133063 100644 --- 
a/README.android +++ b/README.android @@ -1,12 +1,12 @@ Name: libvpx URL: http://www.webmproject.org -Version: v1.12.0 +Version: v1.13.0 License: BSD License File: libvpx/LICENSE -Date: Thursday June 30 2022 -Branch: origin/torrent -Commit: 03265cd42b3783532de72f2ded5436652e6f5ce3 +Date: Wednesday March 01 2023 +Branch: ugly-duckling +Commit: d6eb9696aa72473c1a11d34d928d35a3acc0c9a9 Description: Contains the sources used to compile libvpx. diff --git a/README.version b/README.version index 7dfba96ef..9858b39c9 100644 --- a/README.version +++ b/README.version @@ -1,5 +1,5 @@ -URL: https://chromium.googlesource.com/webm/libvpx/+archive/v1.12.0.tar.gz -Version: v1.12.0 +URL: https://chromium.googlesource.com/webm/libvpx/ +Version: v1.13.0 BugComponent: 42195 Owners: jzern, jianj Local Modifications: diff --git a/config/arm-neon/vp9_rtcd.h b/config/arm-neon/vp9_rtcd.h index 01065e667..b2b2fc2dc 100644 --- a/config/arm-neon/vp9_rtcd.h +++ b/config/arm-neon/vp9_rtcd.h @@ -38,7 +38,8 @@ int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, #define vp9_block_error_fp vp9_block_error_fp_c int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -#define vp9_diamond_search_sad vp9_diamond_search_sad_c +int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_neon void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -62,7 +63,8 @@ void vp9_highbd_fht16x16_c(const int16_t *input, 
tran_low_t *output, int stride, #define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c +void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_neon void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c @@ -83,10 +85,12 @@ void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_neon void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c +void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_neon void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c +void vp9_highbd_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp_32x32 
vp9_highbd_quantize_fp_32x32_neon void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count); #define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c diff --git a/config/arm-neon/vpx_dsp_rtcd.h b/config/arm-neon/vpx_dsp_rtcd.h index 99abbb974..565105892 100644 --- a/config/arm-neon/vpx_dsp_rtcd.h +++ b/config/arm-neon/vpx_dsp_rtcd.h @@ -287,7 +287,8 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran #define vpx_hadamard_16x16 vpx_hadamard_16x16_neon void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); -#define vpx_hadamard_32x32 vpx_hadamard_32x32_c +void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_hadamard_32x32 vpx_hadamard_32x32_neon void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); @@ -297,409 +298,544 @@ void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c +void vpx_highbd_10_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_neon void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c +void vpx_highbd_10_get8x8var_neon(const uint8_t 
*src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_neon unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c +unsigned int vpx_highbd_10_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_neon unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c +unsigned int vpx_highbd_10_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_neon unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c +unsigned int vpx_highbd_10_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_neon unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c +unsigned int vpx_highbd_10_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance16x16 
vpx_highbd_10_sub_pixel_avg_variance16x16_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t 
*ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_neon uint32_t 
vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t 
*second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_neon uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c +uint32_t vpx_highbd_10_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t 
*ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_neon uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c +uint32_t vpx_highbd_10_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_neon uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c +uint32_t vpx_highbd_10_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_neon uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c +uint32_t vpx_highbd_10_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_neon uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c +uint32_t 
vpx_highbd_10_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_neon uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c +uint32_t vpx_highbd_10_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_neon uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c +uint32_t vpx_highbd_10_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_neon uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c +uint32_t vpx_highbd_10_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_neon uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define 
vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c +uint32_t vpx_highbd_10_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_neon uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c +uint32_t vpx_highbd_10_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_neon uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c +uint32_t vpx_highbd_10_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_neon uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c +uint32_t vpx_highbd_10_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_neon uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int 
x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c +uint32_t vpx_highbd_10_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_neon unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c +unsigned int vpx_highbd_10_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_neon unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c +unsigned int vpx_highbd_10_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_neon unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c +unsigned int vpx_highbd_10_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_neon unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c +unsigned int vpx_highbd_10_variance32x16_neon(const uint8_t *src_ptr, int src_stride, 
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_neon unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c +unsigned int vpx_highbd_10_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_neon unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c +unsigned int vpx_highbd_10_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_neon unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c +unsigned int vpx_highbd_10_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_neon unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c +unsigned int vpx_highbd_10_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_neon unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance64x32 
vpx_highbd_10_variance64x32_c +unsigned int vpx_highbd_10_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_neon unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c +unsigned int vpx_highbd_10_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_neon unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c +unsigned int vpx_highbd_10_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_neon unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c +unsigned int vpx_highbd_10_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_neon unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c +unsigned int vpx_highbd_10_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_neon void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int src_stride, 
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c +void vpx_highbd_12_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_neon void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c +void vpx_highbd_12_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_neon unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c +unsigned int vpx_highbd_12_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_neon unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c +unsigned int vpx_highbd_12_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_neon unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c +unsigned int vpx_highbd_12_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_neon unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c +unsigned int vpx_highbd_12_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, 
uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_neon uint32_t 
vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); 
-#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int 
x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_neon uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c +uint32_t vpx_highbd_12_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_neon uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c +uint32_t vpx_highbd_12_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_neon uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c +uint32_t vpx_highbd_12_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_neon uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance32x16 
vpx_highbd_12_sub_pixel_variance32x16_c +uint32_t vpx_highbd_12_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_neon uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c +uint32_t vpx_highbd_12_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_neon uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c +uint32_t vpx_highbd_12_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_neon uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c +uint32_t vpx_highbd_12_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_neon uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c +uint32_t vpx_highbd_12_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_neon uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c +uint32_t vpx_highbd_12_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_neon uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c +uint32_t vpx_highbd_12_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_neon uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c +uint32_t vpx_highbd_12_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_neon uint32_t 
vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c +uint32_t vpx_highbd_12_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_neon uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c +uint32_t vpx_highbd_12_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_neon unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c +unsigned int vpx_highbd_12_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_neon unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c +unsigned int vpx_highbd_12_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_neon unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse); -#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c +unsigned int vpx_highbd_12_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_neon unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c +unsigned int vpx_highbd_12_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_neon unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c +unsigned int vpx_highbd_12_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_neon unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c +unsigned int vpx_highbd_12_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_neon unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c +unsigned int vpx_highbd_12_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_neon 
unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c +unsigned int vpx_highbd_12_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_neon unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c +unsigned int vpx_highbd_12_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_neon unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c +unsigned int vpx_highbd_12_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_neon unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c +unsigned int vpx_highbd_12_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_neon unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c +unsigned int vpx_highbd_12_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t 
*ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_neon unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c +unsigned int vpx_highbd_12_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_neon void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c +void vpx_highbd_8_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_neon void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c +void vpx_highbd_8_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_neon unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c +unsigned int vpx_highbd_8_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_neon unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c +unsigned int vpx_highbd_8_mse16x8_neon(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_neon unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c +unsigned int vpx_highbd_8_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_neon unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c +unsigned int vpx_highbd_8_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); 
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, 
int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c +uint32_t 
vpx_highbd_8_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define 
vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_neon uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c +uint32_t vpx_highbd_8_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_neon uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c +uint32_t vpx_highbd_8_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_neon uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c +uint32_t 
vpx_highbd_8_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_neon uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c +uint32_t vpx_highbd_8_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_neon uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c +uint32_t vpx_highbd_8_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_neon uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c +uint32_t vpx_highbd_8_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_neon uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define 
vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c +uint32_t vpx_highbd_8_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_neon uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c +uint32_t vpx_highbd_8_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_neon uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c +uint32_t vpx_highbd_8_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_neon uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c +uint32_t vpx_highbd_8_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_neon uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, 
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c +uint32_t vpx_highbd_8_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_neon uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c +uint32_t vpx_highbd_8_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_neon uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c +uint32_t vpx_highbd_8_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_neon unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c +unsigned int vpx_highbd_8_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_neon unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define 
vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c +unsigned int vpx_highbd_8_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_neon unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c +unsigned int vpx_highbd_8_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_neon unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c +unsigned int vpx_highbd_8_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_neon unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c +unsigned int vpx_highbd_8_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_neon unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c +unsigned int vpx_highbd_8_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_neon unsigned int vpx_highbd_8_variance4x4_c(const 
uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c +unsigned int vpx_highbd_8_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_neon unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c +unsigned int vpx_highbd_8_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_neon unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c +unsigned int vpx_highbd_8_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_neon unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c +unsigned int vpx_highbd_8_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_neon unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c +unsigned int vpx_highbd_8_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 
vpx_highbd_8_variance8x16_neon unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c +unsigned int vpx_highbd_8_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_neon unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c +unsigned int vpx_highbd_8_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_neon unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p); #define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c @@ -708,7 +844,8 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p); #define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride); -#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_neon void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd); void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd); @@ -887,25 +1024,32 @@ void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, 
ptrdiff_t stride, const #define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_neon void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c +void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_neon void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c +void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_neon void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c +void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_neon void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c +void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_neon void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c +void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_neon void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c +void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_neon void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c +void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t 
*output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_neon void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride); @@ -1046,133 +1190,175 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp #define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c +void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_neon void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c +void vpx_highbd_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_neon unsigned int 
vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c +unsigned int vpx_highbd_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_neon unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c +unsigned int vpx_highbd_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_neon void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c +void vpx_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_neon unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c +unsigned int vpx_highbd_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_neon unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c +unsigned int vpx_highbd_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_neon void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const 
uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c +void vpx_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_neon unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c +unsigned int vpx_highbd_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_neon unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c +unsigned int vpx_highbd_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_neon void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c +void vpx_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_neon unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c +unsigned int vpx_highbd_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_neon unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define 
vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c +unsigned int vpx_highbd_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_neon void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c +void vpx_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_neon unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c +unsigned int vpx_highbd_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_neon unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c +unsigned int vpx_highbd_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_neon void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c +void vpx_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_neon unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad32x64 
vpx_highbd_sad32x64_c +unsigned int vpx_highbd_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_neon unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c +unsigned int vpx_highbd_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_neon void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c +void vpx_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_neon unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c +unsigned int vpx_highbd_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_neon unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c +unsigned int vpx_highbd_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_neon void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c +void vpx_highbd_sad4x4x4d_neon(const 
uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_neon unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c +unsigned int vpx_highbd_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_neon unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c +unsigned int vpx_highbd_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_neon void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c +void vpx_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_neon unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c +unsigned int vpx_highbd_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_neon unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c +unsigned int vpx_highbd_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t 
*second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_neon void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c +void vpx_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_neon unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c +unsigned int vpx_highbd_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_neon unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c +unsigned int vpx_highbd_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_neon void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c +void vpx_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_neon unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c +unsigned int vpx_highbd_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_neon 
unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c +unsigned int vpx_highbd_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_neon void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c +void vpx_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_neon unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c +unsigned int vpx_highbd_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_neon unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c +unsigned int vpx_highbd_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_neon void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c +void vpx_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_neon unsigned int vpx_highbd_sad8x8_c(const 
uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c +unsigned int vpx_highbd_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_neon unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c +unsigned int vpx_highbd_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_neon void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c +void vpx_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_neon int vpx_highbd_satd_c(const tran_low_t *coeff, int length); #define vpx_highbd_satd vpx_highbd_satd_c void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); -#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c +void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_neon void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); diff --git 
a/config/arm-neon/vpx_version.h b/config/arm-neon/vpx_version.h index a90ab60d9..a58bfac01 100644 --- a/config/arm-neon/vpx_version.h +++ b/config/arm-neon/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 12 +#define VERSION_MINOR 13 #define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.12.0" -#define VERSION_STRING " v1.12.0" +#define VERSION_STRING_NOSP "v1.13.0" +#define VERSION_STRING " v1.13.0" diff --git a/config/arm64/vp9_rtcd.h b/config/arm64/vp9_rtcd.h index 01065e667..b2b2fc2dc 100644 --- a/config/arm64/vp9_rtcd.h +++ b/config/arm64/vp9_rtcd.h @@ -38,7 +38,8 @@ int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, #define vp9_block_error_fp vp9_block_error_fp_c int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -#define vp9_diamond_search_sad vp9_diamond_search_sad_c +int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_neon void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -62,7 +63,8 @@ void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, #define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c +void 
vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_neon void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c @@ -83,10 +85,12 @@ void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_neon void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c +void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_neon void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c +void vp9_highbd_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_neon void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int 
use_32x32, uint32_t *accumulator, uint16_t *count); #define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c diff --git a/config/arm64/vpx_dsp_rtcd.h b/config/arm64/vpx_dsp_rtcd.h index 99abbb974..565105892 100644 --- a/config/arm64/vpx_dsp_rtcd.h +++ b/config/arm64/vpx_dsp_rtcd.h @@ -287,7 +287,8 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran #define vpx_hadamard_16x16 vpx_hadamard_16x16_neon void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); -#define vpx_hadamard_32x32 vpx_hadamard_32x32_c +void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_hadamard_32x32 vpx_hadamard_32x32_neon void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); @@ -297,409 +298,544 @@ void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c +void vpx_highbd_10_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_neon void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c +void vpx_highbd_10_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_neon unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int src_stride, const 
uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c +unsigned int vpx_highbd_10_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_neon unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c +unsigned int vpx_highbd_10_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_neon unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c +unsigned int vpx_highbd_10_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_neon unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c +unsigned int vpx_highbd_10_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define 
vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, 
int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance4x8 
vpx_highbd_10_sub_pixel_avg_variance4x8_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t 
*ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_neon uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_neon uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c +uint32_t vpx_highbd_10_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_neon uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, 
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c +uint32_t vpx_highbd_10_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_neon uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c +uint32_t vpx_highbd_10_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_neon uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c +uint32_t vpx_highbd_10_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_neon uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c +uint32_t vpx_highbd_10_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_neon uint32_t 
vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c +uint32_t vpx_highbd_10_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_neon uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c +uint32_t vpx_highbd_10_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_neon uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c +uint32_t vpx_highbd_10_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_neon uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c +uint32_t vpx_highbd_10_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define 
vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_neon uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c +uint32_t vpx_highbd_10_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_neon uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c +uint32_t vpx_highbd_10_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_neon uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c +uint32_t vpx_highbd_10_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_neon uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c +uint32_t vpx_highbd_10_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int 
x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_neon unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c +unsigned int vpx_highbd_10_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_neon unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c +unsigned int vpx_highbd_10_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_neon unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c +unsigned int vpx_highbd_10_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_neon unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c +unsigned int vpx_highbd_10_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_neon unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned 
int *sse); -#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c +unsigned int vpx_highbd_10_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_neon unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c +unsigned int vpx_highbd_10_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_neon unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c +unsigned int vpx_highbd_10_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_neon unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c +unsigned int vpx_highbd_10_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_neon unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c +unsigned int vpx_highbd_10_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_neon unsigned int 
vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c +unsigned int vpx_highbd_10_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_neon unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c +unsigned int vpx_highbd_10_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_neon unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c +unsigned int vpx_highbd_10_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_neon unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c +unsigned int vpx_highbd_10_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_neon void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c +void vpx_highbd_12_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned 
int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_neon void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c +void vpx_highbd_12_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_neon unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c +unsigned int vpx_highbd_12_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_neon unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c +unsigned int vpx_highbd_12_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_neon unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c +unsigned int vpx_highbd_12_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_neon unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c +unsigned int vpx_highbd_12_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, 
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c 
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t 
*second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_neon uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_neon uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int 
x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c +uint32_t vpx_highbd_12_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_neon uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c +uint32_t vpx_highbd_12_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_neon uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c +uint32_t vpx_highbd_12_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_neon uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c +uint32_t vpx_highbd_12_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 
vpx_highbd_12_sub_pixel_variance32x16_neon uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c +uint32_t vpx_highbd_12_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_neon uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c +uint32_t vpx_highbd_12_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_neon uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c +uint32_t vpx_highbd_12_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_neon uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c +uint32_t vpx_highbd_12_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t 
*ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_neon uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c +uint32_t vpx_highbd_12_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_neon uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c +uint32_t vpx_highbd_12_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_neon uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c +uint32_t vpx_highbd_12_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_neon uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c +uint32_t 
vpx_highbd_12_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_neon uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c +uint32_t vpx_highbd_12_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_neon unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c +unsigned int vpx_highbd_12_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_neon unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c +unsigned int vpx_highbd_12_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_neon unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c +unsigned int vpx_highbd_12_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define 
vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_neon unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c +unsigned int vpx_highbd_12_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_neon unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c +unsigned int vpx_highbd_12_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_neon unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c +unsigned int vpx_highbd_12_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_neon unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c +unsigned int vpx_highbd_12_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_neon unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c +unsigned int 
vpx_highbd_12_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_neon unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c +unsigned int vpx_highbd_12_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_neon unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c +unsigned int vpx_highbd_12_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_neon unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c +unsigned int vpx_highbd_12_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_neon unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c +unsigned int vpx_highbd_12_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_neon unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse); -#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c +unsigned int vpx_highbd_12_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_neon void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c +void vpx_highbd_8_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_neon void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c +void vpx_highbd_8_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_neon unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c +unsigned int vpx_highbd_8_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_neon unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c +unsigned int vpx_highbd_8_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_neon unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse); -#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c +unsigned int vpx_highbd_8_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_neon unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c +unsigned int vpx_highbd_8_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t 
*ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c +uint32_t 
vpx_highbd_8_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define 
vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_neon uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_neon uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c +uint32_t vpx_highbd_8_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_neon uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c +uint32_t vpx_highbd_8_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_neon uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c +uint32_t vpx_highbd_8_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 
vpx_highbd_8_sub_pixel_variance16x8_neon uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c +uint32_t vpx_highbd_8_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_neon uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c +uint32_t vpx_highbd_8_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_neon uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c +uint32_t vpx_highbd_8_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_neon uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c +uint32_t vpx_highbd_8_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, 
int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_neon uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c +uint32_t vpx_highbd_8_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_neon uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c +uint32_t vpx_highbd_8_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_neon uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c +uint32_t vpx_highbd_8_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_neon uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c +uint32_t vpx_highbd_8_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int 
src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_neon uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c +uint32_t vpx_highbd_8_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_neon uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c +uint32_t vpx_highbd_8_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_neon unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c +unsigned int vpx_highbd_8_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_neon unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c +unsigned int vpx_highbd_8_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define 
vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_neon unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c +unsigned int vpx_highbd_8_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_neon unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c +unsigned int vpx_highbd_8_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_neon unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c +unsigned int vpx_highbd_8_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_neon unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c +unsigned int vpx_highbd_8_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_neon unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c +unsigned int vpx_highbd_8_variance4x4_neon(const uint8_t 
*src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_neon unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c +unsigned int vpx_highbd_8_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_neon unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c +unsigned int vpx_highbd_8_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_neon unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c +unsigned int vpx_highbd_8_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_neon unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c +unsigned int vpx_highbd_8_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_neon unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance8x4 
vpx_highbd_8_variance8x4_c +unsigned int vpx_highbd_8_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_neon unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c +unsigned int vpx_highbd_8_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_neon unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p); #define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c @@ -708,7 +844,8 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p); #define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride); -#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_neon void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd); void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd); @@ -887,25 +1024,32 @@ void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const #define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_neon void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); -#define 
vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c +void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_neon void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c +void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_neon void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c +void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_neon void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c +void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_neon void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c +void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_neon void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c +void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_neon void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c +void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_neon void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_1_neon(const int16_t 
*input, tran_low_t *output, int stride); @@ -1046,133 +1190,175 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp #define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c +void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_neon void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c +void vpx_highbd_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_neon unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c +unsigned int vpx_highbd_sad16x16_neon(const uint8_t 
*src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_neon unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c +unsigned int vpx_highbd_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_neon void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c +void vpx_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_neon unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c +unsigned int vpx_highbd_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_neon unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c +unsigned int vpx_highbd_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_neon void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c +void vpx_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const 
uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_neon unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c +unsigned int vpx_highbd_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_neon unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c +unsigned int vpx_highbd_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_neon void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c +void vpx_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_neon unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c +unsigned int vpx_highbd_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_neon unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c +unsigned int vpx_highbd_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define 
vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_neon void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c +void vpx_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_neon unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c +unsigned int vpx_highbd_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_neon unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c +unsigned int vpx_highbd_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_neon void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c +void vpx_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_neon unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c +unsigned int vpx_highbd_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_neon unsigned int 
vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c +unsigned int vpx_highbd_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_neon void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c +void vpx_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_neon unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c +unsigned int vpx_highbd_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_neon unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c +unsigned int vpx_highbd_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_neon void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c +void vpx_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_neon unsigned int vpx_highbd_sad4x8_c(const 
uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c +unsigned int vpx_highbd_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_neon unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c +unsigned int vpx_highbd_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_neon void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c +void vpx_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_neon unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c +unsigned int vpx_highbd_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_neon unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c +unsigned int vpx_highbd_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_neon void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t 
sad_array[4]); -#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c +void vpx_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_neon unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c +unsigned int vpx_highbd_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_neon unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c +unsigned int vpx_highbd_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_neon void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c +void vpx_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_neon unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c +unsigned int vpx_highbd_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_neon unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c 
+unsigned int vpx_highbd_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_neon void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c +void vpx_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_neon unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c +unsigned int vpx_highbd_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_neon unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c +unsigned int vpx_highbd_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_neon void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c +void vpx_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_neon unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c +unsigned int vpx_highbd_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const 
uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_neon unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c +unsigned int vpx_highbd_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_neon void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c +void vpx_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_neon int vpx_highbd_satd_c(const tran_low_t *coeff, int length); #define vpx_highbd_satd vpx_highbd_satd_c void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); -#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c +void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_neon void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); diff --git a/config/arm64/vpx_version.h b/config/arm64/vpx_version.h index a90ab60d9..a58bfac01 100644 --- a/config/arm64/vpx_version.h +++ b/config/arm64/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. 
Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 12 +#define VERSION_MINOR 13 #define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.12.0" -#define VERSION_STRING " v1.12.0" +#define VERSION_STRING_NOSP "v1.13.0" +#define VERSION_STRING " v1.13.0" diff --git a/config/generic/vpx_version.h b/config/generic/vpx_version.h index a90ab60d9..a58bfac01 100644 --- a/config/generic/vpx_version.h +++ b/config/generic/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 12 +#define VERSION_MINOR 13 #define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.12.0" -#define VERSION_STRING " v1.12.0" +#define VERSION_STRING_NOSP "v1.13.0" +#define VERSION_STRING " v1.13.0" diff --git a/config/x86/vp9_rtcd.h b/config/x86/vp9_rtcd.h index cff5e7f63..580d55a28 100644 --- a/config/x86/vp9_rtcd.h +++ b/config/x86/vp9_rtcd.h @@ -106,10 +106,12 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vp9_quantize_fp vp9_quantize_fp_sse2 +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t 
*eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_quantize_fp vp9_quantize_fp_ssse3 void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_ssse3 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); diff --git a/config/x86/vpx_dsp_rtcd.h b/config/x86/vpx_dsp_rtcd.h index 8b94dd89f..91242deee 100644 --- a/config/x86/vpx_dsp_rtcd.h +++ b/config/x86/vpx_dsp_rtcd.h @@ -833,7 +833,8 @@ unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p); #define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_sse2 void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride); -#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c +void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_sse2 void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd); #define 
vpx_highbd_convolve8 vpx_highbd_convolve8_c diff --git a/config/x86/vpx_version.h b/config/x86/vpx_version.h index a90ab60d9..a58bfac01 100644 --- a/config/x86/vpx_version.h +++ b/config/x86/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 12 +#define VERSION_MINOR 13 #define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.12.0" -#define VERSION_STRING " v1.12.0" +#define VERSION_STRING_NOSP "v1.13.0" +#define VERSION_STRING " v1.13.0" diff --git a/config/x86_64/vpx_dsp_rtcd.h b/config/x86_64/vpx_dsp_rtcd.h index 284453f06..22401f1c0 100644 --- a/config/x86_64/vpx_dsp_rtcd.h +++ b/config/x86_64/vpx_dsp_rtcd.h @@ -834,7 +834,8 @@ unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p); #define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_sse2 void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride); -#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c +void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_sse2 void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd); void vpx_highbd_convolve8_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd); diff --git a/config/x86_64/vpx_version.h b/config/x86_64/vpx_version.h index a90ab60d9..a58bfac01 100644 --- a/config/x86_64/vpx_version.h +++ b/config/x86_64/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. 
#define VERSION_MAJOR 1 -#define VERSION_MINOR 12 +#define VERSION_MINOR 13 #define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.12.0" -#define VERSION_STRING " v1.12.0" +#define VERSION_STRING_NOSP "v1.13.0" +#define VERSION_STRING " v1.13.0" diff --git a/libvpx/.clang-format b/libvpx/.clang-format index 866b7e211..a8bc4967c 100644 --- a/libvpx/.clang-format +++ b/libvpx/.clang-format @@ -1,149 +1,9 @@ --- Language: Cpp -# BasedOnStyle: Google -# Generated with clang-format 7.0.1 -AccessModifierOffset: -1 -AlignAfterOpenBracket: Align -AlignConsecutiveAssignments: false -AlignConsecutiveDeclarations: false -AlignEscapedNewlines: Left -AlignOperands: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false +BasedOnStyle: Google AllowShortCaseLabelsOnASingleLine: true -AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: true -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: true -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakBeforeBinaryOperators: None -BreakBeforeBraces: Attach -BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: false -BreakStringLiterals: true -ColumnLimit: 80 
-CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 Cpp11BracedListStyle: false DerivePointerAlignment: false -DisableFormat: false -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IncludeBlocks: Preserve -IncludeCategories: - - Regex: '^<ext/.*\.h>' - Priority: 2 - - Regex: '^<.*\.h>' - Priority: 1 - - Regex: '^<.*' - Priority: 2 - - Regex: '.*' - Priority: 3 -IncludeIsMainRegex: '([-_](test|unittest))?$' -IndentCaseLabels: true -IndentPPDirectives: None -IndentWidth: 2 -IndentWrappedFunctionNames: false -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: false -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Never -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: false -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakTemplateDeclaration: 10 -PenaltyBreakString: 1000 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Right -RawStringFormats: - - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' - CanonicalDelimiter: '' - BasedOnStyle: google - - Language: TextProto - Delimiters: - - pb - - PB - - proto - - PROTO - EnclosingFunctions: - - EqualsProto - - EquivToProto - - PARSE_PARTIAL_TEXT_PROTO - - PARSE_TEST_PROTO - - PARSE_TEXT_PROTO - - ParseTextOrDie - - ParseTextProtoOrDie - CanonicalDelimiter: '' - BasedOnStyle: google -ReflowComments: true SortIncludes: false -SortUsingDeclarations: true -SpaceAfterCStyleCast: false -SpaceAfterTemplateKeyword: true -SpaceBeforeAssignmentOperators: true -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: 
true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 2 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -Standard: Auto -TabWidth: 8 -UseTab: Never -... - diff --git a/libvpx/.mailmap b/libvpx/.mailmap index 376ca83ae..bb0ddd95b 100644 --- a/libvpx/.mailmap +++ b/libvpx/.mailmap @@ -21,10 +21,11 @@ Jacky Chen <jackychen@google.com> Jim Bankoski <jimbankoski@google.com> Johann Koenig <johannkoenig@google.com> Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com> -Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com> Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org> +Johann <johann@duck.com> <johann.koenig@gmail.com> John Koleszar <jkoleszar@google.com> Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org> +Konstantinos Margaritis <konma@vectorcamp.gr> <konstantinos@vectorcamp.gr> Marco Paniconi <marpan@google.com> Marco Paniconi <marpan@google.com> <marpan@chromium.org> Martin Storsjö <martin@martin.st> diff --git a/libvpx/AUTHORS b/libvpx/AUTHORS index fffda6336..2db4a113e 100644 --- a/libvpx/AUTHORS +++ b/libvpx/AUTHORS @@ -21,8 +21,10 @@ Andoni Morales Alastruey <ylatuya@gmail.com> Andres Mejia <mcitadel@gmail.com> Andrew Lewis <andrewlewis@google.com> Andrew Russell <anrussell@google.com> +Andrew Salkeld <andrew.salkeld@arm.com> Angie Chen <yunqi@google.com> Angie Chiang <angiebird@google.com> +Anton Venema <anton.venema@liveswitch.com> Aron Rosenberg <arosenberg@logitech.com> Attila Nagy <attilanagy@google.com> Birk Magnussen <birk.magnussen@googlemail.com> @@ -174,7 +176,9 @@ Rob Bradford <rob@linux.intel.com> Ronald S. 
Bultje <rsbultje@gmail.com> Rui Ueyama <ruiu@google.com> Sai Deng <sdeng@google.com> +Salome Thirot <salome.thirot@arm.com> Sami Pietilä <samipietila@google.com> +Sam James <sam@gentoo.org> Sarah Parker <sarahparker@google.com> Sasi Inguva <isasi@google.com> Scott Graham <scottmg@chromium.org> diff --git a/libvpx/CHANGELOG b/libvpx/CHANGELOG index cd4e8ba43..3fb2d19bb 100644 --- a/libvpx/CHANGELOG +++ b/libvpx/CHANGELOG @@ -1,3 +1,39 @@ +2023-01-31 v1.13.0 "Ugly Duckling" + This release includes more Neon and AVX2 optimizations, adds a new codec + control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes + numerous bug fixes. + + - Upgrading: + This release is ABI incompatible with the previous release. + + New codec control VP9E_SET_QUANTIZER_ONE_PASS to set per frame QP. + + GoogleTest is upgraded to v1.12.1. + + .clang-format is upgraded to clang-format-11. + + VPX_EXT_RATECTRL_ABI_VERSION was bumped due to incompatible changes to the + feature of using external rate control models for vp9. + + - Enhancement: + Numerous improvements on Neon optimizations. + Numerous improvements on AVX2 optimizations. + Additional ARM targets added for Visual Studio. + + - Bug fixes: + Fix to calculating internal stats when frame dropped. + Fix to segfault for external resize test in vp9. + Fix to build system with replacing egrep with grep -E. + Fix to a few bugs with external RTC rate control library. + Fix to make SVC work with VBR. + Fix to key frame setting in VP9 external RC. + Fix to -Wimplicit-int (Clang 16). + Fix to VP8 external RC for buffer levels. + Fix to VP8 external RC for dynamic update of layers. + Fix to VP9 auto level. + Fix to off-by-one error of max w/h in validate_config. + Fix to make SVC work for Profile 1. 
+ 2022-06-17 v1.12.0 "Torrent Duck" This release adds optimizations for Loongarch, adds support for vp8 in the real-time rate control library, upgrades GoogleTest to v1.11.0, updates @@ -36,6 +72,7 @@ levels, and includes several improvements to NEON and numerous bug fixes. - Upgrading: + This release is ABI incompatible with the previous release. New codec control is added to get quantization parameters and loop filter levels. @@ -61,6 +98,7 @@ well as numerous bug fixes. - Upgrading: + This release is ABI incompatible with the previous release. New codec control is added to disable loopfilter for VP9. New encoder control is added to disable feature to increase Q on overshoot @@ -91,6 +129,7 @@ well as incremental improvements. - Upgrading: + This release is ABI compatible with the previous release. NV12 support is added to this release. A new interface is added for VP9 rate control. The new library libvp9rc.a must be linked by applications. @@ -114,12 +153,14 @@ This release collects incremental improvements to many aspects of the library. - Upgrading: + This release is ABI compatible with the previous release. ARCH_* defines have been removed in favor of VPX_ARCH_*. 2019-07-15 v1.8.1 "Orpington Duck" This release collects incremental improvements to many aspects of the library. - Upgrading: + This release is ABI incompatible with the previous release. VP8E_SET_CPUUSED now accepts values up to 9 for vp9. VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT had a spelling fix (was VP8E). The --sdk-path option has been removed. If you were using it to build for @@ -138,7 +179,8 @@ This release focused on encoding performance for realtime and VOD use cases. - Upgrading: - This adds and improves several vp9 controls. Most are related to SVC: + This release is ABI incompatible with the previous release. This adds and + improves several vp9 controls. Most are related to SVC: VP9E_SET_SVC_FRAME_DROP_LAYER: - Frame dropping in SVC. 
VP9E_SET_SVC_INTER_LAYER_PRED: diff --git a/libvpx/build/make/Android.mk b/libvpx/build/make/Android.mk index b8032e67a..ba24f541b 100644 --- a/libvpx/build/make/Android.mk +++ b/libvpx/build/make/Android.mk @@ -8,6 +8,8 @@ ## be found in the AUTHORS file in the root of the source tree. ## +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT # # This file is to be used for compiling libvpx for Android using the NDK. # In an Android project place a libvpx checkout in the jni directory. @@ -212,3 +214,4 @@ endif ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes) $(call import-module,android/cpufeatures) endif +endif # NDK_ROOT diff --git a/libvpx/build/make/Makefile b/libvpx/build/make/Makefile index b7a873cc8..5c38c18e5 100644 --- a/libvpx/build/make/Makefile +++ b/libvpx/build/make/Makefile @@ -21,9 +21,9 @@ all: .DEFAULT clean:: .DEFAULT exampletest: .DEFAULT install:: .DEFAULT -test:: .DEFAULT -test-no-data-check:: .DEFAULT -testdata:: .DEFAULT +test: .DEFAULT +test-no-data-check: .DEFAULT +testdata: .DEFAULT utiltest: .DEFAULT exampletest-no-data-check utiltest-no-data-check: .DEFAULT test_%: .DEFAULT ; @@ -111,13 +111,13 @@ exampletest: .PHONY: install install:: .PHONY: test -test:: +test: .PHONY: testdata -testdata:: +testdata: .PHONY: utiltest utiltest: .PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check -test-no-data-check:: +test-no-data-check: exampletest-no-data-check utiltest-no-data-check: # Force to realign stack always on OS/2 @@ -465,6 +465,6 @@ INSTALL_TARGETS += .install-docs .install-srcs .install-libs .install-bins all: $(BUILD_TARGETS) install:: $(INSTALL_TARGETS) dist: $(INSTALL_TARGETS) -test:: +test: .SUFFIXES: # Delete default suffix rules diff --git a/libvpx/build/make/configure.sh b/libvpx/build/make/configure.sh index 581042e38..4bf090f00 100755 --- a/libvpx/build/make/configure.sh +++ b/libvpx/build/make/configure.sh @@ -791,7 +791,7 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=`echo $gcctarget | sed 
's/.*\(darwin1[0-9]\).*/\1/'` ;; - *darwin2[0-1]*) + *darwin2[0-2]*) tgt_isa=`uname -m` tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'` ;; @@ -940,7 +940,7 @@ process_common_toolchain() { add_cflags "-mmacosx-version-min=10.15" add_ldflags "-mmacosx-version-min=10.15" ;; - *-darwin2[0-1]-*) + *-darwin2[0-2]-*) add_cflags "-arch ${toolchain%%-*}" add_ldflags "-arch ${toolchain%%-*}" ;; @@ -1511,7 +1511,7 @@ EOF # Try to find which inline keywords are supported check_cc <<EOF && INLINE="inline" -static inline function() {} +static inline int function(void) {} EOF # Almost every platform uses pthreads. diff --git a/libvpx/build/make/gen_asm_deps.sh b/libvpx/build/make/gen_asm_deps.sh index 6a7bff9eb..3bd4d125f 100755 --- a/libvpx/build/make/gen_asm_deps.sh +++ b/libvpx/build/make/gen_asm_deps.sh @@ -42,7 +42,7 @@ done [ -n "$srcfile" ] || show_help sfx=${sfx:-asm} -includes=$(LC_ALL=C egrep -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile | +includes=$(LC_ALL=C grep -E -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile | perl -p -e "s;.*?([a-z0-9_/]+.${sfx}).*;\1;") #" restore editor state for inc in ${includes}; do diff --git a/libvpx/build/make/rtcd.pl b/libvpx/build/make/rtcd.pl index 9c9726842..f4edeaad5 100755 --- a/libvpx/build/make/rtcd.pl +++ b/libvpx/build/make/rtcd.pl @@ -488,7 +488,8 @@ if ($opts{arch} eq 'x86') { arm; } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { @ALL_ARCHS = filter(qw/neon/); - &require("neon"); + @REQUIRES = filter(qw/neon/); + &require(@REQUIRES); arm; } elsif ($opts{arch} =~ /^ppc/ ) { @ALL_ARCHS = filter(qw/vsx/); diff --git a/libvpx/configure b/libvpx/configure index beea65032..ae289f77b 100755 --- a/libvpx/configure +++ b/libvpx/configure @@ -101,9 +101,12 @@ all_platforms="${all_platforms} arm64-android-gcc" all_platforms="${all_platforms} arm64-darwin-gcc" all_platforms="${all_platforms} arm64-darwin20-gcc" all_platforms="${all_platforms} arm64-darwin21-gcc" +all_platforms="${all_platforms} arm64-darwin22-gcc" 
all_platforms="${all_platforms} arm64-linux-gcc" all_platforms="${all_platforms} arm64-win64-gcc" all_platforms="${all_platforms} arm64-win64-vs15" +all_platforms="${all_platforms} arm64-win64-vs16" +all_platforms="${all_platforms} arm64-win64-vs17" all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8 @@ -112,6 +115,8 @@ all_platforms="${all_platforms} armv7-none-rvct" #neon Cortex-A8 all_platforms="${all_platforms} armv7-win32-gcc" all_platforms="${all_platforms} armv7-win32-vs14" all_platforms="${all_platforms} armv7-win32-vs15" +all_platforms="${all_platforms} armv7-win32-vs16" +all_platforms="${all_platforms} armv7-win32-vs17" all_platforms="${all_platforms} armv7s-darwin-gcc" all_platforms="${all_platforms} armv8-linux-gcc" all_platforms="${all_platforms} loongarch32-linux-gcc" @@ -157,6 +162,7 @@ all_platforms="${all_platforms} x86_64-darwin18-gcc" all_platforms="${all_platforms} x86_64-darwin19-gcc" all_platforms="${all_platforms} x86_64-darwin20-gcc" all_platforms="${all_platforms} x86_64-darwin21-gcc" +all_platforms="${all_platforms} x86_64-darwin22-gcc" all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" @@ -666,11 +672,18 @@ process_toolchain() { check_add_cxxflags -Wno-psabi fi + # Enforce C++11 compatibility. + check_add_cxxflags -Wc++14-extensions + check_add_cxxflags -Wc++17-extensions + check_add_cxxflags -Wc++20-extensions + # disable some warnings specific to libyuv. 
check_cxxflags -Wno-missing-declarations \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations" check_cxxflags -Wno-missing-prototypes \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes" + check_cxxflags -Wno-pass-failed \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed" check_cxxflags -Wno-unused-parameter \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter" fi diff --git a/libvpx/examples/svc_encodeframe.c b/libvpx/examples/svc_encodeframe.c index 08bda0e5c..003096e70 100644 --- a/libvpx/examples/svc_encodeframe.c +++ b/libvpx/examples/svc_encodeframe.c @@ -552,11 +552,8 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, iter = NULL; while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) { switch (cx_pkt->kind) { - case VPX_CODEC_PSNR_PKT: { - } - ++si->psnr_pkt_received; - break; - default: { break; } + case VPX_CODEC_PSNR_PKT: ++si->psnr_pkt_received; break; + default: break; } } diff --git a/libvpx/examples/vp9_spatial_svc_encoder.c b/libvpx/examples/vp9_spatial_svc_encoder.c index e85dbf8e7..d287e5831 100644 --- a/libvpx/examples/vp9_spatial_svc_encoder.c +++ b/libvpx/examples/vp9_spatial_svc_encoder.c @@ -1146,7 +1146,9 @@ int main(int argc, const char **argv) { cx_pkt->data.twopass_stats.sz); break; } - default: { break; } + default: { + break; + } } #if CONFIG_VP9_DECODER && !SIMULCAST_MODE diff --git a/libvpx/libs.doxy_template b/libvpx/libs.doxy_template index 1eacc8fe2..1ee442af3 100644 --- a/libvpx/libs.doxy_template +++ b/libvpx/libs.doxy_template @@ -654,12 +654,6 @@ VERBATIM_HEADERS = YES ALPHABETICAL_INDEX = NO -# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then -# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns -# in which this list will be split (can be a number in the range [1..20]) - -COLS_IN_ALPHA_INDEX = 5 - # In case all classes in a project start with a common prefix, all # classes will be put under the same header 
in the alphabetical index. # The IGNORE_PREFIX tag can be used to specify one or more prefixes that @@ -1099,32 +1093,10 @@ ALLEXTERNALS = NO EXTERNAL_GROUPS = YES -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of `which perl'). - -PERL_PATH = /usr/bin/perl - #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- -# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will -# generate a inheritance diagram (in HTML, RTF and la_te_x) for classes with base -# or super classes. Setting the tag to NO turns the diagrams off. Note that -# this option is superseded by the HAVE_DOT option below. This is only a -# fallback. It is recommended to install and use dot, since it yields more -# powerful graphs. - -CLASS_DIAGRAMS = YES - -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see http://www.mcternan.me.uk/mscgen/) to -# produce the chart and insert it in the documentation. The MSCGEN_PATH tag allows you to -# specify the directory where the mscgen tool resides. If left empty the tool is assumed to -# be found in the default search path. - -MSCGEN_PATH = - # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. @@ -1138,10 +1110,14 @@ HIDE_UNDOC_RELATIONS = YES HAVE_DOT = NO -# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for each documented class showing the direct and -# indirect inheritance relations. Setting this tag to YES will force the -# the CLASS_DIAGRAMS tag to NO. 
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a +# graph for each documented class showing the direct and indirect inheritance +# relations. In case HAVE_DOT is set as well dot will be used to draw the graph, +# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set +# to TEXT the direct and indirect inheritance relations will be shown as texts / +# links. +# Possible values are: NO, YES, TEXT and GRAPH. +# The default value is: YES. CLASS_GRAPH = YES diff --git a/libvpx/libs.mk b/libvpx/libs.mk index 00e49a19d..1f7f03aa3 100644 --- a/libvpx/libs.mk +++ b/libvpx/libs.mk @@ -312,8 +312,8 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) # To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current # SO_VERSION_* then follow the rules in the link to detemine the new version # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1 -SO_VERSION_MAJOR := 7 -SO_VERSION_MINOR := 1 +SO_VERSION_MAJOR := 8 +SO_VERSION_MINOR := 0 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib @@ -446,13 +446,13 @@ ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes) # YASM $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h @echo " [CREATE] $@" - @LC_ALL=C egrep "#define [A-Z0-9_]+ [01]" $< \ + @LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \ | awk '{print $$2 " equ " $$3}' > $@ else ADS2GAS=$(if $(filter yes,$(CONFIG_GCC)),| $(ASM_CONVERSION)) $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h @echo " [CREATE] $@" - @LC_ALL=C egrep "#define [A-Z0-9_]+ [01]" $< \ + @LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \ | awk '{print $$2 " EQU " $$3}' $(ADS2GAS) > $@ @echo " END" $(ADS2GAS) >> $@ CLEAN-OBJS += $(BUILD_PFX)vpx_config.asm @@ -536,7 +536,7 @@ $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 esac \ ) -testdata:: $(LIBVPX_TEST_DATA) +testdata: $(LIBVPX_TEST_DATA) $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\ [ -x "$$(which shasum)" ] && 
sha1sum=shasum;\ [ -x "$$(which sha1)" ] && sha1sum=sha1;\ @@ -709,15 +709,15 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(TEST_INTRA_PRED_SPEED_SRCS) INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(RC_INTERFACE_TEST_SRCS) define test_shard_template -test:: test_shard.$(1) -test-no-data-check:: test_shard_ndc.$(1) +test: test_shard.$(1) +test-no-data-check: test_shard_ndc.$(1) test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN) @set -e; \ export GTEST_SHARD_INDEX=$(1); \ export GTEST_TOTAL_SHARDS=$(2); \ $(LIBVPX_TEST_BIN) test_shard.$(1): testdata -.PHONY: test_shard.$(1) +.PHONY: test_shard.$(1) test_shard_ndc.$(1) endef NUM_SHARDS := 10 diff --git a/libvpx/md5_utils.c b/libvpx/md5_utils.c index c4106525f..abd8d43c3 100644 --- a/libvpx/md5_utils.c +++ b/libvpx/md5_utils.c @@ -151,8 +151,8 @@ void MD5Final(md5byte digest[16], struct MD5Context *ctx) { * reflect the addition of 16 longwords of new data. MD5Update blocks * the data and converts bytes into longwords for this routine. */ -VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4], - UWORD32 const in[16]) { +VPX_NO_UNSIGNED_OVERFLOW_CHECK VPX_NO_UNSIGNED_SHIFT_CHECK void MD5Transform( + UWORD32 buf[4], UWORD32 const in[16]) { UWORD32 a, b, c, d; a = buf[0]; diff --git a/libvpx/test/acm_random.h b/libvpx/test/acm_random.h index 3458340a1..c7122b933 100644 --- a/libvpx/test/acm_random.h +++ b/libvpx/test/acm_random.h @@ -28,43 +28,43 @@ class ACMRandom { explicit ACMRandom(int seed) : random_(seed) {} void Reset(int seed) { random_.Reseed(seed); } - uint16_t Rand16(void) { + uint16_t Rand16() { const uint32_t value = random_.Generate(testing::internal::Random::kMaxRange); return (value >> 15) & 0xffff; } - int32_t Rand20Signed(void) { + int32_t Rand20Signed() { // Use 20 bits: values between 524287 and -524288. 
const uint32_t value = random_.Generate(1048576); return static_cast<int32_t>(value) - 524288; } - int16_t Rand16Signed(void) { + int16_t Rand16Signed() { // Use 16 bits: values between 32767 and -32768. return static_cast<int16_t>(random_.Generate(65536)); } - int16_t Rand13Signed(void) { + int16_t Rand13Signed() { // Use 13 bits: values between 4095 and -4096. const uint32_t value = random_.Generate(8192); return static_cast<int16_t>(value) - 4096; } - int16_t Rand9Signed(void) { + int16_t Rand9Signed() { // Use 9 bits: values between 255 (0x0FF) and -256 (0x100). const uint32_t value = random_.Generate(512); return static_cast<int16_t>(value) - 256; } - uint8_t Rand8(void) { + uint8_t Rand8() { const uint32_t value = random_.Generate(testing::internal::Random::kMaxRange); // There's a bit more entropy in the upper bits of this implementation. return (value >> 23) & 0xff; } - uint8_t Rand8Extremes(void) { + uint8_t Rand8Extremes() { // Returns a random value near 0 or near 255, to better exercise // saturation behavior. const uint8_t r = Rand8(); @@ -82,7 +82,7 @@ class ACMRandom { int operator()(int n) { return PseudoUniform(n); } - static int DeterministicSeed(void) { return 0xbaba; } + static int DeterministicSeed() { return 0xbaba; } private: testing::internal::Random random_; diff --git a/libvpx/test/android/Android.mk b/libvpx/test/android/Android.mk index 87155fcb5..9a7533ebb 100644 --- a/libvpx/test/android/Android.mk +++ b/libvpx/test/android/Android.mk @@ -10,6 +10,9 @@ # The test app itself runs on the command line through adb shell # The paths are really messed up as the libvpx make file # expects to be made from a parent directory. + +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT CUR_WD := $(call my-dir) BINDINGS_DIR := $(CUR_WD)/../../.. LOCAL_PATH := $(CUR_WD)/../../.. @@ -61,3 +64,4 @@ LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC)) # some test files depend on *_rtcd.h, ensure they're generated first. 
$(eval $(call rtcd_dep_template)) include $(BUILD_EXECUTABLE) +endif # NDK_ROOT diff --git a/libvpx/test/comp_avg_pred_test.cc b/libvpx/test/comp_avg_pred_test.cc index 3977a2d0b..70aeab8d7 100644 --- a/libvpx/test/comp_avg_pred_test.cc +++ b/libvpx/test/comp_avg_pred_test.cc @@ -22,13 +22,14 @@ namespace { using ::libvpx_test::ACMRandom; using ::libvpx_test::Buffer; -typedef void (*AvgPredFunc)(uint8_t *a, const uint8_t *b, int w, int h, - const uint8_t *c, int c_stride); - -uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; } +template <typename Pixel> +Pixel avg_with_rounding(Pixel a, Pixel b) { + return (a + b + 1) >> 1; +} -void reference_pred(const Buffer<uint8_t> &pred, const Buffer<uint8_t> &ref, - int width, int height, Buffer<uint8_t> *avg) { +template <typename Pixel> +void reference_pred(const Buffer<Pixel> &pred, const Buffer<Pixel> &ref, + int width, int height, Buffer<Pixel> *avg) { ASSERT_NE(avg->TopLeftPixel(), nullptr); ASSERT_NE(pred.TopLeftPixel(), nullptr); ASSERT_NE(ref.TopLeftPixel(), nullptr); @@ -36,12 +37,16 @@ void reference_pred(const Buffer<uint8_t> &pred, const Buffer<uint8_t> &ref, for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { avg->TopLeftPixel()[y * avg->stride() + x] = - avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x], - ref.TopLeftPixel()[y * ref.stride() + x]); + avg_with_rounding<Pixel>(pred.TopLeftPixel()[y * pred.stride() + x], + ref.TopLeftPixel()[y * ref.stride() + x]); } } } +using AvgPredFunc = void (*)(uint8_t *a, const uint8_t *b, int w, int h, + const uint8_t *c, int c_stride); + +template <int bitdepth, typename Pixel> class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> { public: virtual void SetUp() { @@ -49,15 +54,19 @@ class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> { rnd_.Reset(ACMRandom::DeterministicSeed()); } + void TestSizeCombinations(); + void TestCompareReferenceRandom(); + void TestSpeed(); + protected: AvgPredFunc 
avg_pred_func_; ACMRandom rnd_; }; -TEST_P(AvgPredTest, SizeCombinations) { +template <int bitdepth, typename Pixel> +void AvgPredTest<bitdepth, Pixel>::TestSizeCombinations() { // This is called as part of the sub pixel variance. As such it must be one of // the variance block sizes. - for (int width_pow = 2; width_pow <= 6; ++width_pow) { for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; ++height_pow) { @@ -70,23 +79,30 @@ TEST_P(AvgPredTest, SizeCombinations) { const int width = 1 << width_pow; const int height = 1 << height_pow; // Only the reference buffer may have a stride not equal to width. - Buffer<uint8_t> ref = - Buffer<uint8_t>(width, height, ref_padding ? 8 : 0); + Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0); ASSERT_TRUE(ref.Init()); - Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16); + Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16); ASSERT_TRUE(pred.Init()); - Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16); + Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16); ASSERT_TRUE(avg_ref.Init()); - Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16); + Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16); ASSERT_TRUE(avg_chk.Init()); + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } - ref.Set(&rnd_, &ACMRandom::Rand8); - pred.Set(&rnd_, &ACMRandom::Rand8); - - reference_pred(pred, ref, width, height, &avg_ref); - ASM_REGISTER_STATE_CHECK( - avg_pred_func_(avg_chk.TopLeftPixel(), pred.TopLeftPixel(), width, - height, ref.TopLeftPixel(), ref.stride())); + reference_pred<Pixel>(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK(avg_pred_func_( + 
(uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(), + width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride())); EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); if (HasFailure()) { @@ -99,26 +115,36 @@ TEST_P(AvgPredTest, SizeCombinations) { } } -TEST_P(AvgPredTest, CompareReferenceRandom) { +template <int bitdepth, typename Pixel> +void AvgPredTest<bitdepth, Pixel>::TestCompareReferenceRandom() { const int width = 64; const int height = 32; - Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, 8); + Buffer<Pixel> ref = Buffer<Pixel>(width, height, 8); ASSERT_TRUE(ref.Init()); - Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16); + Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16); ASSERT_TRUE(pred.Init()); - Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16); + Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16); ASSERT_TRUE(avg_ref.Init()); - Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16); + Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16); ASSERT_TRUE(avg_chk.Init()); for (int i = 0; i < 500; ++i) { - ref.Set(&rnd_, &ACMRandom::Rand8); - pred.Set(&rnd_, &ACMRandom::Rand8); + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } - reference_pred(pred, ref, width, height, &avg_ref); - ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk.TopLeftPixel(), - pred.TopLeftPixel(), width, height, - ref.TopLeftPixel(), ref.stride())); + reference_pred<Pixel>(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK(avg_pred_func_( + (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(), + width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride())); EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); 
if (HasFailure()) { printf("Width: %d Height: %d\n", width, height); @@ -128,7 +154,8 @@ TEST_P(AvgPredTest, CompareReferenceRandom) { } } -TEST_P(AvgPredTest, DISABLED_Speed) { +template <int bitdepth, typename Pixel> +void AvgPredTest<bitdepth, Pixel>::TestSpeed() { for (int width_pow = 2; width_pow <= 6; ++width_pow) { for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; ++height_pow) { @@ -138,22 +165,30 @@ TEST_P(AvgPredTest, DISABLED_Speed) { for (int ref_padding = 0; ref_padding < 2; ref_padding++) { const int width = 1 << width_pow; const int height = 1 << height_pow; - Buffer<uint8_t> ref = - Buffer<uint8_t>(width, height, ref_padding ? 8 : 0); + Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0); ASSERT_TRUE(ref.Init()); - Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16); + Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16); ASSERT_TRUE(pred.Init()); - Buffer<uint8_t> avg = Buffer<uint8_t>(width, height, 0, 16); + Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 16); ASSERT_TRUE(avg.Init()); - - ref.Set(&rnd_, &ACMRandom::Rand8); - pred.Set(&rnd_, &ACMRandom::Rand8); + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } vpx_usec_timer timer; vpx_usec_timer_start(&timer); - for (int i = 0; i < 10000000 / (width * height); ++i) { - avg_pred_func_(avg.TopLeftPixel(), pred.TopLeftPixel(), width, height, - ref.TopLeftPixel(), ref.stride()); + for (int i = 0; i < 100000000 / (width * height); ++i) { + avg_pred_func_((uint8_t *)avg.TopLeftPixel(), + (uint8_t *)pred.TopLeftPixel(), width, height, + (uint8_t *)ref.TopLeftPixel(), ref.stride()); } vpx_usec_timer_mark(&timer); @@ -166,26 +201,64 @@ TEST_P(AvgPredTest, 
DISABLED_Speed) { } } -INSTANTIATE_TEST_SUITE_P(C, AvgPredTest, +using AvgPredTestLBD = AvgPredTest<8, uint8_t>; + +TEST_P(AvgPredTestLBD, SizeCombinations) { TestSizeCombinations(); } + +TEST_P(AvgPredTestLBD, CompareReferenceRandom) { TestCompareReferenceRandom(); } + +TEST_P(AvgPredTestLBD, DISABLED_Speed) { TestSpeed(); } + +INSTANTIATE_TEST_SUITE_P(C, AvgPredTestLBD, ::testing::Values(&vpx_comp_avg_pred_c)); #if HAVE_SSE2 -INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTest, +INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTestLBD, ::testing::Values(&vpx_comp_avg_pred_sse2)); #endif // HAVE_SSE2 #if HAVE_NEON -INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTest, +INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD, ::testing::Values(&vpx_comp_avg_pred_neon)); #endif // HAVE_NEON #if HAVE_VSX -INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTest, +INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTestLBD, ::testing::Values(&vpx_comp_avg_pred_vsx)); #endif // HAVE_VSX #if HAVE_LSX -INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTest, +INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTestLBD, ::testing::Values(&vpx_comp_avg_pred_lsx)); #endif // HAVE_LSX + +#if CONFIG_VP9_HIGHBITDEPTH +using HighbdAvgPredFunc = void (*)(uint16_t *a, const uint16_t *b, int w, int h, + const uint16_t *c, int c_stride); + +template <HighbdAvgPredFunc fn> +void highbd_wrapper(uint8_t *a, const uint8_t *b, int w, int h, + const uint8_t *c, int c_stride) { + fn((uint16_t *)a, (const uint16_t *)b, w, h, (const uint16_t *)c, c_stride); +} + +using AvgPredTestHBD = AvgPredTest<12, uint16_t>; + +TEST_P(AvgPredTestHBD, SizeCombinations) { TestSizeCombinations(); } + +TEST_P(AvgPredTestHBD, CompareReferenceRandom) { TestCompareReferenceRandom(); } + +TEST_P(AvgPredTestHBD, DISABLED_Speed) { TestSpeed(); } + +INSTANTIATE_TEST_SUITE_P( + C, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_c>)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>)); +#endif // 
HAVE_SSE2 + +#endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/libvpx/test/dct16x16_test.cc b/libvpx/test/dct16x16_test.cc index 06837d809..d4ef7ae13 100644 --- a/libvpx/test/dct16x16_test.cc +++ b/libvpx/test/dct16x16_test.cc @@ -789,13 +789,23 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_SUITE_P( NEON, Trans16x16DCT, ::testing::Values(make_tuple(&vpx_fdct16x16_neon, &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8))); #endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + NEON, Trans16x16DCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_10, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_12, 0, VPX_BITS_12), + make_tuple(&vpx_fdct16x16_neon, &vpx_idct16x16_256_add_c, 0, + VPX_BITS_8))); +#endif // HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_SUITE_P( SSE2, Trans16x16DCT, diff --git a/libvpx/test/dct_partial_test.cc b/libvpx/test/dct_partial_test.cc index 8d0e3a912..e57fa0f48 100644 --- a/libvpx/test/dct_partial_test.cc +++ b/libvpx/test/dct_partial_test.cc @@ -145,11 +145,17 @@ INSTANTIATE_TEST_SUITE_P( #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( NEON, PartialFdctTest, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), - make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), + ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_12), + 
make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_8), make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12), make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10), make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_12), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_10), make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); #else INSTANTIATE_TEST_SUITE_P( diff --git a/libvpx/test/dct_test.cc b/libvpx/test/dct_test.cc index 2182f87e5..0304029bd 100644 --- a/libvpx/test/dct_test.cc +++ b/libvpx/test/dct_test.cc @@ -539,6 +539,18 @@ INSTANTIATE_TEST_SUITE_P(AVX2, TransDCT, #endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_neon_func_info[] = { + { &fdct_wrapper<vpx_highbd_fdct4x4_neon>, + &highbd_idct_wrapper<vpx_highbd_idct4x4_16_add_neon>, 4, 2 }, + { &fdct_wrapper<vpx_highbd_fdct8x8_neon>, + &highbd_idct_wrapper<vpx_highbd_idct8x8_64_add_neon>, 8, 2 }, + { &fdct_wrapper<vpx_highbd_fdct16x16_neon>, + &highbd_idct_wrapper<vpx_highbd_idct16x16_256_add_neon>, 16, 2 }, + /* { &fdct_wrapper<vpx_highbd_fdct32x32_neon>, + &highbd_idct_wrapper<vpx_highbd_idct32x32_1024_add_neon>, 32, 2 },*/ +}; +#else static const FuncInfo dct_neon_func_info[4] = { { &fdct_wrapper<vpx_fdct4x4_neon>, &idct_wrapper<vpx_idct4x4_16_add_neon>, 4, 1 }, @@ -549,12 +561,15 @@ static const FuncInfo dct_neon_func_info[4] = { { &fdct_wrapper<vpx_fdct32x32_neon>, &idct_wrapper<vpx_idct32x32_1024_add_neon>, 32, 1 } }; +#endif // CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( NEON, TransDCT, - ::testing::Combine(::testing::Range(0, 4), - ::testing::Values(dct_neon_func_info), - ::testing::Values(0), ::testing::Values(VPX_BITS_8))); + ::testing::Combine( + ::testing::Range(0, static_cast<int>(sizeof(dct_neon_func_info) / + sizeof(dct_neon_func_info[0]))), + ::testing::Values(dct_neon_func_info), ::testing::Values(0), + 
::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); #endif // HAVE_NEON #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH @@ -652,6 +667,8 @@ static const FuncInfo ht_neon_func_info[] = { #if CONFIG_VP9_HIGHBITDEPTH { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 2 }, + { &vp9_highbd_fht4x4_neon, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, + 4, 2 }, { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, 8, 2 }, { &vp9_highbd_fht16x16_c, diff --git a/libvpx/test/encode_api_test.cc b/libvpx/test/encode_api_test.cc index 6f61c7750..ecdf92834 100644 --- a/libvpx/test/encode_api_test.cc +++ b/libvpx/test/encode_api_test.cc @@ -233,8 +233,8 @@ TEST(EncodeAPI, SetRoi) { roi.roi_map = roi_map; // VP8 only. This value isn't range checked. roi.static_threshold[1] = 1000; - roi.static_threshold[2] = INT_MIN; - roi.static_threshold[3] = INT_MAX; + roi.static_threshold[2] = UINT_MAX / 2 + 1; + roi.static_threshold[3] = UINT_MAX; for (const auto delta : { -63, -1, 0, 1, 63 }) { for (int i = 0; i < 8; ++i) { @@ -336,7 +336,7 @@ TEST(EncodeAPI, ConfigChangeThreadCount) { for (const auto *iface : kCodecIfaces) { SCOPED_TRACE(vpx_codec_iface_name(iface)); for (int i = 0; i < (IsVP9(iface) ? 
2 : 1); ++i) { - vpx_codec_enc_cfg_t cfg; + vpx_codec_enc_cfg_t cfg = {}; struct Encoder { ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } vpx_codec_ctx_t ctx = {}; diff --git a/libvpx/test/encode_test_driver.cc b/libvpx/test/encode_test_driver.cc index 1ce39eaef..d3feeee34 100644 --- a/libvpx/test/encode_test_driver.cc +++ b/libvpx/test/encode_test_driver.cc @@ -52,7 +52,8 @@ void Encoder::InitEncoder(VideoSource *video) { } } -void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) { +void Encoder::EncodeFrame(VideoSource *video, + const vpx_enc_frame_flags_t frame_flags) { if (video->img()) { EncodeFrameInternal(*video, frame_flags); } else { @@ -70,7 +71,7 @@ void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) { } void Encoder::EncodeFrameInternal(const VideoSource &video, - const unsigned long frame_flags) { + const vpx_enc_frame_flags_t frame_flags) { vpx_codec_err_t res; const vpx_image_t *img = video.img(); @@ -169,7 +170,7 @@ void EncoderTest::RunLoop(VideoSource *video) { ASSERT_TRUE(passes_ == 1 || passes_ == 2); for (unsigned int pass = 0; pass < passes_; pass++) { - last_pts_ = 0; + vpx_codec_pts_t last_pts = 0; if (passes_ == 1) { cfg_.g_pass = VPX_RC_ONE_PASS; @@ -225,8 +226,8 @@ void EncoderTest::RunLoop(VideoSource *video) { has_dxdata = true; } - ASSERT_GE(pkt->data.frame.pts, last_pts_); - last_pts_ = pkt->data.frame.pts; + ASSERT_GE(pkt->data.frame.pts, last_pts); + last_pts = pkt->data.frame.pts; FramePktHook(pkt); break; diff --git a/libvpx/test/encode_test_driver.h b/libvpx/test/encode_test_driver.h index 7085945f6..b57df8529 100644 --- a/libvpx/test/encode_test_driver.h +++ b/libvpx/test/encode_test_driver.h @@ -103,7 +103,7 @@ class Encoder { } // This is a thin wrapper around vpx_codec_encode(), so refer to // vpx_encoder.h for its semantics. 
- void EncodeFrame(VideoSource *video, const unsigned long frame_flags); + void EncodeFrame(VideoSource *video, vpx_enc_frame_flags_t frame_flags); // Convenience wrapper for EncodeFrame() void EncodeFrame(VideoSource *video) { EncodeFrame(video, 0); } @@ -184,7 +184,7 @@ class Encoder { // Encode an image void EncodeFrameInternal(const VideoSource &video, - const unsigned long frame_flags); + vpx_enc_frame_flags_t frame_flags); // Flush the encoder on EOS void Flush(); @@ -206,8 +206,7 @@ class Encoder { class EncoderTest { protected: explicit EncoderTest(const CodecFactory *codec) - : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0), - last_pts_(0) { + : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0) { // Default to 1 thread. cfg_.g_threads = 1; } @@ -290,8 +289,7 @@ class EncoderTest { unsigned long deadline_; TwopassStatsStore stats_; unsigned long init_flags_; - unsigned long frame_flags_; - vpx_codec_pts_t last_pts_; + vpx_enc_frame_flags_t frame_flags_; }; } // namespace libvpx_test diff --git a/libvpx/test/error_resilience_test.cc b/libvpx/test/error_resilience_test.cc index 45a327ec2..45138f14b 100644 --- a/libvpx/test/error_resilience_test.cc +++ b/libvpx/test/error_resilience_test.cc @@ -496,7 +496,7 @@ class ErrorResilienceTestLargeCodecControls ++tot_frame_number_; } - virtual void EndPassHook(void) { + virtual void EndPassHook() { duration_ = (last_pts_ + 1) * timebase_; if (cfg_.ts_number_layers > 1) { for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers); diff --git a/libvpx/test/frame_size_tests.cc b/libvpx/test/frame_size_tests.cc index d85c193e0..8a0eb71ba 100644 --- a/libvpx/test/frame_size_tests.cc +++ b/libvpx/test/frame_size_tests.cc @@ -111,7 +111,7 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, ASSERT_TRUE(passes_ == 1 || passes_ == 2); for (unsigned int pass = 0; pass < passes_; pass++) { - last_pts_ = 0; + vpx_codec_pts_t last_pts = 0; if (passes_ == 1) { cfg_.g_pass = 
VPX_RC_ONE_PASS; @@ -144,8 +144,8 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, again = true; switch (pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: - ASSERT_GE(pkt->data.frame.pts, last_pts_); - last_pts_ = pkt->data.frame.pts; + ASSERT_GE(pkt->data.frame.pts, last_pts); + last_pts = pkt->data.frame.pts; FramePktHook(pkt); break; diff --git a/libvpx/test/hadamard_test.cc b/libvpx/test/hadamard_test.cc index 10b1e79c1..f904e814a 100644 --- a/libvpx/test/hadamard_test.cc +++ b/libvpx/test/hadamard_test.cc @@ -264,7 +264,8 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P( NEON, HadamardLowbdTest, ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_neon, 8), - HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16))); + HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16), + HadamardFuncWithSize(&vpx_hadamard_32x32_neon, 32))); #endif // HAVE_NEON // TODO(jingning): Remove highbitdepth flag when the SIMD functions are diff --git a/libvpx/test/md5_helper.h b/libvpx/test/md5_helper.h index dc28dc628..9095d96a8 100644 --- a/libvpx/test/md5_helper.h +++ b/libvpx/test/md5_helper.h @@ -47,7 +47,7 @@ class MD5 { MD5Update(&md5_, data, static_cast<uint32_t>(size)); } - const char *Get(void) { + const char *Get() { static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', diff --git a/libvpx/test/pp_filter_test.cc b/libvpx/test/pp_filter_test.cc index 775f7f36a..27d5ffa90 100644 --- a/libvpx/test/pp_filter_test.cc +++ b/libvpx/test/pp_filter_test.cc @@ -7,7 +7,11 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ + #include <limits.h> + +#include <memory> + #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" @@ -458,14 +462,13 @@ TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); - unsigned char *expected_output = new unsigned char[rows_ * cols_]; + std::unique_ptr<unsigned char[]> expected_output( + new unsigned char[rows_ * cols_]); ASSERT_NE(expected_output, nullptr); - SetRows(expected_output, rows_, cols_, cols_); + SetRows(expected_output.get(), rows_, cols_, cols_); RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), q2mbl(0), - expected_output); - - delete[] expected_output; + expected_output.get()); } TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { diff --git a/libvpx/test/resize_test.cc b/libvpx/test/resize_test.cc index c57170ff9..715bb9d70 100644 --- a/libvpx/test/resize_test.cc +++ b/libvpx/test/resize_test.cc @@ -95,10 +95,11 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, unsigned int initial_h, unsigned int *w, unsigned int *h, bool flag_codec, bool smaller_width_larger_size_) { + *w = initial_w; + *h = initial_h; + if (smaller_width_larger_size_) { if (frame < 30) { - *w = initial_w; - *h = initial_h; return; } if (frame < 100) { @@ -109,8 +110,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 10) { - *w = initial_w; - *h = initial_h; return; } if (frame < 20) { @@ -124,8 +123,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 40) { - *w = initial_w; - *h = initial_h; return; } if (frame < 50) { @@ -139,8 +136,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 70) { - *w = initial_w; - *h = initial_h; return; } if (frame < 80) { @@ -159,8 +154,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 110) { - *w = initial_w; - *h = initial_h; 
return; } if (frame < 120) { @@ -179,8 +172,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 150) { - *w = initial_w; - *h = initial_h; return; } if (frame < 160) { @@ -199,8 +190,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 190) { - *w = initial_w; - *h = initial_h; return; } if (frame < 200) { @@ -219,8 +208,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 230) { - *w = initial_w; - *h = initial_h; return; } if (frame < 240) { @@ -234,8 +221,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 260) { - *w = initial_w; - *h = initial_h; return; } // Go down very low. @@ -248,13 +233,9 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, // Cases that only works for VP9. // For VP9: Swap width and height of original. if (frame < 320) { - *w = initial_h; - *h = initial_w; return; } } - *w = initial_w; - *h = initial_h; } class ResizingVideoSource : public ::libvpx_test::DummyVideoSource { @@ -578,6 +559,8 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { } } +// TODO(https://crbug.com/webm/1642): This causes a segfault in +// init_encode_frame_mb_context(). 
TEST_P(ResizeRealtimeTest, DISABLED_TestExternalResizeSmallerWidthBiggerSize) { ResizingVideoSource video; video.flag_codec_ = true; @@ -794,8 +777,7 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) { } VP8_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); -VP9_INSTANTIATE_TEST_SUITE(ResizeTest, - ::testing::Values(::libvpx_test::kRealTime)); +VP9_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); VP9_INSTANTIATE_TEST_SUITE(ResizeInternalTest, ::testing::Values(::libvpx_test::kOnePassBest)); VP9_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest, diff --git a/libvpx/test/sad_test.cc b/libvpx/test/sad_test.cc index 2506f1adb..0896c77f1 100644 --- a/libvpx/test/sad_test.cc +++ b/libvpx/test/sad_test.cc @@ -311,13 +311,13 @@ class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> { ASSERT_EQ(reference_sad, exp_sad); } - void Run() { + void Run() override { params_.func(source_data_, source_stride_, reference_data_, reference_stride_); } }; -class SADavgTest : public SADTestBase<SadMxNAvgParam> { +class SADavgTest : public AbstractBench, public SADTestBase<SadMxNAvgParam> { public: SADavgTest() : SADTestBase(GetParam()) {} @@ -338,6 +338,11 @@ class SADavgTest : public SADTestBase<SadMxNAvgParam> { ASSERT_EQ(reference_sad, exp_sad); } + + void Run() override { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_, second_pred_); + } }; TEST_P(SADTest, MaxRef) { @@ -437,6 +442,19 @@ TEST_P(SADavgTest, ShortSrc) { source_stride_ = tmp_stride; } +TEST_P(SADavgTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, params_.width); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + TEST_P(SADx4Test, MaxRef) { FillConstant(source_data_, source_stride_, 0); 
FillConstant(GetReference(0), reference_stride_, mask_); @@ -517,14 +535,12 @@ TEST_P(SADx4Test, DISABLED_Speed) { uint32_t reference_sad[4]; DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); vpx_usec_timer timer; - - memset(reference_sad, 0, sizeof(reference_sad)); - SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block)); + } vpx_usec_timer_start(&timer); for (int i = 0; i < kCountSpeedTestBlock; ++i) { - for (int block = 0; block < 4; ++block) { - reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block)); - } + SADs(exp_sad); } vpx_usec_timer_mark(&timer); for (int block = 0; block < 4; ++block) { @@ -729,6 +745,45 @@ const SadMxNParam neon_tests[] = { SadMxNParam(8, 4, &vpx_sad8x4_neon), SadMxNParam(4, 8, &vpx_sad4x8_neon), SadMxNParam(4, 4, &vpx_sad4x4_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 8), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 8), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 8), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 8), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 8), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 8), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 8), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 8), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 8), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 8), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 8), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 8), + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 10), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 10), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 10), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 10), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 10), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 10), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 10), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 10), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 10), + SadMxNParam(32, 64, 
&vpx_highbd_sad32x64_neon, 10), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 10), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 10), + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 12), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 12), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 12), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 12), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 12), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 12), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 12), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 12), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 12), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 12), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 12), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH + }; INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); @@ -746,6 +801,47 @@ const SadMxNAvgParam avg_neon_tests[] = { SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon), SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon), SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 8), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 8), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 8), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 8), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 8), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 8), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 8), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 8), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 8), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 8), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 8), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 8), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 8), + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 10), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 10), + SadMxNAvgParam(8, 
4, &vpx_highbd_sad8x4_avg_neon, 10), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 10), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 10), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 10), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 10), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 10), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 10), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 10), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 10), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 10), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 10), + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 12), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 12), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 12), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 12), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 12), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 12), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 12), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 12), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 12), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 12), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 12), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 12), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); @@ -763,6 +859,44 @@ const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon), SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon), SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 8), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 8), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 8), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 8), + SadMxNx4Param(8, 16, 
&vpx_highbd_sad8x16x4d_neon, 8), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 8), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 8), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 8), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 8), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 8), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 8), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 8), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 10), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 10), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 10), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 10), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 10), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 10), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 10), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 10), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 10), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 10), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 10), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 10), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 12), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 12), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 12), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 12), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 12), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 12), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 12), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 12), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 12), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 12), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 12), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); #endif // HAVE_NEON @@ -948,6 +1082,34 @@ const SadMxNParam 
avx2_tests[] = { SadMxNParam(32, 64, &vpx_sad32x64_avx2), SadMxNParam(32, 32, &vpx_sad32x32_avx2), SadMxNParam(32, 16, &vpx_sad32x16_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 8), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 8), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 8), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 8), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 8), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 8), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 8), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 8), + + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 10), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 10), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 10), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 10), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 10), + + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 12), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 12), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 12), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 12), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); @@ -957,12 +1119,64 @@ const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_avx2), SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_avx2), SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 8), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 8), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 8), + 
SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 8), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 8), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 8), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 8), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 8), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 10), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 10), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 10), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 10), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 10), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 10), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 10), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 10), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 12), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 12), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 12), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 12), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 12), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 12), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 12), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests)); const SadMxNx4Param x4d_avx2_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 8), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 8), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 8), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 8), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 8), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8), + SadMxNx4Param(16, 8, 
&vpx_highbd_sad16x8x4d_avx2, 8), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 10), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 10), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 10), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 10), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 10), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 12), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 12), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 12), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 12), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 12), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 12), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 12), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); diff --git a/libvpx/test/svc_datarate_test.cc b/libvpx/test/svc_datarate_test.cc index 291cb0128..484252ca4 100644 --- a/libvpx/test/svc_datarate_test.cc +++ b/libvpx/test/svc_datarate_test.cc @@ -548,13 +548,16 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } if (!single_layer_resize_) { - ASSERT_EQ(pkt->data.frame.width[sl], - top_sl_width_ * svc_params_.scaling_factor_num[sl] / - svc_params_.scaling_factor_den[sl]); - - ASSERT_EQ(pkt->data.frame.height[sl], - top_sl_height_ * svc_params_.scaling_factor_num[sl] / - svc_params_.scaling_factor_den[sl]); + unsigned int scaled_width = top_sl_width_ * + svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]; + if (scaled_width % 2 != 0) scaled_width += 1; + ASSERT_EQ(pkt->data.frame.width[sl], scaled_width); + unsigned int scaled_height = top_sl_height_ * + svc_params_.scaling_factor_num[sl] / + 
svc_params_.scaling_factor_den[sl]; + if (scaled_height % 2 != 0) scaled_height += 1; + ASSERT_EQ(pkt->data.frame.height[sl], scaled_height); } else if (superframe_count_ > 0) { if (pkt->data.frame.width[sl] < prev_frame_width[sl] && pkt->data.frame.height[sl] < prev_frame_height[sl]) @@ -568,7 +571,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } - virtual void EndPassHook(void) { + virtual void EndPassHook() { if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_; duration_ = (last_pts_ + 1) * timebase_; for (int sl = 0; sl < number_spatial_layers_; ++sl) { @@ -678,6 +681,152 @@ class DatarateOnePassCbrSvcSingleBR } }; +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for 4:4:4 Profile 1. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL444Profile1) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140); + cfg_.g_profile = 1; + cfg_.g_bit_depth = VPX_BITS_8; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 352; + top_sl_height_ = 288; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3 +// temporal layers, for 4:2:2 Profile 1. 
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL3TL422Profile1) { + SetSvcConfig(2, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_8_422.y4m", 0, 20); + cfg_.g_profile = 1; + cfg_.g_bit_depth = VPX_BITS_8; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +#if CONFIG_VP9_HIGHBITDEPTH +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for Profle 2 10bit. 
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL10bitProfile2) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_10_420_20f.y4m", 0, 20); + cfg_.g_profile = 2; + cfg_.g_bit_depth = VPX_BITS_10; + cfg_.g_input_bit_depth = VPX_BITS_10; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // TODO(marpan/jianj): Comment out the rate-target checking for now + // as superframe parsing to get frame size needs to be fixed for + // high bitdepth. + /* + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); + */ +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for Profle 2 12bit. 
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL12bitProfile2) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_12_420_20f.y4m", 0, 20); + cfg_.g_profile = 2; + cfg_.g_bit_depth = VPX_BITS_12; + cfg_.g_input_bit_depth = VPX_BITS_12; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // TODO(marpan/jianj): Comment out the rate-target checking for now + // as superframe parsing to get frame size needs to be fixed for + // high bitdepth. + /* + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); + */ +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} +#endif + // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 // temporal layer, with screen content mode on and same speed setting for all // layers. @@ -1054,6 +1203,37 @@ TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassCbrSvc2SL3TL) { #endif } +// Check basic rate targeting for 1 pass VBR SVC: 2 spatial layers and +// 3 temporal layers. Run VGA clip with 1 thread. 
+TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassVbrSvc2SL3TL) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_VBR; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(2)]; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70, + 1.3); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + // Params: speed setting, layer framedrop control and index for bitrate array. 
class DatarateOnePassCbrSvcFrameDropMultiBR : public DatarateOnePassCbrSvc, diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk index 6df457290..f60d8f823 100644 --- a/libvpx/test/test.mk +++ b/libvpx/test/test.mk @@ -59,6 +59,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ext_ratectrl_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += ../vp9/simple_encode.h LIBVPX_TEST_SRCS-yes += decode_test_driver.cc LIBVPX_TEST_SRCS-yes += decode_test_driver.h diff --git a/libvpx/test/tools_common.sh b/libvpx/test/tools_common.sh index 844a12534..0e4a0a5c0 100755 --- a/libvpx/test/tools_common.sh +++ b/libvpx/test/tools_common.sh @@ -133,7 +133,7 @@ vpx_config_option_enabled() { vpx_config_option="${1}" vpx_config_file="${LIBVPX_CONFIG_PATH}/vpx_config.h" config_line=$(grep "${vpx_config_option}" "${vpx_config_file}") - if echo "${config_line}" | egrep -q '1$'; then + if echo "${config_line}" | grep -E -q '1$'; then echo yes fi } @@ -222,7 +222,7 @@ filter_strings() { if [ -n "${filter}" ]; then for s in ${strings}; do - if echo "${s}" | egrep -q ${exclude} "${filter}" > /dev/null 2>&1; then + if echo "${s}" | grep -E -q ${exclude} "${filter}" > /dev/null 2>&1; then filtered_strings="${filtered_strings} ${s}" fi done diff --git a/libvpx/test/variance_test.cc b/libvpx/test/variance_test.cc index 80855052d..a6c8ef048 100644 --- a/libvpx/test/variance_test.cc +++ b/libvpx/test/variance_test.cc @@ -488,8 +488,8 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() { } vpx_usec_timer_mark(&timer); const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); - printf("Variance %dx%d time: %5d ms\n", width(), height(), - elapsed_time / 1000); + printf("Variance %dx%d %dbpp time: %5d ms\n", width(), height(), + params_.bit_depth, elapsed_time / 1000); } 
//////////////////////////////////////////////////////////////////////////////// @@ -499,14 +499,21 @@ template <typename FunctionType> void MainTestClass<FunctionType>::RefTestMse() { for (int i = 0; i < 10; ++i) { for (int j = 0; j < block_size(); ++j) { - src_[j] = rnd_.Rand8(); - ref_[j] = rnd_.Rand8(); + if (!use_high_bit_depth()) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); +#endif // CONFIG_VP9_HIGHBITDEPTH + } } unsigned int sse1, sse2; const int stride = width(); ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1)); variance_ref(src_, ref_, params_.log2width, params_.log2height, stride, - stride, &sse2, false, VPX_BITS_8); + stride, &sse2, use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2); } } @@ -530,8 +537,15 @@ void MainTestClass<FunctionType>::RefTestSse() { template <typename FunctionType> void MainTestClass<FunctionType>::MaxTestMse() { - memset(src_, 255, block_size()); - memset(ref_, 0, block_size()); + if (!use_high_bit_depth()) { + memset(src_, 255, block_size()); + memset(ref_, 0, block_size()); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size()); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, block_size()); +#endif // CONFIG_VP9_HIGHBITDEPTH + } unsigned int sse; ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse)); const unsigned int expected = block_size() * 255 * 255; @@ -854,25 +868,25 @@ TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } -/* TODO(debargha): This test does not support the highbd version typedef MainTestClass<vpx_variance_fn_t> VpxHBDMseTest; TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); } TEST_P(VpxHBDMseTest, MaxMse) { 
MaxTestMse(); } +TEST_P(VpxHBDMseTest, DISABLED_Speed) { SpeedTest(); } INSTANTIATE_TEST_SUITE_P( C, VpxHBDMseTest, - ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c), - MseParams(4, 4, &vpx_highbd_12_mse16x8_c), - MseParams(4, 4, &vpx_highbd_12_mse8x16_c), - MseParams(4, 4, &vpx_highbd_12_mse8x8_c), - MseParams(4, 4, &vpx_highbd_10_mse16x16_c), - MseParams(4, 4, &vpx_highbd_10_mse16x8_c), - MseParams(4, 4, &vpx_highbd_10_mse8x16_c), - MseParams(4, 4, &vpx_highbd_10_mse8x8_c), - MseParams(4, 4, &vpx_highbd_8_mse16x16_c), - MseParams(4, 4, &vpx_highbd_8_mse16x8_c), - MseParams(4, 4, &vpx_highbd_8_mse8x16_c), - MseParams(4, 4, &vpx_highbd_8_mse8x8_c))); -*/ + ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_c, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_c, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_c, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_c, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_c, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_c, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_c, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_c, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_c, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_c, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_c, VPX_BITS_8))); + GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VpxHBDMseTest); INSTANTIATE_TEST_SUITE_P( @@ -1138,22 +1152,15 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0))); #if CONFIG_VP9_HIGHBITDEPTH -/* TODO(debargha): This test does not support the highbd version INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDMseTest, - ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2), - MseParams(4, 3, &vpx_highbd_12_mse16x8_sse2), - MseParams(3, 4, &vpx_highbd_12_mse8x16_sse2), - MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2), - MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2), - 
MseParams(4, 3, &vpx_highbd_10_mse16x8_sse2), - MseParams(3, 4, &vpx_highbd_10_mse8x16_sse2), - MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2), - MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2), - MseParams(4, 3, &vpx_highbd_8_mse16x8_sse2), - MseParams(3, 4, &vpx_highbd_8_mse8x16_sse2), - MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2))); -*/ + ::testing::Values( + MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2, VPX_BITS_8))); INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDVarianceTest, @@ -1495,6 +1502,224 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0), SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0), SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0))); + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDVarianceTest, + ::testing::Values( + VarianceParams(6, 6, &vpx_highbd_12_variance64x64_neon, 12), + VarianceParams(6, 5, &vpx_highbd_12_variance64x32_neon, 12), + VarianceParams(5, 6, &vpx_highbd_12_variance32x64_neon, 12), + VarianceParams(5, 5, &vpx_highbd_12_variance32x32_neon, 12), + VarianceParams(5, 4, &vpx_highbd_12_variance32x16_neon, 12), + VarianceParams(4, 5, &vpx_highbd_12_variance16x32_neon, 12), + VarianceParams(4, 4, &vpx_highbd_12_variance16x16_neon, 12), + VarianceParams(4, 3, &vpx_highbd_12_variance16x8_neon, 12), + VarianceParams(3, 4, &vpx_highbd_12_variance8x16_neon, 12), + VarianceParams(3, 3, &vpx_highbd_12_variance8x8_neon, 12), + VarianceParams(3, 2, &vpx_highbd_12_variance8x4_neon, 12), + VarianceParams(2, 3, &vpx_highbd_12_variance4x8_neon, 12), + VarianceParams(2, 2, &vpx_highbd_12_variance4x4_neon, 12), + VarianceParams(6, 6, 
&vpx_highbd_10_variance64x64_neon, 10), + VarianceParams(6, 5, &vpx_highbd_10_variance64x32_neon, 10), + VarianceParams(5, 6, &vpx_highbd_10_variance32x64_neon, 10), + VarianceParams(5, 5, &vpx_highbd_10_variance32x32_neon, 10), + VarianceParams(5, 4, &vpx_highbd_10_variance32x16_neon, 10), + VarianceParams(4, 5, &vpx_highbd_10_variance16x32_neon, 10), + VarianceParams(4, 4, &vpx_highbd_10_variance16x16_neon, 10), + VarianceParams(4, 3, &vpx_highbd_10_variance16x8_neon, 10), + VarianceParams(3, 4, &vpx_highbd_10_variance8x16_neon, 10), + VarianceParams(3, 3, &vpx_highbd_10_variance8x8_neon, 10), + VarianceParams(3, 2, &vpx_highbd_10_variance8x4_neon, 10), + VarianceParams(2, 3, &vpx_highbd_10_variance4x8_neon, 10), + VarianceParams(2, 2, &vpx_highbd_10_variance4x4_neon, 10), + VarianceParams(6, 6, &vpx_highbd_8_variance64x64_neon, 8), + VarianceParams(6, 5, &vpx_highbd_8_variance64x32_neon, 8), + VarianceParams(5, 6, &vpx_highbd_8_variance32x64_neon, 8), + VarianceParams(5, 5, &vpx_highbd_8_variance32x32_neon, 8), + VarianceParams(5, 4, &vpx_highbd_8_variance32x16_neon, 8), + VarianceParams(4, 5, &vpx_highbd_8_variance16x32_neon, 8), + VarianceParams(4, 4, &vpx_highbd_8_variance16x16_neon, 8), + VarianceParams(4, 3, &vpx_highbd_8_variance16x8_neon, 8), + VarianceParams(3, 4, &vpx_highbd_8_variance8x16_neon, 8), + VarianceParams(3, 3, &vpx_highbd_8_variance8x8_neon, 8), + VarianceParams(3, 2, &vpx_highbd_8_variance8x4_neon, 8), + VarianceParams(2, 3, &vpx_highbd_8_variance4x8_neon, 8), + VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8))); + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_neon, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_neon, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_neon, + 12), + 
SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_neon, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_neon, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_neon, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_neon, + 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_neon, + 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_neon, + 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon, + 12), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_neon, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_neon, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_neon, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_neon, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_neon, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_neon, + 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_neon, + 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_neon, + 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon, + 10), + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon, + 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon, + 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_neon, + 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_neon, + 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_neon, + 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_neon, + 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_neon, + 8), + 
SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_neon, + 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon, + 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, + 8))); + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_neon, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_neon, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_neon, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_neon, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_neon, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_neon, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_neon, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_neon, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_neon, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_neon, + 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_neon, + 12), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_neon, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_neon, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_neon, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_neon, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_neon, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_neon, + 10), + SubpelAvgVarianceParams(4, 4, + 
&vpx_highbd_10_sub_pixel_avg_variance16x16_neon, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_neon, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_neon, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_neon, + 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_neon, + 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_neon, + 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_neon, + 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_neon, + 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_neon, + 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_neon, + 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_neon, + 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_neon, + 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_neon, + 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_neon, + 8), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_8_sub_pixel_avg_variance8x8_neon, + 8), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_8_sub_pixel_avg_variance8x4_neon, + 8))); + +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON #if HAVE_MSA diff --git a/libvpx/test/vp8_datarate_test.cc b/libvpx/test/vp8_datarate_test.cc index dcd68a2d4..64a861d15 100644 --- a/libvpx/test/vp8_datarate_test.cc +++ b/libvpx/test/vp8_datarate_test.cc @@ -121,7 +121,7 @@ class DatarateTestLarge ++frame_number_; } - virtual void EndPassHook(void) { + virtual void EndPassHook() { if (bits_total_) { const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit diff --git a/libvpx/test/vp8_ratectrl_rtc_test.cc b/libvpx/test/vp8_ratectrl_rtc_test.cc index ad310666e..7410f3c01 100644 --- 
a/libvpx/test/vp8_ratectrl_rtc_test.cc +++ b/libvpx/test/vp8_ratectrl_rtc_test.cc @@ -127,8 +127,7 @@ class Vp8RcInterfaceTest encoder->Control(VP8E_SET_CPUUSED, -6); encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); - } - if (frame_params_.frame_type == INTER_FRAME) { + } else if (frame_params_.frame_type == INTER_FRAME) { // Disable golden frame update. frame_flags_ |= VP8_EFLAG_NO_UPD_GF; frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; diff --git a/libvpx/test/vp9_datarate_test.cc b/libvpx/test/vp9_datarate_test.cc index 9930c754c..7e9180749 100644 --- a/libvpx/test/vp9_datarate_test.cc +++ b/libvpx/test/vp9_datarate_test.cc @@ -9,6 +9,7 @@ */ #include "./vpx_config.h" #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" @@ -147,14 +148,16 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { if (video->frame() == 0) { encoder->Control(VP9E_SET_SVC, 1); } - vpx_svc_layer_id_t layer_id; - layer_id.spatial_layer_id = 0; - frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers); - layer_id.temporal_layer_id = - SetLayerId(video->frame(), cfg_.ts_number_layers); - layer_id.temporal_layer_id_per_spatial[0] = - SetLayerId(video->frame(), cfg_.ts_number_layers); - encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + if (cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + vpx_svc_layer_id_t layer_id; + frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers); + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = + SetLayerId(video->frame(), cfg_.ts_number_layers); + layer_id.temporal_layer_id_per_spatial[0] = + SetLayerId(video->frame(), cfg_.ts_number_layers); + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + } } const vpx_rational_t tb = video->timebase(); timebase_ = static_cast<double>(tb.num) / tb.den; @@ 
-199,7 +202,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { ++tot_frame_number_; } - virtual void EndPassHook(void) { + virtual void EndPassHook() { for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers); ++layer) { duration_ = (last_pts_ + 1) * timebase_; @@ -809,6 +812,135 @@ TEST_P(DatarateTestVP9PostEncodeDrop, PostEncodeDropScreenContent) { << " The datarate for the file is greater than target by too much!"; } +using libvpx_test::ACMRandom; + +class DatarateTestVP9FrameQp + : public DatarateTestVP9, + public ::testing::TestWithParam<const libvpx_test::CodecFactory *> { + public: + DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {} + virtual ~DatarateTestVP9FrameQp() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + ResetModel(); + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + set_cpu_used_ = 7; + DatarateTestVP9::PreEncodeFrameHook(video, encoder); + frame_qp_ = static_cast<int>(rnd_.RandRange(64)); + encoder->Control(VP9E_SET_QUANTIZER_ONE_PASS, frame_qp_); + frame_++; + } + + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + int qp = 0; + vpx_svc_layer_id_t layer_id; + if (frame_ >= total_frame_) return; + encoder->Control(VP8E_GET_LAST_QUANTIZER_64, &qp); + ASSERT_EQ(frame_qp_, qp); + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + temporal_layer_id_ = layer_id.temporal_layer_id; + } + + virtual void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) { + if (frame_ >= total_frame_) return; + ASSERT_TRUE(cfg_.temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_0212 && + temporal_layer_id_ == 2); + } + + protected: + int total_frame_; + + private: + ACMRandom rnd_; + int frame_qp_; + int frame_; + int temporal_layer_id_; +}; + +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + 
cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersBypass) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_min_quantizer = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + cfg_.rc_target_bitrate = 200; + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_min_quantizer = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). 
+ cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212; + cfg_.rc_target_bitrate = 200; + cfg_.g_error_resilient = 1; + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + #if CONFIG_VP9_TEMPORAL_DENOISING // Params: speed setting. class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { @@ -943,6 +1075,13 @@ VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9LargeVBR, ::testing::Range(5, 9), VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTime, ::testing::Range(5, 10)); +#if CONFIG_VP9 +INSTANTIATE_TEST_SUITE_P( + VP9, DatarateTestVP9FrameQp, + ::testing::Values( + static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9))); +#endif + VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeDeltaQUV, ::testing::Range(5, 10), ::testing::Values(-5, -10, -15)); diff --git a/libvpx/test/vp9_ext_ratectrl_test.cc b/libvpx/test/vp9_ext_ratectrl_test.cc index 60a350b84..2bfa6281d 100644 --- a/libvpx/test/vp9_ext_ratectrl_test.cc +++ b/libvpx/test/vp9_ext_ratectrl_test.cc @@ -16,28 +16,50 @@ #include "test/util.h" #include "test/yuv_video_source.h" #include "third_party/googletest/src/include/gtest/gtest.h" +#include "vp9/simple_encode.h" #include "vpx/vpx_ext_ratectrl.h" +#include "vpx_dsp/vpx_dsp_common.h" namespace { constexpr int kModelMagicNumber = 51396; constexpr uintptr_t PrivMagicNumber = 5566; constexpr int kFrameNum = 5; +constexpr int kFrameNumGOP = 30; +constexpr int kFrameNumGOPShort = 4; constexpr int kLosslessCodingIndex = 2; +constexpr int kFixedGOPSize = 9; +// 
The range check in vp9_cx_iface.c shows that the max +// lag in buffer is MAX_LAG_BUFFERS (25): +// RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); +constexpr int kMaxLagInFrames = 25; +constexpr int kDefaultMinGfInterval = 4; +constexpr int kDefaultMaxGfInterval = 16; +// The active gf interval might change for each GOP +// See function "get_active_gf_inverval_range". +// The numbers below are from manual inspection. +constexpr int kReadMinGfInterval = 5; +constexpr int kReadMaxGfInterval = 13; +const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv"; +const double kPsnrThreshold = 30.50; struct ToyRateCtrl { int magic_number; int coding_index; + + int gop_global_index; + int frames_since_key; + int show_index; }; vpx_rc_status_t rc_create_model(void *priv, const vpx_rc_config_t *ratectrl_config, - vpx_rc_model_t *rate_ctrl_model_pt) { + vpx_rc_model_t *rate_ctrl_model_ptr) { ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; - EXPECT_NE(toy_rate_ctrl, nullptr); + if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; toy_rate_ctrl->magic_number = kModelMagicNumber; toy_rate_ctrl->coding_index = -1; - *rate_ctrl_model_pt = toy_rate_ctrl; + *rate_ctrl_model_ptr = toy_rate_ctrl; EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber)); EXPECT_EQ(ratectrl_config->frame_width, 352); EXPECT_EQ(ratectrl_config->frame_height, 288); @@ -48,6 +70,48 @@ vpx_rc_status_t rc_create_model(void *priv, return VPX_RC_OK; } +vpx_rc_status_t rc_create_model_gop(void *priv, + const vpx_rc_config_t *ratectrl_config, + vpx_rc_model_t *rate_ctrl_model_ptr) { + ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; + if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; + toy_rate_ctrl->magic_number = kModelMagicNumber; + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + toy_rate_ctrl->show_index = 0; + toy_rate_ctrl->coding_index = 0; + *rate_ctrl_model_ptr = toy_rate_ctrl; + EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber)); + 
EXPECT_EQ(ratectrl_config->frame_width, 640); + EXPECT_EQ(ratectrl_config->frame_height, 360); + EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOP); + EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 4000); + EXPECT_EQ(ratectrl_config->frame_rate_num, 30); + EXPECT_EQ(ratectrl_config->frame_rate_den, 1); + return VPX_RC_OK; +} + +vpx_rc_status_t rc_create_model_gop_short( + void *priv, const vpx_rc_config_t *ratectrl_config, + vpx_rc_model_t *rate_ctrl_model_ptr) { + ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl; + if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR; + toy_rate_ctrl->magic_number = kModelMagicNumber; + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + toy_rate_ctrl->show_index = 0; + toy_rate_ctrl->coding_index = 0; + *rate_ctrl_model_ptr = toy_rate_ctrl; + EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber)); + EXPECT_EQ(ratectrl_config->frame_width, 352); + EXPECT_EQ(ratectrl_config->frame_height, 288); + EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOPShort); + EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 500); + EXPECT_EQ(ratectrl_config->frame_rate_num, 30); + EXPECT_EQ(ratectrl_config->frame_rate_den, 1); + return VPX_RC_OK; +} + vpx_rc_status_t rc_send_firstpass_stats( vpx_rc_model_t rate_ctrl_model, const vpx_rc_firstpass_stats_t *first_pass_stats) { @@ -61,6 +125,32 @@ vpx_rc_status_t rc_send_firstpass_stats( return VPX_RC_OK; } +vpx_rc_status_t rc_send_firstpass_stats_gop( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_firstpass_stats_t *first_pass_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOP); + for (int i = 0; i < first_pass_stats->num_frames; ++i) { + EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); + } + return VPX_RC_OK; +} + +vpx_rc_status_t rc_send_firstpass_stats_gop_short( + vpx_rc_model_t 
rate_ctrl_model, + const vpx_rc_firstpass_stats_t *first_pass_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOPShort); + for (int i = 0; i < first_pass_stats->num_frames; ++i) { + EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); + } + return VPX_RC_OK; +} + vpx_rc_status_t rc_get_encodeframe_decision( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *encode_frame_info, @@ -76,19 +166,17 @@ vpx_rc_status_t rc_get_encodeframe_decision( if (encode_frame_info->coding_index == 0) { EXPECT_EQ(encode_frame_info->show_index, 0); EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 0); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], 0); // kRefFrameTypePast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], 0); // kRefFrameTypeFuture - } - - if (encode_frame_info->coding_index == 1) { + } else if (encode_frame_info->coding_index == 1) { EXPECT_EQ(encode_frame_info->show_index, 4); EXPECT_EQ(encode_frame_info->gop_index, 1); - EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 1); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], @@ -97,19 +185,15 @@ vpx_rc_status_t rc_get_encodeframe_decision( 0); // kRefFrameTypeFuture EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], 0); // kRefFrameTypeLast - } - - if (encode_frame_info->coding_index >= 2 && - encode_frame_info->coding_index < 5) { + } else if (encode_frame_info->coding_index >= 2 && + encode_frame_info->coding_index < 5) { // In the first group of pictures, coding_index 
and gop_index are equal. EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index); - EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/); - } - - if (encode_frame_info->coding_index == 5) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + } else if (encode_frame_info->coding_index == 5) { EXPECT_EQ(encode_frame_info->show_index, 4); EXPECT_EQ(encode_frame_info->gop_index, 0); - EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], 1); // kRefFrameTypeLast EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], @@ -133,6 +217,388 @@ vpx_rc_status_t rc_get_encodeframe_decision( return VPX_RC_OK; } +vpx_rc_status_t rc_get_encodeframe_decision_gop( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOP); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 1); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + 
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + } else if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + } else if (encode_frame_info->coding_index == 3 || + encode_frame_info->coding_index == 12 || + encode_frame_info->coding_index == 21) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); + EXPECT_EQ(encode_frame_info->gop_index, 1); + } else if (encode_frame_info->coding_index == 11 || + encode_frame_info->coding_index == 20 || + encode_frame_info->coding_index == 29) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); + EXPECT_EQ(encode_frame_info->gop_index, 0); + } else if (encode_frame_info->coding_index >= 30) { + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + } + + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. 
+ frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_encodeframe_decision_gop_short( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 1); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 2); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if 
(encode_frame_info->coding_index == 3) { + EXPECT_EQ(encode_frame_info->show_index, 3); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); + } + + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. + frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 3); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // 
kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 1); + EXPECT_EQ(encode_frame_info->gop_index, 2); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 3) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 3); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 4) { + EXPECT_EQ(encode_frame_info->show_index, 3); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); + } + + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. 
+ frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_encodeframe_decision_gop_short_no_arf( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, + vpx_rc_encodeframe_decision_t *frame_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + if (encode_frame_info->coding_index == 0) { + EXPECT_EQ(encode_frame_info->show_index, 0); + EXPECT_EQ(encode_frame_info->gop_index, 0); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 1) { + EXPECT_EQ(encode_frame_info->show_index, 1); + EXPECT_EQ(encode_frame_info->gop_index, 1); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0], + 1); // kRefFrameTypeLast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1], + 0); // kRefFrameTypePast + EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2], + 0); // kRefFrameTypeFuture + EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0], + 0); // kRefFrameTypeLast + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if (encode_frame_info->coding_index == 2) { + EXPECT_EQ(encode_frame_info->show_index, 2); + EXPECT_EQ(encode_frame_info->gop_index, 2); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } else if 
(encode_frame_info->coding_index == 3) { + EXPECT_EQ(encode_frame_info->show_index, 3); + EXPECT_EQ(encode_frame_info->gop_index, 3); + EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); + } + + // When the model recommends an invalid q, valid range [0, 255], + // the encoder will ignore it and use the default q selected + // by libvpx rate control strategy. + frame_decision->q_index = VPX_DEFAULT_Q; + frame_decision->max_frame_size = 0; + + toy_rate_ctrl->coding_index += 1; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model, + const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->active_min_gf_interval, kReadMinGfInterval); + EXPECT_EQ(gop_info->active_max_gf_interval, kReadMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); + } + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = + VPXMIN(kFixedGOPSize, gop_info->frames_to_key); + gop_decision->use_alt_ref = gop_decision->gop_coding_frames == kFixedGOPSize; + toy_rate_ctrl->frames_since_key += + 
gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + ++toy_rate_ctrl->gop_global_index; + return VPX_RC_OK; +} + +// Test on a 4 frame video. +// Test a setting of 2 GOPs. +// The first GOP has 3 coding frames, no alt ref. +// The second GOP has 1 coding frame, no alt ref. +vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model, + const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + } + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 3 : 1; + gop_decision->use_alt_ref = 0; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + ++toy_rate_ctrl->gop_global_index; + return VPX_RC_OK; +} + +// Test on a 4 frame video. +// Test a setting of 2 GOPs. +// The first GOP has 4 coding frames. Use alt ref. 
+// The second GOP only contains the overlay frame of the first GOP's alt ref +// frame. +vpx_rc_status_t rc_get_gop_decision_short_overlay( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1); + } + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1; + gop_decision->use_alt_ref = gop_info->is_key_frame ? 1 : 0; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + ++toy_rate_ctrl->gop_global_index; + return VPX_RC_OK; +} + +// Test on a 4 frame video. +// Test a setting of 1 GOP. +// The GOP has 4 coding frames. Do not use alt ref. 
+vpx_rc_status_t rc_get_gop_decision_short_no_arf( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision) { + ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1); + EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval); + EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval); + EXPECT_EQ(gop_info->allow_alt_ref, 1); + if (gop_info->is_key_frame) { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + EXPECT_EQ(gop_info->frames_since_key, 0); + EXPECT_EQ(gop_info->gop_global_index, 0); + toy_rate_ctrl->gop_global_index = 0; + toy_rate_ctrl->frames_since_key = 0; + } else { + EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0); + } + EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index); + EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key); + EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index); + EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index); + + gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 
4 : 1; + gop_decision->use_alt_ref = 0; + toy_rate_ctrl->frames_since_key += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + toy_rate_ctrl->show_index += + gop_decision->gop_coding_frames - gop_decision->use_alt_ref; + ++toy_rate_ctrl->gop_global_index; + return VPX_RC_OK; +} + vpx_rc_status_t rc_update_encodeframe_result( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_result_t *encode_frame_result) { @@ -153,6 +619,43 @@ vpx_rc_status_t rc_update_encodeframe_result( return VPX_RC_OK; } +vpx_rc_status_t rc_update_encodeframe_result_gop( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_result_t *encode_frame_result) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + + const int64_t ref_pixel_count = 640 * 360 * 3 / 2; + EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); + return VPX_RC_OK; +} + +vpx_rc_status_t rc_update_encodeframe_result_gop_short( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_result_t *encode_frame_result) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + + const int64_t ref_pixel_count = 352 * 288 * 3 / 2; + EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count); + return VPX_RC_OK; +} + +vpx_rc_status_t rc_get_default_frame_rdmult( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_info_t *encode_frame_info, int *rdmult) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort); + EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index); + + *rdmult = VPX_DEFAULT_RDMULT; + return VPX_RC_OK; +} + vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) { ToyRateCtrl *toy_rate_ctrl = 
static_cast<ToyRateCtrl *>(rate_ctrl_model); EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); @@ -176,6 +679,7 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_QP; rc_funcs.create_model = rc_create_model; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats; rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision; @@ -195,8 +699,266 @@ TEST_F(ExtRateCtrlTest, EncodeTest) { "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNum)); - ASSERT_NE(video.get(), nullptr); + ASSERT_NE(video, nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } +class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam<int> { + protected: + ExtRateCtrlTestGOP() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOP() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + + vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_create_model_gop; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop; + rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop; + rc_funcs.get_gop_decision = rc_get_gop_decision; + rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOP, EncodeTest) { + cfg_.rc_target_bitrate = 4000; + cfg_.g_lag_in_frames = kMaxLagInFrames; 
+ cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr<libvpx_test::VideoSource> video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0, + kFrameNumGOP)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam<int> { + protected: + ExtRateCtrlTestGOPShort() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOPShort() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); + + vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_create_model_gop_short; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short; + rc_funcs.get_gop_decision = rc_get_gop_decision_short; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOPShort, EncodeTest) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr<libvpx_test::VideoSource> video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); + 
ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestGOPShortOverlay + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam<int> { + protected: + ExtRateCtrlTestGOPShortOverlay() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOPShortOverlay() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); + + vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_create_model_gop_short; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = + rc_get_encodeframe_decision_gop_short_overlay; + rc_funcs.get_gop_decision = rc_get_gop_decision_short_overlay; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOPShortOverlay, EncodeTest) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr<libvpx_test::VideoSource> video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestGOPShortNoARF + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam<int> { + protected: + ExtRateCtrlTestGOPShortNoARF() : 
EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestGOPShortNoARF() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); + encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); + encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); + + vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_create_model_gop_short; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = + rc_get_encodeframe_decision_gop_short_no_arf; + rc_funcs.get_gop_decision = rc_get_gop_decision_short_no_arf; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } +}; + +TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + + std::unique_ptr<libvpx_test::VideoSource> video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest, + public ::testing::Test { + protected: + ExtRateCtrlTestRdmult() : EncoderTest(&::libvpx_test::kVP9) {} + + ~ExtRateCtrlTestRdmult() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kTwoPassGood); + } + + void BeginPassHook(unsigned int) override { + psnr_ = 0.0; + nframes_ = 0; + } + + void PSNRPktHook(const 
vpx_codec_cx_pkt_t *pkt) override { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + vpx_rc_funcs_t rc_funcs; + rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT; + rc_funcs.create_model = rc_create_model_gop_short; + rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; + rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short; + rc_funcs.get_gop_decision = rc_get_gop_decision_short; + rc_funcs.update_encodeframe_result = + rc_update_encodeframe_result_gop_short; + rc_funcs.get_frame_rdmult = rc_get_default_frame_rdmult; + rc_funcs.delete_model = rc_delete_model; + rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber); + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + private: + double psnr_; + unsigned int nframes_; +}; + +TEST_F(ExtRateCtrlTestRdmult, DefaultRdmult) { + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = kMaxLagInFrames - 1; + cfg_.rc_end_usage = VPX_VBR; + init_flags_ = VPX_CODEC_USE_PSNR; + + std::unique_ptr<libvpx_test::VideoSource> video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, kPsnrThreshold); +} + } // namespace diff --git a/libvpx/test/vp9_quantize_test.cc b/libvpx/test/vp9_quantize_test.cc index ca1062a76..587cec692 100644 --- a/libvpx/test/vp9_quantize_test.cc +++ b/libvpx/test/vp9_quantize_test.cc @@ -67,6 +67,45 @@ void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); } +void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, 
int16_t *round, + int16_t *quant, int16_t *quant_shift, + int16_t *dequant, int16_t *round_fp, + int16_t *quant_fp) { + // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V. + constexpr int kMaxQRoundingFactorFp = 64; + + for (int j = 0; j < 2; j++) { + // The range is 4 to 1828 in the VP9 tables. + const int qlookup = rnd->RandRange(1825) + 4; + round_fp[j] = (kMaxQRoundingFactorFp * qlookup) >> 7; + quant_fp[j] = (1 << 16) / qlookup; + + // Values determined by deconstructing vp9_init_quantizer(). + // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y + // values or U/V values of any bit depth. This is because y_delta is not + // factored into the vp9_ac_quant() call. + zbin[j] = rnd->RandRange(1200); + + // round may be up to 685 for Y values or 914 for U/V. + round[j] = rnd->RandRange(914); + // quant ranges from 1 to -32703 + quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703; + // quant_shift goes up to 1 << 16. + quant_shift[j] = rnd->RandRange(16384); + // dequant maxes out at 1828 for all cases. 
+ dequant[j] = rnd->RandRange(1828); + } + for (int j = 2; j < 8; j++) { + zbin[j] = zbin[1]; + round_fp[j] = round_fp[1]; + quant_fp[j] = quant_fp[1]; + round[j] = round[1]; + quant[j] = quant[1]; + quant_shift[j] = quant_shift[1]; + dequant[j] = dequant[1]; + } +} + class VP9QuantizeBase : public AbstractBench { public: VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp) @@ -148,6 +187,7 @@ class VP9QuantizeTest : public VP9QuantizeBase, protected: virtual void Run(); + void Speed(bool is_median); const QuantizeFunc quantize_op_; const QuantizeFunc ref_quantize_op_; }; @@ -159,6 +199,101 @@ void VP9QuantizeTest::Run() { scan_->iscan); } +void VP9QuantizeTest::Speed(bool is_median) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + ASSERT_TRUE(coeff_.Init()); + ASSERT_TRUE(qcoeff_.Init()); + ASSERT_TRUE(dqcoeff_.Init()); + TX_SIZE starting_sz, ending_sz; + + if (max_size_ == 16) { + starting_sz = TX_4X4; + ending_sz = TX_16X16; + } else { + starting_sz = TX_32X32; + ending_sz = TX_32X32; + } + + for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { + // zbin > coeff, zbin < coeff. + for (int i = 0; i < 2; ++i) { + // TX_TYPE defines the scan order. That is not relevant to the speed test. + // Pick the first one. + const TX_TYPE tx_type = DCT_DCT; + count_ = (4 << sz) * (4 << sz); + scan_ = &vp9_scan_orders[sz][tx_type]; + + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + + if (i == 0) { + // When |coeff values| are less than zbin the results are 0. + int threshold = 100; + if (max_size_ == 32) { + // For 32x32, the threshold is halved. Double it to keep the values + // from clearing it. + threshold = 200; + } + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; + coeff_.Set(&rnd, -99, 99); + } else if (i == 1) { + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; + coeff_.Set(&rnd, -500, 500); + } + + const char *type = + (i == 0) ? 
"Bypass calculations " : "Full calculations "; + char block_size[16]; + snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); + char title[100]; + snprintf(title, sizeof(title), "%25s %8s ", type, block_size); + + if (is_median) { + RunNTimes(10000000 / count_); + PrintMedian(title); + } else { + Buffer<tran_low_t> ref_qcoeff = + Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer<tran_low_t> ref_dqcoeff = + Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t ref_eob = 0; + + const int kNumTests = 5000000; + vpx_usec_timer timer, simd_timer; + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, + q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, + scan_->scan, scan_->iscan); + } + vpx_usec_timer_mark(&timer); + + vpx_usec_timer_start(&simd_timer); + for (int n = 0; n < kNumTests; ++n) { + quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, + scan_->scan, scan_->iscan); + } + vpx_usec_timer_mark(&simd_timer); + + const int elapsed_time = + static_cast<int>(vpx_usec_timer_elapsed(&timer)); + const int simd_elapsed_time = + static_cast<int>(vpx_usec_timer_elapsed(&simd_timer)); + printf("%s c_time = %d \t simd_time = %d \t Gain = %f \n", title, + elapsed_time, simd_elapsed_time, + ((float)elapsed_time / simd_elapsed_time)); + } + } + } +} + // This quantizer compares the AC coefficients to the quantization step size to // determine if further multiplication operations are needed. // Based on vp9_quantize_fp_sse2(). 
@@ -254,45 +389,6 @@ void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1); } -void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, - int16_t *quant, int16_t *quant_shift, - int16_t *dequant, int16_t *round_fp, - int16_t *quant_fp) { - // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V. - const int max_qrounding_factor_fp = 64; - - for (int j = 0; j < 2; j++) { - // The range is 4 to 1828 in the VP9 tables. - const int qlookup = rnd->RandRange(1825) + 4; - round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7; - quant_fp[j] = (1 << 16) / qlookup; - - // Values determined by deconstructing vp9_init_quantizer(). - // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y - // values or U/V values of any bit depth. This is because y_delta is not - // factored into the vp9_ac_quant() call. - zbin[j] = rnd->RandRange(1200); - - // round may be up to 685 for Y values or 914 for U/V. - round[j] = rnd->RandRange(914); - // quant ranges from 1 to -32703 - quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703; - // quant_shift goes up to 1 << 16. - quant_shift[j] = rnd->RandRange(16384); - // dequant maxes out at 1828 for all cases. 
- dequant[j] = rnd->RandRange(1828); - } - for (int j = 2; j < 8; j++) { - zbin[j] = zbin[1]; - round_fp[j] = round_fp[1]; - quant_fp[j] = quant_fp[1]; - round[j] = round[1]; - quant[j] = quant[1]; - quant_shift[j] = quant_shift[1]; - dequant[j] = dequant[1]; - } -} - TEST_P(VP9QuantizeTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); ASSERT_TRUE(coeff_.Init()); @@ -403,60 +499,9 @@ TEST_P(VP9QuantizeTest, EOBCheck) { } } -TEST_P(VP9QuantizeTest, DISABLED_Speed) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - ASSERT_TRUE(coeff_.Init()); - ASSERT_TRUE(qcoeff_.Init()); - ASSERT_TRUE(dqcoeff_.Init()); - TX_SIZE starting_sz, ending_sz; - - if (max_size_ == 16) { - starting_sz = TX_4X4; - ending_sz = TX_16X16; - } else { - starting_sz = TX_32X32; - ending_sz = TX_32X32; - } - - for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { - // zbin > coeff, zbin < coeff. - for (int i = 0; i < 2; ++i) { - // TX_TYPE defines the scan order. That is not relevant to the speed test. - // Pick the first one. - const TX_TYPE tx_type = DCT_DCT; - count_ = (4 << sz) * (4 << sz); - scan_ = &vp9_scan_orders[sz][tx_type]; - - GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, - quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, - quant_fp_ptr_); - - if (i == 0) { - // When |coeff values| are less than zbin the results are 0. - int threshold = 100; - if (max_size_ == 32) { - // For 32x32, the threshold is halved. Double it to keep the values - // from clearing it. - threshold = 200; - } - for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; - coeff_.Set(&rnd, -99, 99); - } else if (i == 1) { - for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; - coeff_.Set(&rnd, -500, 500); - } +TEST_P(VP9QuantizeTest, DISABLED_Speed) { Speed(false); } - RunNTimes(10000000 / count_); - const char *type = - (i == 0) ? 
"Bypass calculations " : "Full calculations "; - char block_size[16]; - snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); - char title[100]; - snprintf(title, sizeof(title), "%25s %8s ", type, block_size); - PrintMedian(title); - } - } -} +TEST_P(VP9QuantizeTest, DISABLED_SpeedMedian) { Speed(true); } using std::make_tuple; @@ -467,6 +512,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>, + &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true), make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, @@ -492,7 +539,6 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSE2 #if HAVE_SSSE3 -#if VPX_ARCH_X86_64 INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, @@ -506,16 +552,6 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>, &QuantFPWrapper<quantize_fp_32x32_nz_c>, VPX_BITS_8, 32, true))); -#else -INSTANTIATE_TEST_SUITE_P( - SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false))); - -#endif // VPX_ARCH_X86_64 #endif // HAVE_SSSE3 #if HAVE_AVX @@ -529,14 +565,78 @@ INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + AVX2, VP9QuantizeTest, + ::testing::Values( + make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>, + &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_avx2>, + &QuantFPWrapper<vp9_highbd_quantize_fp_c>, VPX_BITS_12, 16, + true), + make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>, + 
&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12, + 32, true), + make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); +#else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>, &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, - 16, true))); + 16, true), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_avx2>, + &QuantFPWrapper<quantize_fp_32x32_nz_c>, + VPX_BITS_8, 32, true), + make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx2, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX2 #if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + 
make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>, + &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>, + &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32, + true))); +#else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, @@ -550,6 +650,7 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>, &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32, true))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/test/vp9_ratectrl_rtc_test.cc b/libvpx/test/vp9_ratectrl_rtc_test.cc index b09a45bb7..1d1a78f43 100644 --- a/libvpx/test/vp9_ratectrl_rtc_test.cc +++ b/libvpx/test/vp9_ratectrl_rtc_test.cc @@ -26,7 +26,11 @@ namespace { const size_t kNumFrames = 300; -const int kTemporalId[4] = { 0, 2, 1, 2 }; +const int kTemporalId3Layer[4] = { 0, 2, 1, 2 }; +const int kTemporalId2Layer[2] = { 0, 1 }; +const int kTemporalRateAllocation3Layer[3] = { 50, 70, 100 }; +const int kTemporalRateAllocation2Layer[2] = { 60, 100 }; +const int kSpatialLayerBitrate[3] = { 200, 400, 1000 }; class RcInterfaceTest : public ::libvpx_test::EncoderTest, @@ -179,28 +183,73 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_SVC, 1); encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); } - - frame_params_.frame_type = video->frame() == 0 ? KEY_FRAME : INTER_FRAME; - if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) { - // Disable golden frame update. 
- frame_flags_ |= VP8_EFLAG_NO_UPD_GF; - frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; - } + frame_params_.frame_type = + video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; encoder_exit_ = video->frame() == kNumFrames; current_superframe_ = video->frame(); + if (dynamic_spatial_layers_ == 1) { + if (video->frame() == 100) { + // Go down to 2 spatial layers: set top SL to 0 bitrate. + // Update the encoder config. + cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8]; + cfg_.layer_target_bitrate[6] = 0; + cfg_.layer_target_bitrate[7] = 0; + cfg_.layer_target_bitrate[8] = 0; + encoder->Config(&cfg_); + // Update the RC config. + rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[8]; + rc_cfg_.layer_target_bitrate[6] = 0; + rc_cfg_.layer_target_bitrate[7] = 0; + rc_cfg_.layer_target_bitrate[8] = 0; + rc_api_->UpdateRateControl(rc_cfg_); + } else if (video->frame() == 200) { + // Go down to 1 spatial layer. + // Update the encoder config. + cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[5]; + cfg_.layer_target_bitrate[3] = 0; + cfg_.layer_target_bitrate[4] = 0; + cfg_.layer_target_bitrate[5] = 0; + encoder->Config(&cfg_); + // Update the RC config. + rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[5]; + rc_cfg_.layer_target_bitrate[3] = 0; + rc_cfg_.layer_target_bitrate[4] = 0; + rc_cfg_.layer_target_bitrate[5] = 0; + rc_api_->UpdateRateControl(rc_cfg_); + } else if (0 && video->frame() == 280) { + // TODO(marpan): Re-enable this going back up when issue is fixed. + // Go back up to 3 spatial layers. + // Update the encoder config: use the original bitrates. + SetEncoderConfigSvc(3, 3); + encoder->Config(&cfg_); + // Update the RC config. 
+ SetRCConfigSvc(3, 3); + rc_api_->UpdateRateControl(rc_cfg_); + } + } } virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0; while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { ParseSuperframeSizes(static_cast<const uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz); for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { - frame_params_.spatial_layer_id = sl; - frame_params_.temporal_layer_id = kTemporalId[current_superframe_ % 4]; - rc_api_->ComputeQP(frame_params_); - frame_params_.frame_type = INTER_FRAME; - rc_api_->PostEncodeUpdate(sizes_[sl]); + if (sizes_[sl] > 0) { + frame_params_.spatial_layer_id = sl; + if (rc_cfg_.ts_number_layers == 3) + frame_params_.temporal_layer_id = + kTemporalId3Layer[current_superframe_ % 4]; + else if (rc_cfg_.ts_number_layers == 2) + frame_params_.temporal_layer_id = + kTemporalId2Layer[current_superframe_ % 2]; + else + frame_params_.temporal_layer_id = 0; + rc_api_->ComputeQP(frame_params_); + frame_params_.frame_type = INTER_FRAME; + rc_api_->PostEncodeUpdate(sizes_[sl]); + } } } if (!encoder_exit_) { @@ -218,9 +267,37 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, const vpx_image_t * /*img2*/) {} void RunSvc() { - SetConfigSvc(); + dynamic_spatial_layers_ = 0; + SetRCConfigSvc(3, 3); + key_interval_ = 10000; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcPeriodicKey() { + dynamic_spatial_layers_ = 0; + SetRCConfigSvc(3, 3); + key_interval_ = 100; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + 
ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcDynamicSpatial() { + dynamic_spatial_layers_ = 1; + SetRCConfigSvc(3, 3); + key_interval_ = 10000; rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); - SetEncoderSvc(); + SetEncoderConfigSvc(3, 3); ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, 720, 30, 1, 0, kNumFrames); @@ -256,30 +333,54 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, return VPX_CODEC_OK; } - void SetEncoderSvc() { - cfg_.ss_number_layers = 3; - cfg_.ts_number_layers = 3; + void SetEncoderConfigSvc(int number_spatial_layers, + int number_temporal_layers) { + cfg_.g_w = 1280; + cfg_.g_h = 720; + cfg_.ss_number_layers = number_spatial_layers; + cfg_.ts_number_layers = number_temporal_layers; cfg_.g_timebase.num = 1; cfg_.g_timebase.den = 30; - svc_params_.scaling_factor_num[0] = 72; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 144; - svc_params_.scaling_factor_den[1] = 288; - svc_params_.scaling_factor_num[2] = 288; - svc_params_.scaling_factor_den[2] = 288; + if (number_spatial_layers == 3) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 4; + svc_params_.scaling_factor_num[1] = 2; + svc_params_.scaling_factor_den[1] = 4; + svc_params_.scaling_factor_num[2] = 4; + svc_params_.scaling_factor_den[2] = 4; + } else if (number_spatial_layers == 2) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 2; + svc_params_.scaling_factor_num[1] = 2; + svc_params_.scaling_factor_den[1] = 2; + } else if (number_spatial_layers == 1) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 1; + } + for (int i = 0; i < VPX_MAX_LAYERS; ++i) { svc_params_.max_quantizers[i] = 56; svc_params_.min_quantizers[i] = 2; svc_params_.speed_per_layer[i] = 7; + svc_params_.loopfilter_ctrl[i] = LOOPFILTER_ALL; } cfg_.rc_end_usage = VPX_CBR; cfg_.g_lag_in_frames = 0; cfg_.g_error_resilient = 0; - // 
3 temporal layers - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.temporal_layering_mode = 3; + + if (number_temporal_layers == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + } else if (number_temporal_layers == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.temporal_layering_mode = 2; + } else if (number_temporal_layers == 1) { + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + } cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 600; @@ -288,27 +389,39 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, cfg_.rc_max_quantizer = 56; cfg_.g_threads = 1; cfg_.kf_max_dist = 9999; - cfg_.rc_target_bitrate = 1600; cfg_.rc_overshoot_pct = 50; cfg_.rc_undershoot_pct = 50; - cfg_.layer_target_bitrate[0] = 100; - cfg_.layer_target_bitrate[1] = 140; - cfg_.layer_target_bitrate[2] = 200; - cfg_.layer_target_bitrate[3] = 250; - cfg_.layer_target_bitrate[4] = 350; - cfg_.layer_target_bitrate[5] = 500; - cfg_.layer_target_bitrate[6] = 450; - cfg_.layer_target_bitrate[7] = 630; - cfg_.layer_target_bitrate[8] = 900; + cfg_.rc_target_bitrate = 0; + for (int sl = 0; sl < number_spatial_layers; sl++) { + int spatial_bitrate = 0; + if (number_spatial_layers <= 3) + spatial_bitrate = kSpatialLayerBitrate[sl]; + for (int tl = 0; tl < number_temporal_layers; tl++) { + int layer = sl * number_temporal_layers + tl; + if (number_temporal_layers == 3) + cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 2) + cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 1) + cfg_.layer_target_bitrate[layer] = spatial_bitrate; + } + cfg_.rc_target_bitrate += spatial_bitrate; + } + + cfg_.kf_min_dist = key_interval_; + 
cfg_.kf_max_dist = key_interval_; } - void SetConfigSvc() { + void SetRCConfigSvc(int number_spatial_layers, int number_temporal_layers) { rc_cfg_.width = 1280; rc_cfg_.height = 720; + rc_cfg_.ss_number_layers = number_spatial_layers; + rc_cfg_.ts_number_layers = number_temporal_layers; rc_cfg_.max_quantizer = 56; rc_cfg_.min_quantizer = 2; - rc_cfg_.target_bandwidth = 1600; rc_cfg_.buf_initial_sz = 500; rc_cfg_.buf_optimal_sz = 600; rc_cfg_.buf_sz = 1000; @@ -316,31 +429,55 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, rc_cfg_.overshoot_pct = 50; rc_cfg_.max_intra_bitrate_pct = 900; rc_cfg_.framerate = 30.0; - rc_cfg_.ss_number_layers = 3; - rc_cfg_.ts_number_layers = 3; rc_cfg_.rc_mode = VPX_CBR; rc_cfg_.aq_mode = aq_mode_; - rc_cfg_.scaling_factor_num[0] = 1; - rc_cfg_.scaling_factor_den[0] = 4; - rc_cfg_.scaling_factor_num[1] = 2; - rc_cfg_.scaling_factor_den[1] = 4; - rc_cfg_.scaling_factor_num[2] = 4; - rc_cfg_.scaling_factor_den[2] = 4; - - rc_cfg_.ts_rate_decimator[0] = 4; - rc_cfg_.ts_rate_decimator[1] = 2; - rc_cfg_.ts_rate_decimator[2] = 1; - - rc_cfg_.layer_target_bitrate[0] = 100; - rc_cfg_.layer_target_bitrate[1] = 140; - rc_cfg_.layer_target_bitrate[2] = 200; - rc_cfg_.layer_target_bitrate[3] = 250; - rc_cfg_.layer_target_bitrate[4] = 350; - rc_cfg_.layer_target_bitrate[5] = 500; - rc_cfg_.layer_target_bitrate[6] = 450; - rc_cfg_.layer_target_bitrate[7] = 630; - rc_cfg_.layer_target_bitrate[8] = 900; + if (number_spatial_layers == 3) { + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 4; + rc_cfg_.scaling_factor_num[1] = 2; + rc_cfg_.scaling_factor_den[1] = 4; + rc_cfg_.scaling_factor_num[2] = 4; + rc_cfg_.scaling_factor_den[2] = 4; + } else if (number_spatial_layers == 2) { + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 2; + rc_cfg_.scaling_factor_num[1] = 2; + rc_cfg_.scaling_factor_den[1] = 2; + } else if (number_spatial_layers == 1) { + rc_cfg_.scaling_factor_num[0] = 1; + 
rc_cfg_.scaling_factor_den[0] = 1; + } + + if (number_temporal_layers == 3) { + rc_cfg_.ts_rate_decimator[0] = 4; + rc_cfg_.ts_rate_decimator[1] = 2; + rc_cfg_.ts_rate_decimator[2] = 1; + } else if (number_temporal_layers == 2) { + rc_cfg_.ts_rate_decimator[0] = 2; + rc_cfg_.ts_rate_decimator[1] = 1; + } else if (number_temporal_layers == 1) { + rc_cfg_.ts_rate_decimator[0] = 1; + } + + rc_cfg_.target_bandwidth = 0; + for (int sl = 0; sl < number_spatial_layers; sl++) { + int spatial_bitrate = 0; + if (number_spatial_layers <= 3) + spatial_bitrate = kSpatialLayerBitrate[sl]; + for (int tl = 0; tl < number_temporal_layers; tl++) { + int layer = sl * number_temporal_layers + tl; + if (number_temporal_layers == 3) + rc_cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 2) + rc_cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 1) + rc_cfg_.layer_target_bitrate[layer] = spatial_bitrate; + } + rc_cfg_.target_bandwidth += spatial_bitrate; + } for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) { for (int tl = 0; tl < rc_cfg_.ts_number_layers; ++tl) { @@ -359,6 +496,8 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, bool encoder_exit_; int current_superframe_; uint32_t sizes_[8]; + int key_interval_; + int dynamic_spatial_layers_; }; TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } @@ -367,6 +506,10 @@ TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } +TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); } + +TEST_P(RcInterfaceSvcTest, SvcDynamicSpatial) { RunSvcDynamicSpatial(); } + VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), ::testing::Values(VPX_CBR, VPX_VBR)); VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3)); diff --git 
a/libvpx/test/vp9_subtract_test.cc b/libvpx/test/vp9_subtract_test.cc index 211cc6c7a..a57082f1e 100644 --- a/libvpx/test/vp9_subtract_test.cc +++ b/libvpx/test/vp9_subtract_test.cc @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include <tuple> #include "third_party/googletest/src/include/gtest/gtest.h" @@ -17,9 +18,11 @@ #include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" +#include "test/util.h" #include "vp9/common/vp9_blockd.h" #include "vpx_ports/msvc.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, @@ -133,6 +136,10 @@ INSTANTIATE_TEST_SUITE_P(C, VP9SubtractBlockTest, INSTANTIATE_TEST_SUITE_P(SSE2, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_sse2)); #endif +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_avx2)); +#endif #if HAVE_NEON INSTANTIATE_TEST_SUITE_P(NEON, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_neon)); @@ -157,4 +164,158 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_lsx)); #endif +#if CONFIG_VP9_HIGHBITDEPTH + +typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride, int bd); + +// <BLOCK_SIZE, bit_depth, optimized subtract func, reference subtract func> +using Params = std::tuple<BLOCK_SIZE, int, HBDSubtractFunc, HBDSubtractFunc>; + +class VPXHBDSubtractBlockTest : public ::testing::TestWithParam<Params> { + public: + virtual void SetUp() { + block_width_ = 4 * num_4x4_blocks_wide_lookup[GET_PARAM(0)]; + block_height_ = 4 * num_4x4_blocks_high_lookup[GET_PARAM(0)]; + bit_depth_ = 
static_cast<vpx_bit_depth_t>(GET_PARAM(1)); + func_ = GET_PARAM(2); + ref_func_ = GET_PARAM(3); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + + constexpr size_t kMaxWidth = 128; + constexpr size_t kMaxBlockSize = kMaxWidth * kMaxWidth; + src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>( + vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t)))); + ASSERT_NE(src_, nullptr); + pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>( + vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t)))); + ASSERT_NE(pred_, nullptr); + diff_ = reinterpret_cast<int16_t *>( + vpx_memalign(16, kMaxBlockSize * sizeof(int16_t))); + ASSERT_NE(diff_, nullptr); + } + + virtual void TearDown() { + vpx_free(CONVERT_TO_SHORTPTR(src_)); + vpx_free(CONVERT_TO_SHORTPTR(pred_)); + vpx_free(diff_); + } + + protected: + void CheckResult(); + void RunForSpeed(); + + private: + ACMRandom rnd_; + int block_height_; + int block_width_; + vpx_bit_depth_t bit_depth_; + HBDSubtractFunc func_; + HBDSubtractFunc ref_func_; + uint8_t *src_; + uint8_t *pred_; + int16_t *diff_; +}; + +void VPXHBDSubtractBlockTest::CheckResult() { + constexpr int kTestNum = 100; + constexpr int kMaxWidth = 128; + constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth; + const int mask = (1 << bit_depth_) - 1; + for (int i = 0; i < kTestNum; ++i) { + for (int j = 0; j < kMaxBlockSize; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + func_(block_height_, block_width_, diff_, block_width_, src_, block_width_, + pred_, block_width_, bit_depth_); + + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_; ++c) { + EXPECT_EQ(diff_[r * block_width_ + c], + (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] - + CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c])) + << "r = " << r << ", c = " << c << ", test: " << i; + } + } + } +} + +TEST_P(VPXHBDSubtractBlockTest, CheckResult) { CheckResult(); } + +void VPXHBDSubtractBlockTest::RunForSpeed() 
{ + constexpr int kTestNum = 200000; + constexpr int kMaxWidth = 128; + constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth; + const int mask = (1 << bit_depth_) - 1; + + if (ref_func_ == func_) GTEST_SKIP(); + + for (int j = 0; j < kMaxBlockSize; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + vpx_usec_timer ref_timer; + vpx_usec_timer_start(&ref_timer); + for (int i = 0; i < kTestNum; ++i) { + ref_func_(block_height_, block_width_, diff_, block_width_, src_, + block_width_, pred_, block_width_, bit_depth_); + } + vpx_usec_timer_mark(&ref_timer); + const int64_t ref_elapsed_time = vpx_usec_timer_elapsed(&ref_timer); + + for (int j = 0; j < kMaxBlockSize; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < kTestNum; ++i) { + func_(block_height_, block_width_, diff_, block_width_, src_, block_width_, + pred_, block_width_, bit_depth_); + } + vpx_usec_timer_mark(&timer); + const int64_t elapsed_time = vpx_usec_timer_elapsed(&timer); + + printf( + "[%dx%d]: " + "ref_time=%6" PRId64 " \t simd_time=%6" PRId64 + " \t " + "gain=%f \n", + block_width_, block_height_, ref_elapsed_time, elapsed_time, + static_cast<double>(ref_elapsed_time) / + static_cast<double>(elapsed_time)); +} + +TEST_P(VPXHBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); } + +const BLOCK_SIZE kValidBlockSize[] = { BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, + BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, + BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, + BLOCK_64X64 }; + +INSTANTIATE_TEST_SUITE_P( + C, VPXHBDSubtractBlockTest, + ::testing::Combine(::testing::ValuesIn(kValidBlockSize), + ::testing::Values(12), + ::testing::Values(&vpx_highbd_subtract_block_c), + ::testing::Values(&vpx_highbd_subtract_block_c))); + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, 
VPXHBDSubtractBlockTest, + ::testing::Combine(::testing::ValuesIn(kValidBlockSize), + ::testing::Values(12), + ::testing::Values(&vpx_highbd_subtract_block_avx2), + ::testing::Values(&vpx_highbd_subtract_block_c))); +#endif // HAVE_AVX2 + +#endif // CONFIG_VP9_HIGHBITDEPTH } // namespace vp9 diff --git a/libvpx/third_party/googletest/README.libvpx b/libvpx/third_party/googletest/README.libvpx index b9a74922f..5f6b01b0e 100644 --- a/libvpx/third_party/googletest/README.libvpx +++ b/libvpx/third_party/googletest/README.libvpx @@ -1,5 +1,5 @@ URL: https://github.com/google/googletest.git -Version: release-1.11.0 +Version: release-1.12.1 License: BSD License File: LICENSE @@ -13,9 +13,17 @@ generation. Local Modifications: - Remove everything but: + .clang-format CONTRIBUTORS googletest/ include README.md src LICENSE +- Move .clang-format, CONTRIBUTORS, and LICENSE into googletest/ +- In googletest/include/gtest/internal/custom/gtest-port.h, define + GTEST_HAS_NOTIFICATION_ as 1 and use a stub Notification class to fix + the mingw32 g++ compilation errors caused by the lack of std::mutex + and std::condition_variable in the <mutex> and <condition_variable> + headers if mingw32 is configured with the win32 threads option. 
See + https://stackoverflow.com/questions/17242516/mingw-w64-threads-posix-vs-win32 diff --git a/libvpx/third_party/googletest/src/.clang-format b/libvpx/third_party/googletest/src/.clang-format new file mode 100644 index 000000000..5b9bfe6d2 --- /dev/null +++ b/libvpx/third_party/googletest/src/.clang-format @@ -0,0 +1,4 @@ +# Run manually to reformat a file: +# clang-format -i --style=file <file> +Language: Cpp +BasedOnStyle: Google diff --git a/libvpx/third_party/googletest/src/CONTRIBUTORS b/libvpx/third_party/googletest/src/CONTRIBUTORS index 76db0b40f..77397a5b5 100644 --- a/libvpx/third_party/googletest/src/CONTRIBUTORS +++ b/libvpx/third_party/googletest/src/CONTRIBUTORS @@ -34,6 +34,7 @@ Manuel Klimek <klimek@google.com> Mario Tanev <radix@google.com> Mark Paskin Markus Heule <markus.heule@gmail.com> +Martijn Vels <mvels@google.com> Matthew Simmons <simmonmt@acm.org> Mika Raento <mikie@iki.fi> Mike Bland <mbland@google.com> @@ -55,6 +56,7 @@ Russ Rufer <russ@pentad.com> Sean Mcafee <eefacm@gmail.com> Sigurður Ásgeirsson <siggi@google.com> Sverre Sundsdal <sundsdal@gmail.com> +Szymon Sobik <sobik.szymon@gmail.com> Takeshi Yoshino <tyoshino@google.com> Tracy Bialik <tracy@pentad.com> Vadim Berman <vadimb@google.com> diff --git a/libvpx/third_party/googletest/src/README.md b/libvpx/third_party/googletest/src/README.md index 1f8b349ae..d26b309ed 100644 --- a/libvpx/third_party/googletest/src/README.md +++ b/libvpx/third_party/googletest/src/README.md @@ -25,7 +25,7 @@ When building GoogleTest as a standalone project, the typical workflow starts with ``` -git clone https://github.com/google/googletest.git -b release-1.10.0 +git clone https://github.com/google/googletest.git -b release-1.11.0 cd googletest # Main directory of the cloned repository. mkdir build # Create a directory to hold the build output. cd build @@ -94,7 +94,7 @@ include(FetchContent) FetchContent_Declare( googletest # Specify the commit you depend on and update it regularly. 
- URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip + URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip ) # For Windows: Prevent overriding the parent project's compiler/linker settings set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) @@ -203,7 +203,9 @@ add -DGTEST_DONT_DEFINE_FOO=1 to the compiler flags to tell GoogleTest to change the macro's name from `FOO` -to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For +to `GTEST_FOO`. Currently `FOO` can be `ASSERT_EQ`, `ASSERT_FALSE`, `ASSERT_GE`, +`ASSERT_GT`, `ASSERT_LE`, `ASSERT_LT`, `ASSERT_NE`, `ASSERT_TRUE`, +`EXPECT_FALSE`, `EXPECT_TRUE`, `FAIL`, `SUCCEED`, `TEST`, or `TEST_F`. For example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write GTEST_TEST(SomeTest, DoesThis) { ... } diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h b/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h new file mode 100644 index 000000000..addbb59c6 --- /dev/null +++ b/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h @@ -0,0 +1,237 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file implements the AssertionResult type. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_ + +#include <memory> +#include <ostream> +#include <string> +#include <type_traits> + +#include "gtest/gtest-message.h" +#include "gtest/internal/gtest-port.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// A class for indicating whether an assertion was successful. When +// the assertion wasn't successful, the AssertionResult object +// remembers a non-empty message that describes how it failed. +// +// To create an instance of this class, use one of the factory functions +// (AssertionSuccess() and AssertionFailure()). +// +// This class is useful for two purposes: +// 1. Defining predicate functions to be used with Boolean test assertions +// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts +// 2. 
Defining predicate-format functions to be +// used with predicate assertions (ASSERT_PRED_FORMAT*, etc). +// +// For example, if you define IsEven predicate: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5))) +// will print the message +// +// Value of: IsEven(Fib(5)) +// Actual: false (5 is odd) +// Expected: true +// +// instead of a more opaque +// +// Value of: IsEven(Fib(5)) +// Actual: false +// Expected: true +// +// in case IsEven is a simple Boolean predicate. +// +// If you expect your predicate to be reused and want to support informative +// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up +// about half as often as positive ones in our tests), supply messages for +// both success and failure cases: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess() << n << " is even"; +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print +// +// Value of: IsEven(Fib(6)) +// Actual: true (8 is even) +// Expected: false +// +// NB: Predicates that support negative Boolean assertions have reduced +// performance in positive ones so be careful not to use them in tests +// that have lots (tens of thousands) of positive Boolean assertions. +// +// To use this class with EXPECT_PRED_FORMAT assertions such as: +// +// // Verifies that Foo() returns an even number. 
+// EXPECT_PRED_FORMAT1(IsEven, Foo()); +// +// you need to define: +// +// testing::AssertionResult IsEven(const char* expr, int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() +// << "Expected: " << expr << " is even\n Actual: it's " << n; +// } +// +// If Foo() returns 5, you will see the following message: +// +// Expected: Foo() is even +// Actual: it's 5 +// +class GTEST_API_ AssertionResult { + public: + // Copy constructor. + // Used in EXPECT_TRUE/FALSE(assertion_result). + AssertionResult(const AssertionResult& other); + +// C4800 is a level 3 warning in Visual Studio 2015 and earlier. +// This warning is not emitted in Visual Studio 2017. +// This warning is off by default starting in Visual Studio 2019 but can be +// enabled with command-line options. +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) +#endif + + // Used in the EXPECT_TRUE/FALSE(bool_expression). + // + // T must be contextually convertible to bool. + // + // The second parameter prevents this overload from being considered if + // the argument is implicitly convertible to AssertionResult. In that case + // we want AssertionResult's copy constructor to be used. + template <typename T> + explicit AssertionResult( + const T& success, + typename std::enable_if< + !std::is_convertible<T, AssertionResult>::value>::type* + /*enabler*/ + = nullptr) + : success_(success) {} + +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) + GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + + // Assignment operator. + AssertionResult& operator=(AssertionResult other) { + swap(other); + return *this; + } + + // Returns true if and only if the assertion succeeded. + operator bool() const { return success_; } // NOLINT + + // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. 
+ AssertionResult operator!() const; + + // Returns the text streamed into this AssertionResult. Test assertions + // use it when they fail (i.e., the predicate's outcome doesn't match the + // assertion's expectation). When nothing has been streamed into the + // object, returns an empty string. + const char* message() const { + return message_.get() != nullptr ? message_->c_str() : ""; + } + // Deprecated; please use message() instead. + const char* failure_message() const { return message(); } + + // Streams a custom failure message into this object. + template <typename T> + AssertionResult& operator<<(const T& value) { + AppendMessage(Message() << value); + return *this; + } + + // Allows streaming basic output manipulators such as endl or flush into + // this object. + AssertionResult& operator<<( + ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) { + AppendMessage(Message() << basic_manipulator); + return *this; + } + + private: + // Appends the contents of message to message_. + void AppendMessage(const Message& a_message) { + if (message_.get() == nullptr) message_.reset(new ::std::string); + message_->append(a_message.GetString().c_str()); + } + + // Swap the contents of this AssertionResult with other. + void swap(AssertionResult& other); + + // Stores result of the assertion predicate. + bool success_; + // Stores the message describing the condition in case the expectation + // construct is not satisfied with the predicate's outcome. + // Referenced via a pointer to avoid taking too much stack frame space + // with test assertions. + std::unique_ptr< ::std::string> message_; +}; + +// Makes a successful assertion result. +GTEST_API_ AssertionResult AssertionSuccess(); + +// Makes a failed assertion result. +GTEST_API_ AssertionResult AssertionFailure(); + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << msg. 
+GTEST_API_ AssertionResult AssertionFailure(const Message& msg); + +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_ diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h b/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h index 9b4d4d133..84e5a5bbd 100644 --- a/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h +++ b/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h @@ -27,21 +27,21 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// // The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the public API for death tests. It is // #included by gtest.h so a user doesn't need to include this // directly. -// GOOGLETEST_CM0001 DO NOT DELETE + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ #include "gtest/internal/gtest-death-test-internal.h" -namespace testing { - // This flag controls the style of death tests. Valid values are "threadsafe", // meaning that the death test child process will re-execute the test binary // from the start, running only a single death test, or "fast", @@ -49,6 +49,8 @@ namespace testing { // after forking. GTEST_DECLARE_string_(death_test_style); +namespace testing { + #if GTEST_HAS_DEATH_TEST namespace internal { @@ -103,7 +105,6 @@ GTEST_API_ bool InDeathTestChild(); // // On the regular expressions used in death tests: // -// GOOGLETEST_CM0005 DO NOT DELETE // On POSIX-compliant systems (*nix), we use the <regex.h> library, // which uses the POSIX extended regex syntax. 
// @@ -169,24 +170,24 @@ GTEST_API_ bool InDeathTestChild(); // Asserts that a given `statement` causes the program to exit, with an // integer exit status that satisfies `predicate`, and emitting error output // that matches `matcher`. -# define ASSERT_EXIT(statement, predicate, matcher) \ - GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_) +#define ASSERT_EXIT(statement, predicate, matcher) \ + GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_) // Like `ASSERT_EXIT`, but continues on to successive tests in the // test suite, if any: -# define EXPECT_EXIT(statement, predicate, matcher) \ - GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_) +#define EXPECT_EXIT(statement, predicate, matcher) \ + GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_) // Asserts that a given `statement` causes the program to exit, either by // explicitly exiting with a nonzero exit code or being killed by a // signal, and emitting error output that matches `matcher`. 
-# define ASSERT_DEATH(statement, matcher) \ - ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) +#define ASSERT_DEATH(statement, matcher) \ + ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) // Like `ASSERT_DEATH`, but continues on to successive tests in the // test suite, if any: -# define EXPECT_DEATH(statement, matcher) \ - EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) +#define EXPECT_DEATH(statement, matcher) \ + EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) // Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: @@ -197,22 +198,23 @@ class GTEST_API_ ExitedWithCode { ExitedWithCode(const ExitedWithCode&) = default; void operator=(const ExitedWithCode& other) = delete; bool operator()(int exit_status) const; + private: const int exit_code_; }; -# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Tests that an exit code describes an exit due to termination by a // given signal. -// GOOGLETEST_CM0006 DO NOT DELETE class GTEST_API_ KilledBySignal { public: explicit KilledBySignal(int signum); bool operator()(int exit_status) const; + private: const int signum_; }; -# endif // !GTEST_OS_WINDOWS +#endif // !GTEST_OS_WINDOWS // EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. 
// The death testing framework causes this to have interesting semantics, @@ -257,23 +259,21 @@ class GTEST_API_ KilledBySignal { // EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); // }, "death"); // -# ifdef NDEBUG +#ifdef NDEBUG -# define EXPECT_DEBUG_DEATH(statement, regex) \ +#define EXPECT_DEBUG_DEATH(statement, regex) \ GTEST_EXECUTE_STATEMENT_(statement, regex) -# define ASSERT_DEBUG_DEATH(statement, regex) \ +#define ASSERT_DEBUG_DEATH(statement, regex) \ GTEST_EXECUTE_STATEMENT_(statement, regex) -# else +#else -# define EXPECT_DEBUG_DEATH(statement, regex) \ - EXPECT_DEATH(statement, regex) +#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex) -# define ASSERT_DEBUG_DEATH(statement, regex) \ - ASSERT_DEATH(statement, regex) +#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex) -# endif // NDEBUG for EXPECT_DEBUG_DEATH +#endif // NDEBUG for EXPECT_DEBUG_DEATH #endif // GTEST_HAS_DEATH_TEST // This macro is used for implementing macros such as @@ -311,18 +311,17 @@ class GTEST_API_ KilledBySignal { // statement unconditionally returns or throws. The Message constructor at // the end allows the syntax of streaming additional messages into the // macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. 
-# define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - GTEST_LOG_(WARNING) \ - << "Death tests are not supported on this platform.\n" \ - << "Statement '" #statement "' cannot be verified."; \ - } else if (::testing::internal::AlwaysFalse()) { \ - ::testing::internal::RE::PartialMatch(".*", (regex)); \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - terminator; \ - } else \ - ::testing::Message() +#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \ + << "Statement '" #statement "' cannot be verified."; \ + } else if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::RE::PartialMatch(".*", (regex)); \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + terminator; \ + } else \ + ::testing::Message() // EXPECT_DEATH_IF_SUPPORTED(statement, regex) and // ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if @@ -330,15 +329,15 @@ class GTEST_API_ KilledBySignal { // useful when you are combining death test assertions with normal test // assertions in one test. 
#if GTEST_HAS_DEATH_TEST -# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - EXPECT_DEATH(statement, regex) -# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - ASSERT_DEATH(statement, regex) +#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + EXPECT_DEATH(statement, regex) +#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + ASSERT_DEATH(statement, regex) #else -# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, ) -# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return) +#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, ) +#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return) #endif } // namespace testing diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h b/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h index 9fa34a05b..bffa00c53 100644 --- a/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h +++ b/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h @@ -32,6 +32,10 @@ // This file implements just enough of the matcher interface to allow // EXPECT_DEATH and friends to accept a matcher argument. 
+// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ @@ -98,11 +102,11 @@ class MatchResultListener { private: ::std::ostream* const stream_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(MatchResultListener); + MatchResultListener(const MatchResultListener&) = delete; + MatchResultListener& operator=(const MatchResultListener&) = delete; }; -inline MatchResultListener::~MatchResultListener() { -} +inline MatchResultListener::~MatchResultListener() {} // An instance of a subclass of this knows how to describe itself as a // matcher. @@ -176,27 +180,39 @@ namespace internal { struct AnyEq { template <typename A, typename B> - bool operator()(const A& a, const B& b) const { return a == b; } + bool operator()(const A& a, const B& b) const { + return a == b; + } }; struct AnyNe { template <typename A, typename B> - bool operator()(const A& a, const B& b) const { return a != b; } + bool operator()(const A& a, const B& b) const { + return a != b; + } }; struct AnyLt { template <typename A, typename B> - bool operator()(const A& a, const B& b) const { return a < b; } + bool operator()(const A& a, const B& b) const { + return a < b; + } }; struct AnyGt { template <typename A, typename B> - bool operator()(const A& a, const B& b) const { return a > b; } + bool operator()(const A& a, const B& b) const { + return a > b; + } }; struct AnyLe { template <typename A, typename B> - bool operator()(const A& a, const B& b) const { return a <= b; } + bool operator()(const A& a, const B& b) const { + return a <= b; + } }; struct AnyGe { template <typename A, typename B> - bool operator()(const A& a, const B& b) const { return a >= b; } + bool operator()(const A& a, const B& b) const { + return a >= b; + } }; // A match result listener that ignores the explanation. 
@@ -205,7 +221,8 @@ class DummyMatchResultListener : public MatchResultListener { DummyMatchResultListener() : MatchResultListener(nullptr) {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(DummyMatchResultListener); + DummyMatchResultListener(const DummyMatchResultListener&) = delete; + DummyMatchResultListener& operator=(const DummyMatchResultListener&) = delete; }; // A match result listener that forwards the explanation to a given @@ -217,7 +234,9 @@ class StreamMatchResultListener : public MatchResultListener { : MatchResultListener(os) {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener); + StreamMatchResultListener(const StreamMatchResultListener&) = delete; + StreamMatchResultListener& operator=(const StreamMatchResultListener&) = + delete; }; struct SharedPayloadBase { @@ -284,17 +303,18 @@ class MatcherBase : private MatcherDescriberInterface { } protected: - MatcherBase() : vtable_(nullptr) {} + MatcherBase() : vtable_(nullptr), buffer_() {} // Constructs a matcher from its implementation. template <typename U> - explicit MatcherBase(const MatcherInterface<U>* impl) { + explicit MatcherBase(const MatcherInterface<U>* impl) + : vtable_(nullptr), buffer_() { Init(impl); } template <typename M, typename = typename std::remove_reference< M>::type::is_gtest_matcher> - MatcherBase(M&& m) { // NOLINT + MatcherBase(M&& m) : vtable_(nullptr), buffer_() { // NOLINT Init(std::forward<M>(m)); } @@ -420,8 +440,8 @@ class MatcherBase : private MatcherDescriberInterface { static const M& Get(const MatcherBase& m) { // When inlined along with Init, need to be explicit to avoid violating // strict aliasing rules. 
- const M *ptr = static_cast<const M*>( - static_cast<const void*>(&m.buffer_)); + const M* ptr = + static_cast<const M*>(static_cast<const void*>(&m.buffer_)); return *ptr; } static void Init(MatcherBase& m, M impl) { @@ -741,7 +761,7 @@ template <typename Rhs> class EqMatcher : public ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq> { public: explicit EqMatcher(const Rhs& rhs) - : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) { } + : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) {} static const char* Desc() { return "is equal to"; } static const char* NegatedDesc() { return "isn't equal to"; } }; @@ -749,7 +769,7 @@ template <typename Rhs> class NeMatcher : public ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe> { public: explicit NeMatcher(const Rhs& rhs) - : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) { } + : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) {} static const char* Desc() { return "isn't equal to"; } static const char* NegatedDesc() { return "is equal to"; } }; @@ -757,7 +777,7 @@ template <typename Rhs> class LtMatcher : public ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt> { public: explicit LtMatcher(const Rhs& rhs) - : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) { } + : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) {} static const char* Desc() { return "is <"; } static const char* NegatedDesc() { return "isn't <"; } }; @@ -765,7 +785,7 @@ template <typename Rhs> class GtMatcher : public ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt> { public: explicit GtMatcher(const Rhs& rhs) - : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) { } + : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) {} static const char* Desc() { return "is >"; } static const char* NegatedDesc() { return "isn't >"; } }; @@ -773,7 +793,7 @@ template <typename Rhs> class LeMatcher : public ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe> { public: explicit LeMatcher(const Rhs& rhs) - : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) { } + : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) {} 
static const char* Desc() { return "is <="; } static const char* NegatedDesc() { return "isn't <="; } }; @@ -781,7 +801,7 @@ template <typename Rhs> class GeMatcher : public ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe> { public: explicit GeMatcher(const Rhs& rhs) - : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) { } + : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) {} static const char* Desc() { return "is >="; } static const char* NegatedDesc() { return "isn't >="; } }; @@ -872,12 +892,16 @@ PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex( // Note: if the parameter of Eq() were declared as const T&, Eq("foo") // wouldn't compile. template <typename T> -inline internal::EqMatcher<T> Eq(T x) { return internal::EqMatcher<T>(x); } +inline internal::EqMatcher<T> Eq(T x) { + return internal::EqMatcher<T>(x); +} // Constructs a Matcher<T> from a 'value' of type T. The constructed // matcher matches any value that's equal to 'value'. template <typename T> -Matcher<T>::Matcher(T value) { *this = Eq(value); } +Matcher<T>::Matcher(T value) { + *this = Eq(value); +} // Creates a monomorphic matcher that matches anything with type Lhs // and equal to rhs. A user may need to use this instead of Eq(...) @@ -892,7 +916,9 @@ Matcher<T>::Matcher(T value) { *this = Eq(value); } // can always write Matcher<T>(Lt(5)) to be explicit about the type, // for example. template <typename Lhs, typename Rhs> -inline Matcher<Lhs> TypedEq(const Rhs& rhs) { return Eq(rhs); } +inline Matcher<Lhs> TypedEq(const Rhs& rhs) { + return Eq(rhs); +} // Creates a polymorphic matcher that matches anything >= x. 
template <typename Rhs> diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-message.h b/libvpx/third_party/googletest/src/include/gtest/gtest-message.h index becfd49fc..6c8bf9000 100644 --- a/libvpx/third_party/googletest/src/include/gtest/gtest-message.h +++ b/libvpx/third_party/googletest/src/include/gtest/gtest-message.h @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// // The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the Message class. @@ -42,7 +41,9 @@ // to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user // program! -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ @@ -110,8 +111,8 @@ class GTEST_API_ Message { // Streams a non-pointer value to this object. template <typename T> - inline Message& operator <<(const T& val) { - // Some libraries overload << for STL containers. These + inline Message& operator<<(const T& val) { + // Some libraries overload << for STL containers. These // overloads are defined in the global namespace instead of ::std. // // C++'s symbol lookup rule (i.e. Koenig lookup) says that these @@ -125,7 +126,7 @@ class GTEST_API_ Message { // from the global namespace. With this using declaration, // overloads of << defined in the global namespace and those // visible via Koenig lookup are both exposed in this function. - using ::operator <<; + using ::operator<<; *ss_ << val; return *this; } @@ -144,7 +145,7 @@ class GTEST_API_ Message { // ensure consistent result across compilers, we always treat NULL // as "(null)". 
template <typename T> - inline Message& operator <<(T* const& pointer) { // NOLINT + inline Message& operator<<(T* const& pointer) { // NOLINT if (pointer == nullptr) { *ss_ << "(null)"; } else { @@ -159,25 +160,23 @@ class GTEST_API_ Message { // templatized version above. Without this definition, streaming // endl or other basic IO manipulators to Message will confuse the // compiler. - Message& operator <<(BasicNarrowIoManip val) { + Message& operator<<(BasicNarrowIoManip val) { *ss_ << val; return *this; } // Instead of 1/0, we want to see true/false for bool values. - Message& operator <<(bool b) { - return *this << (b ? "true" : "false"); - } + Message& operator<<(bool b) { return *this << (b ? "true" : "false"); } // These two overloads allow streaming a wide C string to a Message // using the UTF-8 encoding. - Message& operator <<(const wchar_t* wide_c_str); - Message& operator <<(wchar_t* wide_c_str); + Message& operator<<(const wchar_t* wide_c_str); + Message& operator<<(wchar_t* wide_c_str); #if GTEST_HAS_STD_WSTRING // Converts the given wide string to a narrow string using the UTF-8 // encoding, and streams the result to this Message object. - Message& operator <<(const ::std::wstring& wstr); + Message& operator<<(const ::std::wstring& wstr); #endif // GTEST_HAS_STD_WSTRING // Gets the text streamed to this object so far as an std::string. @@ -196,7 +195,7 @@ class GTEST_API_ Message { }; // Streams a Message to an ostream. 
-inline std::ostream& operator <<(std::ostream& os, const Message& sb) { +inline std::ostream& operator<<(std::ostream& os, const Message& sb) { return os << sb.GetString(); } diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h b/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h index 804e70281..b55119ac6 100644 --- a/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h +++ b/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h @@ -26,11 +26,14 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // Macros and functions for implementing parameterized tests // in Google C++ Testing and Mocking Framework (Google Test) -// -// GOOGLETEST_CM0001 DO NOT DELETE + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ @@ -353,9 +356,7 @@ internal::ValueArray<T...> Values(T... v) { // } // INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool()); // -inline internal::ParamGenerator<bool> Bool() { - return Values(false, true); -} +inline internal::ParamGenerator<bool> Bool() { return Values(false, true); } // Combine() allows the user to combine two or more sequences to produce // values of a Cartesian product of those sequences' elements. @@ -428,8 +429,11 @@ internal::CartesianProductHolder<Generator...> Combine(const Generator&... 
g) { return 0; \ } \ static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ - test_name)); \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &) = delete; /* NOLINT */ \ }; \ int GTEST_TEST_CLASS_NAME_(test_suite_name, \ test_name)::gtest_registering_dummy_ = \ @@ -453,43 +457,42 @@ internal::CartesianProductHolder<Generator...> Combine(const Generator&... g) { #define GTEST_GET_FIRST_(first, ...) first #define GTEST_GET_SECOND_(first, second, ...) second -#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \ - static ::testing::internal::ParamGenerator<test_suite_name::ParamType> \ - gtest_##prefix##test_suite_name##_EvalGenerator_() { \ - return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \ - } \ - static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \ - const ::testing::TestParamInfo<test_suite_name::ParamType>& info) { \ - if (::testing::internal::AlwaysFalse()) { \ - ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \ - __VA_ARGS__, \ - ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \ - DUMMY_PARAM_))); \ - auto t = std::make_tuple(__VA_ARGS__); \ - static_assert(std::tuple_size<decltype(t)>::value <= 2, \ - "Too Many Args!"); \ - } \ - return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \ - __VA_ARGS__, \ - ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \ - DUMMY_PARAM_))))(info); \ - } \ - static int gtest_##prefix##test_suite_name##_dummy_ \ - GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::UnitTest::GetInstance() \ - ->parameterized_test_registry() \ - .GetTestSuitePatternHolder<test_suite_name>( \ - GTEST_STRINGIFY_(test_suite_name), \ - 
::testing::internal::CodeLocation(__FILE__, __LINE__)) \ - ->AddTestSuiteInstantiation( \ - GTEST_STRINGIFY_(prefix), \ - &gtest_##prefix##test_suite_name##_EvalGenerator_, \ - &gtest_##prefix##test_suite_name##_EvalGenerateName_, \ +#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \ + static ::testing::internal::ParamGenerator<test_suite_name::ParamType> \ + gtest_##prefix##test_suite_name##_EvalGenerator_() { \ + return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \ + } \ + static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \ + const ::testing::TestParamInfo<test_suite_name::ParamType>& info) { \ + if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \ + DUMMY_PARAM_))); \ + auto t = std::make_tuple(__VA_ARGS__); \ + static_assert(std::tuple_size<decltype(t)>::value <= 2, \ + "Too Many Args!"); \ + } \ + return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \ + DUMMY_PARAM_))))(info); \ + } \ + static int gtest_##prefix##test_suite_name##_dummy_ \ + GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::UnitTest::GetInstance() \ + ->parameterized_test_registry() \ + .GetTestSuitePatternHolder<test_suite_name>( \ + GTEST_STRINGIFY_(test_suite_name), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ + ->AddTestSuiteInstantiation( \ + GTEST_STRINGIFY_(prefix), \ + &gtest_##prefix##test_suite_name##_EvalGenerator_, \ + &gtest_##prefix##test_suite_name##_EvalGenerateName_, \ __FILE__, __LINE__) - // Allow Marking a Parameterized test class as not needing to be instantiated.
-#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \ +#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \ namespace gtest_do_not_use_outside_namespace_scope {} \ static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \ GTEST_STRINGIFY_(T)) diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h b/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h index 076c9de1f..a91e8b8b1 100644 --- a/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h +++ b/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - // Google Test - The Google C++ Testing and Mocking Framework // // This file implements a universal value printer that can print a @@ -95,7 +94,9 @@ // being defined as many user-defined container types don't have // value_type. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ @@ -257,12 +258,10 @@ struct ConvertibleToStringViewPrinter { #endif }; - // Prints the given number of bytes in the given object to the given // ostream. GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, - size_t count, - ::std::ostream* os); + size_t count, ::std::ostream* os); struct RawBytesPrinter { // SFINAE on `sizeof` to make sure we have a complete type. 
template <typename T, size_t = sizeof(T)> @@ -360,7 +359,7 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); -#ifdef __cpp_char8_t +#ifdef __cpp_lib_char8_t GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t); GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t); #endif @@ -375,12 +374,12 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t); // to point to a NUL-terminated string, and thus can print it as a string. #define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \ - template <> \ - class FormatForComparison<CharType*, OtherStringType> { \ - public: \ - static ::std::string Format(CharType* value) { \ - return ::testing::PrintToString(value); \ - } \ + template <> \ + class FormatForComparison<CharType*, OtherStringType> { \ + public: \ + static ::std::string Format(CharType* value) { \ + return ::testing::PrintToString(value); \ + } \ } GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); @@ -410,8 +409,8 @@ GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring); // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. template <typename T1, typename T2> -std::string FormatForComparisonFailureMessage( - const T1& value, const T2& /* other_operand */) { +std::string FormatForComparisonFailureMessage(const T1& value, + const T2& /* other_operand */) { return FormatForComparison<T1, T2>::Format(value); } @@ -479,6 +478,12 @@ inline void PrintTo(char8_t c, ::std::ostream* os) { } #endif +// gcc/clang __{u,}int128_t +#if defined(__SIZEOF_INT128__) +GTEST_API_ void PrintTo(__uint128_t v, ::std::ostream* os); +GTEST_API_ void PrintTo(__int128_t v, ::std::ostream* os); +#endif // __SIZEOF_INT128__ + // Overloads for C strings. 
GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); inline void PrintTo(char* s, ::std::ostream* os) { @@ -545,7 +550,7 @@ void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) { } // Overloads for ::std::string. -GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os); +GTEST_API_ void PrintStringTo(const ::std::string& s, ::std::ostream* os); inline void PrintTo(const ::std::string& s, ::std::ostream* os) { PrintStringTo(s, os); } @@ -572,7 +577,7 @@ inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) { // Overloads for ::std::wstring. #if GTEST_HAS_STD_WSTRING -GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os); +GTEST_API_ void PrintWideStringTo(const ::std::wstring& s, ::std::ostream* os); inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { PrintWideStringTo(s, os); } @@ -587,6 +592,12 @@ inline void PrintTo(internal::StringView sp, ::std::ostream* os) { inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; } +#if GTEST_HAS_RTTI +inline void PrintTo(const std::type_info& info, std::ostream* os) { + *os << internal::GetTypeName(info); +} +#endif // GTEST_HAS_RTTI + template <typename T> void PrintTo(std::reference_wrapper<T> ref, ::std::ostream* os) { UniversalPrinter<T&>::Print(ref.get(), os); @@ -744,6 +755,14 @@ class UniversalPrinter<Optional<T>> { } }; +template <> +class UniversalPrinter<decltype(Nullopt())> { + public: + static void Print(decltype(Nullopt()), ::std::ostream* os) { + *os << "(nullopt)"; + } +}; + #endif // GTEST_INTERNAL_HAS_OPTIONAL #if GTEST_INTERNAL_HAS_VARIANT @@ -802,8 +821,8 @@ void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { } } // This overload prints a (const) char array compactly. 
-GTEST_API_ void UniversalPrintArray( - const char* begin, size_t len, ::std::ostream* os); +GTEST_API_ void UniversalPrintArray(const char* begin, size_t len, + ::std::ostream* os); #ifdef __cpp_char8_t // This overload prints a (const) char8_t array compactly. @@ -820,8 +839,8 @@ GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len, ::std::ostream* os); // This overload prints a (const) wchar_t array compactly. -GTEST_API_ void UniversalPrintArray( - const wchar_t* begin, size_t len, ::std::ostream* os); +GTEST_API_ void UniversalPrintArray(const wchar_t* begin, size_t len, + ::std::ostream* os); // Implements printing an array type T[N]. template <typename T, size_t N> @@ -980,10 +999,10 @@ void UniversalPrint(const T& value, ::std::ostream* os) { UniversalPrinter<T1>::Print(value, os); } -typedef ::std::vector< ::std::string> Strings; +typedef ::std::vector<::std::string> Strings; - // Tersely prints the first N fields of a tuple to a string vector, - // one element for each field. +// Tersely prints the first N fields of a tuple to a string vector, +// one element for each field. template <typename Tuple> void TersePrintPrefixToStrings(const Tuple&, std::integral_constant<size_t, 0>, Strings*) {} diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h b/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h index eacef4466..bec8c4810 100644 --- a/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h +++ b/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h @@ -27,12 +27,9 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// // Utilities for testing Google Test itself and code that uses Google Test // (e.g. frameworks built on top of Google Test). 
-// GOOGLETEST_CM0004 DO NOT DELETE - #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ @@ -88,7 +85,10 @@ class GTEST_API_ ScopedFakeTestPartResultReporter TestPartResultReporterInterface* old_reporter_; TestPartResultArray* const result_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter); + ScopedFakeTestPartResultReporter(const ScopedFakeTestPartResultReporter&) = + delete; + ScopedFakeTestPartResultReporter& operator=( + const ScopedFakeTestPartResultReporter&) = delete; }; namespace internal { @@ -104,12 +104,14 @@ class GTEST_API_ SingleFailureChecker { SingleFailureChecker(const TestPartResultArray* results, TestPartResult::Type type, const std::string& substr); ~SingleFailureChecker(); + private: const TestPartResultArray* const results_; const TestPartResult::Type type_; const std::string substr_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); + SingleFailureChecker(const SingleFailureChecker&) = delete; + SingleFailureChecker& operator=(const SingleFailureChecker&) = delete; }; } // namespace internal @@ -119,7 +121,8 @@ class GTEST_API_ SingleFailureChecker { GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 // A set of macros for testing Google Test assertions or code that's expected -// to generate Google Test fatal failures. It verifies that the given +// to generate Google Test fatal failures (e.g. a failure from an ASSERT_EQ, but +// not a non-fatal failure, as from EXPECT_EQ). It verifies that the given // statement will cause exactly one fatal Google Test failure with 'substr' // being part of the failure message. // @@ -141,44 +144,46 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 // helper macro, due to some peculiarity in how the preprocessor // works. The AcceptsMacroThatExpandsToUnprotectedComma test in // gtest_unittest.cc will fail to compile if we do that. 
-#define EXPECT_FATAL_FAILURE(statement, substr) \ - do { \ - class GTestExpectFatalFailureHelper {\ - public:\ - static void Execute() { statement; }\ - };\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ - GTestExpectFatalFailureHelper::Execute();\ - }\ +#define EXPECT_FATAL_FAILURE(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper { \ + public: \ + static void Execute() { statement; } \ + }; \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, \ + >est_failures); \ + GTestExpectFatalFailureHelper::Execute(); \ + } \ } while (::testing::internal::AlwaysFalse()) -#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ - do { \ - class GTestExpectFatalFailureHelper {\ - public:\ - static void Execute() { statement; }\ - };\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ALL_THREADS, >est_failures);\ - GTestExpectFatalFailureHelper::Execute();\ - }\ +#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper { \ + public: \ + static void Execute() { statement; } \ + }; \ + ::testing::TestPartResultArray gtest_failures; \ + 
::testing::internal::SingleFailureChecker gtest_checker( \ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ + ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \ + >est_failures); \ + GTestExpectFatalFailureHelper::Execute(); \ + } \ } while (::testing::internal::AlwaysFalse()) // A macro for testing Google Test assertions or code that's expected to -// generate Google Test non-fatal failures. It asserts that the given -// statement will cause exactly one non-fatal Google Test failure with 'substr' -// being part of the failure message. +// generate Google Test non-fatal failures (e.g. a failure from an EXPECT_EQ, +// but not from an ASSERT_EQ). It asserts that the given statement will cause +// exactly one non-fatal Google Test failure with 'substr' being part of the +// failure message. // // There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only // affects and considers failures generated in the current thread and @@ -207,32 +212,37 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 // instead of // GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) // to avoid an MSVC warning on unreachable code. 
-#define EXPECT_NONFATAL_FAILURE(statement, substr) \ - do {\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ +#define EXPECT_NONFATAL_FAILURE(statement, substr) \ + do { \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ - (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ - if (::testing::internal::AlwaysTrue()) { statement; }\ - }\ + (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, \ + >est_failures); \ + if (::testing::internal::AlwaysTrue()) { \ + statement; \ + } \ + } \ } while (::testing::internal::AlwaysFalse()) -#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ - do {\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ - (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ +#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do { \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ + >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ + (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \ - >est_failures);\ - if (::testing::internal::AlwaysTrue()) { statement; }\ - }\ + >est_failures); \ + if (::testing::internal::AlwaysTrue()) { \ + statement; \ + } \ + } \ } while (::testing::internal::AlwaysFalse()) #endif // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ diff --git 
a/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h b/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h index 203fdf98c..09cc8c34f 100644 --- a/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h +++ b/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h @@ -26,14 +26,17 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// GOOGLETEST_CM0001 DO NOT DELETE + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ #include <iosfwd> #include <vector> + #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-string.h" @@ -142,7 +145,8 @@ class GTEST_API_ TestPartResultArray { private: std::vector<TestPartResult> array_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray); + TestPartResultArray(const TestPartResultArray&) = delete; + TestPartResultArray& operator=(const TestPartResultArray&) = delete; }; // This interface knows how to report a test part result. 
@@ -168,11 +172,13 @@ class GTEST_API_ HasNewFatalFailureHelper ~HasNewFatalFailureHelper() override; void ReportTestPartResult(const TestPartResult& result) override; bool has_new_fatal_failure() const { return has_new_fatal_failure_; } + private: bool has_new_fatal_failure_; TestPartResultReporterInterface* original_reporter_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper); + HasNewFatalFailureHelper(const HasNewFatalFailureHelper&) = delete; + HasNewFatalFailureHelper& operator=(const HasNewFatalFailureHelper&) = delete; }; } // namespace internal diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h b/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h index 9fdc6be10..bd35a3266 100644 --- a/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h +++ b/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h @@ -27,7 +27,9 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ @@ -190,7 +192,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); typedef ::testing::internal::GenerateTypeList<Types>::type \ GTEST_TYPE_PARAMS_(CaseName); \ typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \ - GTEST_NAME_GENERATOR_(CaseName) + GTEST_NAME_GENERATOR_(CaseName) #define TYPED_TEST(CaseName, TestName) \ static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1, \ @@ -256,7 +258,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); // #included in multiple translation units linked together. 
#define TYPED_TEST_SUITE_P(SuiteName) \ static ::testing::internal::TypedTestSuitePState \ - GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName) + GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName) // Legacy API is deprecated but still available #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ @@ -301,21 +303,21 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); REGISTER_TYPED_TEST_SUITE_P #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ -#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \ - static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \ - "test-suit-prefix must not be empty"); \ - static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTestSuite< \ - SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \ - ::testing::internal::GenerateTypeList<Types>::type>:: \ - Register(GTEST_STRINGIFY_(Prefix), \ - ::testing::internal::CodeLocation(__FILE__, __LINE__), \ - >EST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \ - GTEST_STRINGIFY_(SuiteName), \ - GTEST_REGISTERED_TEST_NAMES_(SuiteName), \ - ::testing::internal::GenerateNames< \ - ::testing::internal::NameGeneratorSelector< \ - __VA_ARGS__>::type, \ +#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) 
\ + static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \ + "test-suit-prefix must not be empty"); \ + static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTestSuite< \ + SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \ + ::testing::internal::GenerateTypeList<Types>::type>:: \ + Register(GTEST_STRINGIFY_(Prefix), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), \ + >EST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \ + GTEST_STRINGIFY_(SuiteName), \ + GTEST_REGISTERED_TEST_NAMES_(SuiteName), \ + ::testing::internal::GenerateNames< \ + ::testing::internal::NameGeneratorSelector< \ + __VA_ARGS__>::type, \ ::testing::internal::GenerateTypeList<Types>::type>()) // Legacy API is deprecated but still available diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest.h b/libvpx/third_party/googletest/src/include/gtest/gtest.h index 7a5d057c4..d19a587a1 100644 --- a/libvpx/third_party/googletest/src/include/gtest/gtest.h +++ b/libvpx/third_party/googletest/src/include/gtest/gtest.h @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// // The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the public API for Google Test. It should be @@ -47,8 +46,6 @@ // registration from Barthelemy Dagenais' (barthelemy@prologique.com) // easyUnit framework. 
-// GOOGLETEST_CM0001 DO NOT DELETE - #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_H_ @@ -59,31 +56,22 @@ #include <type_traits> #include <vector> -#include "gtest/internal/gtest-internal.h" -#include "gtest/internal/gtest-string.h" +#include "gtest/gtest-assertion-result.h" #include "gtest/gtest-death-test.h" #include "gtest/gtest-matchers.h" #include "gtest/gtest-message.h" #include "gtest/gtest-param-test.h" #include "gtest/gtest-printers.h" -#include "gtest/gtest_prod.h" #include "gtest/gtest-test-part.h" #include "gtest/gtest-typed-test.h" +#include "gtest/gtest_pred_impl.h" +#include "gtest/gtest_prod.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ /* class A needs to have dll-interface to be used by clients of class B */) -namespace testing { - -// Silence C4100 (unreferenced formal parameter) and 4805 -// unsafe mix of type 'const int' and type 'const bool' -#ifdef _MSC_VER -# pragma warning(push) -# pragma warning(disable:4805) -# pragma warning(disable:4100) -#endif - - // Declares the flags. // This flag temporary enables the disabled tests. @@ -138,6 +126,12 @@ GTEST_DECLARE_int32_(random_seed); // is 1. If the value is -1 the tests are repeating forever. GTEST_DECLARE_int32_(repeat); +// This flag controls whether Google Test Environments are recreated for each +// repeat of the tests. The default value is true. If set to false the global +// test Environment objects are only set up once, for the first iteration, and +// only torn down once, for the last. +GTEST_DECLARE_bool_(recreate_environments_when_repeating); + // This flag controls whether Google Test includes Google Test internal // stack frames in failure stack traces. 
GTEST_DECLARE_bool_(show_internal_stack_frames); @@ -163,6 +157,16 @@ GTEST_DECLARE_string_(stream_result_to); GTEST_DECLARE_string_(flagfile); #endif // GTEST_USE_OWN_FLAGFILE_FLAG_ +namespace testing { + +// Silence C4100 (unreferenced formal parameter) and 4805 +// unsafe mix of type 'const int' and type 'const bool' +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4805) +#pragma warning(disable : 4100) +#endif + // The upper limit for valid stack trace depths. const int kMaxStackTraceDepth = 100; @@ -201,193 +205,6 @@ using TestCase = TestSuite; class TestInfo; class UnitTest; -// A class for indicating whether an assertion was successful. When -// the assertion wasn't successful, the AssertionResult object -// remembers a non-empty message that describes how it failed. -// -// To create an instance of this class, use one of the factory functions -// (AssertionSuccess() and AssertionFailure()). -// -// This class is useful for two purposes: -// 1. Defining predicate functions to be used with Boolean test assertions -// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts -// 2. Defining predicate-format functions to be -// used with predicate assertions (ASSERT_PRED_FORMAT*, etc). -// -// For example, if you define IsEven predicate: -// -// testing::AssertionResult IsEven(int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess(); -// else -// return testing::AssertionFailure() << n << " is odd"; -// } -// -// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5))) -// will print the message -// -// Value of: IsEven(Fib(5)) -// Actual: false (5 is odd) -// Expected: true -// -// instead of a more opaque -// -// Value of: IsEven(Fib(5)) -// Actual: false -// Expected: true -// -// in case IsEven is a simple Boolean predicate. 
-// -// If you expect your predicate to be reused and want to support informative -// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up -// about half as often as positive ones in our tests), supply messages for -// both success and failure cases: -// -// testing::AssertionResult IsEven(int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess() << n << " is even"; -// else -// return testing::AssertionFailure() << n << " is odd"; -// } -// -// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print -// -// Value of: IsEven(Fib(6)) -// Actual: true (8 is even) -// Expected: false -// -// NB: Predicates that support negative Boolean assertions have reduced -// performance in positive ones so be careful not to use them in tests -// that have lots (tens of thousands) of positive Boolean assertions. -// -// To use this class with EXPECT_PRED_FORMAT assertions such as: -// -// // Verifies that Foo() returns an even number. -// EXPECT_PRED_FORMAT1(IsEven, Foo()); -// -// you need to define: -// -// testing::AssertionResult IsEven(const char* expr, int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess(); -// else -// return testing::AssertionFailure() -// << "Expected: " << expr << " is even\n Actual: it's " << n; -// } -// -// If Foo() returns 5, you will see the following message: -// -// Expected: Foo() is even -// Actual: it's 5 -// -class GTEST_API_ AssertionResult { - public: - // Copy constructor. - // Used in EXPECT_TRUE/FALSE(assertion_result). - AssertionResult(const AssertionResult& other); - -// C4800 is a level 3 warning in Visual Studio 2015 and earlier. -// This warning is not emitted in Visual Studio 2017. -// This warning is off by default starting in Visual Studio 2019 but can be -// enabled with command-line options. 
-#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) - GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) -#endif - - // Used in the EXPECT_TRUE/FALSE(bool_expression). - // - // T must be contextually convertible to bool. - // - // The second parameter prevents this overload from being considered if - // the argument is implicitly convertible to AssertionResult. In that case - // we want AssertionResult's copy constructor to be used. - template <typename T> - explicit AssertionResult( - const T& success, - typename std::enable_if< - !std::is_convertible<T, AssertionResult>::value>::type* - /*enabler*/ - = nullptr) - : success_(success) {} - -#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) - GTEST_DISABLE_MSC_WARNINGS_POP_() -#endif - - // Assignment operator. - AssertionResult& operator=(AssertionResult other) { - swap(other); - return *this; - } - - // Returns true if and only if the assertion succeeded. - operator bool() const { return success_; } // NOLINT - - // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. - AssertionResult operator!() const; - - // Returns the text streamed into this AssertionResult. Test assertions - // use it when they fail (i.e., the predicate's outcome doesn't match the - // assertion's expectation). When nothing has been streamed into the - // object, returns an empty string. - const char* message() const { - return message_.get() != nullptr ? message_->c_str() : ""; - } - // Deprecated; please use message() instead. - const char* failure_message() const { return message(); } - - // Streams a custom failure message into this object. - template <typename T> AssertionResult& operator<<(const T& value) { - AppendMessage(Message() << value); - return *this; - } - - // Allows streaming basic output manipulators such as endl or flush into - // this object. 
- AssertionResult& operator<<( - ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) { - AppendMessage(Message() << basic_manipulator); - return *this; - } - - private: - // Appends the contents of message to message_. - void AppendMessage(const Message& a_message) { - if (message_.get() == nullptr) message_.reset(new ::std::string); - message_->append(a_message.GetString().c_str()); - } - - // Swap the contents of this AssertionResult with other. - void swap(AssertionResult& other); - - // Stores result of the assertion predicate. - bool success_; - // Stores the message describing the condition in case the expectation - // construct is not satisfied with the predicate's outcome. - // Referenced via a pointer to avoid taking too much stack frame space - // with test assertions. - std::unique_ptr< ::std::string> message_; -}; - -// Makes a successful assertion result. -GTEST_API_ AssertionResult AssertionSuccess(); - -// Makes a failed assertion result. -GTEST_API_ AssertionResult AssertionFailure(); - -// Makes a failed assertion result with the given failure message. -// Deprecated; use AssertionFailure() << msg. -GTEST_API_ AssertionResult AssertionFailure(const Message& msg); - -} // namespace testing - -// Includes the auto-generated header that implements a family of generic -// predicate assertion macros. This include comes late because it relies on -// APIs declared above. -#include "gtest/gtest_pred_impl.h" - -namespace testing { - // The abstract class that all tests inherit from. // // In Google Test, a unit test program contains one or many TestSuites, and @@ -522,7 +339,8 @@ class GTEST_API_ Test { virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; } // We disallow copying Tests. - GTEST_DISALLOW_COPY_AND_ASSIGN_(Test); + Test(const Test&) = delete; + Test& operator=(const Test&) = delete; }; typedef internal::TimeInMillis TimeInMillis; @@ -536,24 +354,17 @@ class TestProperty { // C'tor. 
TestProperty does NOT have a default constructor. // Always use this constructor (with parameters) to create a // TestProperty object. - TestProperty(const std::string& a_key, const std::string& a_value) : - key_(a_key), value_(a_value) { - } + TestProperty(const std::string& a_key, const std::string& a_value) + : key_(a_key), value_(a_value) {} // Gets the user supplied key. - const char* key() const { - return key_.c_str(); - } + const char* key() const { return key_.c_str(); } // Gets the user supplied value. - const char* value() const { - return value_.c_str(); - } + const char* value() const { return value_.c_str(); } // Sets a new value, overriding the one supplied in the constructor. - void SetValue(const std::string& new_value) { - value_ = new_value; - } + void SetValue(const std::string& new_value) { value_ = new_value; } private: // The key supplied by the user. @@ -687,7 +498,8 @@ class GTEST_API_ TestResult { TimeInMillis elapsed_time_; // We disallow copying TestResult. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult); + TestResult(const TestResult&) = delete; + TestResult& operator=(const TestResult&) = delete; }; // class TestResult // A TestInfo object stores the following information about a test: @@ -811,8 +623,8 @@ class GTEST_API_ TestInfo { } // These fields are immutable properties of the test. - const std::string test_suite_name_; // test suite name - const std::string name_; // Test name + const std::string test_suite_name_; // test suite name + const std::string name_; // Test name // Name of the parameter type, or NULL if this is not a typed or a // type-parameterized test. const std::unique_ptr<const ::std::string> type_param_; @@ -833,7 +645,8 @@ class GTEST_API_ TestInfo { // test for the second time. TestResult result_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo); + TestInfo(const TestInfo&) = delete; + TestInfo& operator=(const TestInfo&) = delete; }; // A test suite, which consists of a vector of TestInfos. 
@@ -941,7 +754,7 @@ class GTEST_API_ TestSuite { // Adds a TestInfo to this test suite. Will delete the TestInfo upon // destruction of the TestSuite object. - void AddTestInfo(TestInfo * test_info); + void AddTestInfo(TestInfo* test_info); // Clears the results of all tests in this test suite. void ClearResult(); @@ -1042,7 +855,8 @@ class GTEST_API_ TestSuite { TestResult ad_hoc_test_result_; // We disallow copying TestSuites. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestSuite); + TestSuite(const TestSuite&) = delete; + TestSuite& operator=(const TestSuite&) = delete; }; // An Environment object is capable of setting up and tearing down an @@ -1069,6 +883,7 @@ class Environment { // Override this to define how to tear down the environment. virtual void TearDown() {} + private: // If you see an error about overriding the following function or // about it being private, you have mis-spelled SetUp() as Setup(). @@ -1120,6 +935,9 @@ class TestEventListener { // Fired before the test starts. virtual void OnTestStart(const TestInfo& test_info) = 0; + // Fired when a test is disabled + virtual void OnTestDisabled(const TestInfo& /*test_info*/) {} + // Fired after a failed assertion or a SUCCEED() invocation. // If you want to throw an exception from this function to skip to the next // TEST, it must be AssertionException defined above, or inherited from it. @@ -1143,8 +961,7 @@ class TestEventListener { virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0; // Fired after each iteration of tests finishes. - virtual void OnTestIterationEnd(const UnitTest& unit_test, - int iteration) = 0; + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration) = 0; // Fired after all test activities have ended. 
virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0; @@ -1169,6 +986,7 @@ class EmptyTestEventListener : public TestEventListener { #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ void OnTestStart(const TestInfo& /*test_info*/) override {} + void OnTestDisabled(const TestInfo& /*test_info*/) override {} void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {} void OnTestEnd(const TestInfo& /*test_info*/) override {} void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {} @@ -1258,7 +1076,8 @@ class GTEST_API_ TestEventListeners { TestEventListener* default_xml_generator_; // We disallow copying TestEventListeners. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners); + TestEventListeners(const TestEventListeners&) = delete; + TestEventListeners& operator=(const TestEventListeners&) = delete; }; // A UnitTest consists of a vector of TestSuites. @@ -1301,8 +1120,7 @@ class GTEST_API_ UnitTest { // Returns the TestInfo object for the test that's currently running, // or NULL if no test is running. - const TestInfo* current_test_info() const - GTEST_LOCK_EXCLUDED_(mutex_); + const TestInfo* current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_); // Returns the random seed used at the start of the current test run. int random_seed() const; @@ -1408,8 +1226,7 @@ class GTEST_API_ UnitTest { // eventually call this to report their results. The user code // should use the assertion macros instead of calling this directly. 
void AddTestPartResult(TestPartResult::Type result_type, - const char* file_name, - int line_number, + const char* file_name, int line_number, const std::string& message, const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_); @@ -1440,8 +1257,7 @@ class GTEST_API_ UnitTest { friend std::set<std::string>* internal::GetIgnoredParameterizedTestSuites(); friend internal::UnitTestImpl* internal::GetUnitTestImpl(); friend void internal::ReportFailureInUnknownLocation( - TestPartResult::Type result_type, - const std::string& message); + TestPartResult::Type result_type, const std::string& message); // Creates an empty UnitTest. UnitTest(); @@ -1455,8 +1271,7 @@ class GTEST_API_ UnitTest { GTEST_LOCK_EXCLUDED_(mutex_); // Pops a trace from the per-thread Google Test trace stack. - void PopGTestTrace() - GTEST_LOCK_EXCLUDED_(mutex_); + void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_); // Protects mutable state in *impl_. This is mutable as some const // methods need to lock it too. @@ -1469,7 +1284,8 @@ class GTEST_API_ UnitTest { internal::UnitTestImpl* impl_; // We disallow copying UnitTest. - GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest); + UnitTest(const UnitTest&) = delete; + UnitTest& operator=(const UnitTest&) = delete; }; // A convenient wrapper for adding an environment for the test @@ -1520,13 +1336,11 @@ namespace internal { // when calling EXPECT_* in a tight loop. 
template <typename T1, typename T2> AssertionResult CmpHelperEQFailure(const char* lhs_expression, - const char* rhs_expression, - const T1& lhs, const T2& rhs) { - return EqFailure(lhs_expression, - rhs_expression, + const char* rhs_expression, const T1& lhs, + const T2& rhs) { + return EqFailure(lhs_expression, rhs_expression, FormatForComparisonFailureMessage(lhs, rhs), - FormatForComparisonFailureMessage(rhs, lhs), - false); + FormatForComparisonFailureMessage(rhs, lhs), false); } // This block of code defines operator==/!= @@ -1539,8 +1353,7 @@ inline bool operator!=(faketype, faketype) { return false; } // The helper function for {ASSERT|EXPECT}_EQ. template <typename T1, typename T2> AssertionResult CmpHelperEQ(const char* lhs_expression, - const char* rhs_expression, - const T1& lhs, + const char* rhs_expression, const T1& lhs, const T2& rhs) { if (lhs == rhs) { return AssertionSuccess(); @@ -1571,8 +1384,7 @@ class EqHelper { // Even though its body looks the same as the above version, we // cannot merge the two, as it will make anonymous enums unhappy. static AssertionResult Compare(const char* lhs_expression, - const char* rhs_expression, - BiggestInt lhs, + const char* rhs_expression, BiggestInt lhs, BiggestInt rhs) { return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); } @@ -1607,16 +1419,16 @@ AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2, // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. 
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\ -template <typename T1, typename T2>\ -AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ - const T1& val1, const T2& val2) {\ - if (val1 op val2) {\ - return AssertionSuccess();\ - } else {\ - return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\ - }\ -} +#define GTEST_IMPL_CMP_HELPER_(op_name, op) \ + template <typename T1, typename T2> \ + AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ + const T1& val1, const T2& val2) { \ + if (val1 op val2) { \ + return AssertionSuccess(); \ + } else { \ + return CmpHelperOpFailure(expr1, expr2, val1, val2, #op); \ + } \ + } // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. @@ -1638,49 +1450,42 @@ GTEST_IMPL_CMP_HELPER_(GT, >) // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, const char* s2_expression, - const char* s1, - const char* s2); + const char* s1, const char* s2); // The helper function for {ASSERT|EXPECT}_STRCASEEQ. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression, const char* s2_expression, - const char* s1, - const char* s2); + const char* s1, const char* s2); // The helper function for {ASSERT|EXPECT}_STRNE. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, const char* s2_expression, - const char* s1, - const char* s2); + const char* s1, const char* s2); // The helper function for {ASSERT|EXPECT}_STRCASENE. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression, const char* s2_expression, - const char* s1, - const char* s2); - + const char* s1, const char* s2); // Helper function for *_STREQ on wide strings. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. 
GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, const char* s2_expression, - const wchar_t* s1, - const wchar_t* s2); + const wchar_t* s1, const wchar_t* s2); // Helper function for *_STRNE on wide strings. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, const char* s2_expression, - const wchar_t* s1, - const wchar_t* s2); + const wchar_t* s1, const wchar_t* s2); } // namespace internal @@ -1692,32 +1497,40 @@ GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, // // The {needle,haystack}_expr arguments are the stringified // expressions that generated the two real arguments. -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack); -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack); -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const char* needle, + const char* haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const wchar_t* needle, + const wchar_t* haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const char* 
needle, + const char* haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const wchar_t* needle, + const wchar_t* haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack); #if GTEST_HAS_STD_WSTRING -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack); #endif // GTEST_HAS_STD_WSTRING namespace internal { @@ -1732,8 +1545,7 @@ namespace internal { template <typename RawType> AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression, const char* rhs_expression, - RawType lhs_value, - RawType rhs_value) { + RawType lhs_value, RawType rhs_value) { const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value); if (lhs.AlmostEquals(rhs)) { @@ -1748,10 +1560,8 @@ AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression, rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2) << rhs_value; - return EqFailure(lhs_expression, - rhs_expression, - StringStreamToString(&lhs_ss), - StringStreamToString(&rhs_ss), + return EqFailure(lhs_expression, rhs_expression, + StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss), false); } @@ 
-1761,8 +1571,7 @@ AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression, GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1, const char* expr2, const char* abs_error_expr, - double val1, - double val2, + double val1, double val2, double abs_error); // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. @@ -1770,9 +1579,7 @@ GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1, class GTEST_API_ AssertHelper { public: // Constructor. - AssertHelper(TestPartResult::Type type, - const char* file, - int line, + AssertHelper(TestPartResult::Type type, const char* file, int line, const char* message); ~AssertHelper(); @@ -1786,11 +1593,9 @@ class GTEST_API_ AssertHelper { // re-using stack space even for temporary variables, so every EXPECT_EQ // reserves stack space for another AssertHelper. struct AssertHelperData { - AssertHelperData(TestPartResult::Type t, - const char* srcfile, - int line_num, + AssertHelperData(TestPartResult::Type t, const char* srcfile, int line_num, const char* msg) - : type(t), file(srcfile), line(line_num), message(msg) { } + : type(t), file(srcfile), line(line_num), message(msg) {} TestPartResult::Type const type; const char* const file; @@ -1798,12 +1603,14 @@ class GTEST_API_ AssertHelper { std::string const message; private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData); + AssertHelperData(const AssertHelperData&) = delete; + AssertHelperData& operator=(const AssertHelperData&) = delete; }; AssertHelperData* const data_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper); + AssertHelper(const AssertHelper&) = delete; + AssertHelper& operator=(const AssertHelper&) = delete; }; } // namespace internal @@ -1860,15 +1667,14 @@ class WithParamInterface { private: // Sets parameter value. The caller is responsible for making sure the value // remains alive and unchanged throughout the current test. 
- static void SetParam(const ParamType* parameter) { - parameter_ = parameter; - } + static void SetParam(const ParamType* parameter) { parameter_ = parameter; } // Static value used for accessing parameter during a test lifetime. static const ParamType* parameter_; // TestClass must be a subclass of WithParamInterface<T> and Test. - template <class TestClass> friend class internal::ParameterizedTestFactory; + template <class TestClass> + friend class internal::ParameterizedTestFactory; }; template <typename T> @@ -1878,8 +1684,7 @@ const T* WithParamInterface<T>::parameter_ = nullptr; // WithParamInterface, and can just inherit from ::testing::TestWithParam. template <typename T> -class TestWithParam : public Test, public WithParamInterface<T> { -}; +class TestWithParam : public Test, public WithParamInterface<T> {}; // Macros for indicating success/failure in test code. @@ -1910,7 +1715,7 @@ class TestWithParam : public Test, public WithParamInterface<T> { // Generates a nonfatal failure at the given source file location with // a generic message. -#define ADD_FAILURE_AT(file, line) \ +#define ADD_FAILURE_AT(file, line) \ GTEST_MESSAGE_AT_(file, line, "Failed", \ ::testing::TestPartResult::kNonFatalFailure) @@ -1925,7 +1730,7 @@ class TestWithParam : public Test, public WithParamInterface<T> { // Define this macro to 1 to omit the definition of FAIL(), which is a // generic name and clashes with some other libraries. #if !GTEST_DONT_DEFINE_FAIL -# define FAIL() GTEST_FAIL() +#define FAIL() GTEST_FAIL() #endif // Generates a success with a generic message. @@ -1934,7 +1739,7 @@ class TestWithParam : public Test, public WithParamInterface<T> { // Define this macro to 1 to omit the definition of SUCCEED(), which // is a generic name and clashes with some other libraries. #if !GTEST_DONT_DEFINE_SUCCEED -# define SUCCEED() GTEST_SUCCEED() +#define SUCCEED() GTEST_SUCCEED() #endif // Macros for testing exceptions. 
@@ -1962,16 +1767,15 @@ class TestWithParam : public Test, public WithParamInterface<T> { // Boolean assertions. Condition can be either a Boolean expression or an // AssertionResult. For more information on how to use AssertionResult with // these macros see comments on that class. -#define GTEST_EXPECT_TRUE(condition) \ +#define GTEST_EXPECT_TRUE(condition) \ GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ GTEST_NONFATAL_FAILURE_) -#define GTEST_EXPECT_FALSE(condition) \ +#define GTEST_EXPECT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_NONFATAL_FAILURE_) #define GTEST_ASSERT_TRUE(condition) \ - GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ - GTEST_FATAL_FAILURE_) -#define GTEST_ASSERT_FALSE(condition) \ + GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_) +#define GTEST_ASSERT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_FATAL_FAILURE_) @@ -2070,27 +1874,27 @@ class TestWithParam : public Test, public WithParamInterface<T> { // ASSERT_XY(), which clashes with some users' own code. 
#if !GTEST_DONT_DEFINE_ASSERT_EQ -# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2) +#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_NE -# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2) +#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_LE -# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2) +#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_LT -# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2) +#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_GE -# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2) +#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_GT -# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2) +#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2) #endif // C-string Comparisons. All tests treat NULL and any non-NULL string @@ -2115,7 +1919,7 @@ class TestWithParam : public Test, public WithParamInterface<T> { EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) #define EXPECT_STRCASEEQ(s1, s2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2) -#define EXPECT_STRCASENE(s1, s2)\ +#define EXPECT_STRCASENE(s1, s2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) #define ASSERT_STREQ(s1, s2) \ @@ -2124,7 +1928,7 @@ class TestWithParam : public Test, public WithParamInterface<T> { ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) #define ASSERT_STRCASEEQ(s1, s2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2) -#define ASSERT_STRCASENE(s1, s2)\ +#define ASSERT_STRCASENE(s1, s2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) // Macros for comparing floating-point numbers. 
@@ -2141,29 +1945,29 @@ class TestWithParam : public Test, public WithParamInterface<T> { // FloatingPoint template class in gtest-internal.h if you are // interested in the implementation details. -#define EXPECT_FLOAT_EQ(val1, val2)\ +#define EXPECT_FLOAT_EQ(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \ val1, val2) -#define EXPECT_DOUBLE_EQ(val1, val2)\ +#define EXPECT_DOUBLE_EQ(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \ val1, val2) -#define ASSERT_FLOAT_EQ(val1, val2)\ +#define ASSERT_FLOAT_EQ(val1, val2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \ val1, val2) -#define ASSERT_DOUBLE_EQ(val1, val2)\ +#define ASSERT_DOUBLE_EQ(val1, val2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \ val1, val2) -#define EXPECT_NEAR(val1, val2, abs_error)\ - EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ - val1, val2, abs_error) +#define EXPECT_NEAR(val1, val2, abs_error) \ + EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \ + abs_error) -#define ASSERT_NEAR(val1, val2, abs_error)\ - ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ - val1, val2, abs_error) +#define ASSERT_NEAR(val1, val2, abs_error) \ + ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \ + abs_error) // These predicate format functions work on floating-point values, and // can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g. 
@@ -2177,7 +1981,6 @@ GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2, GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1, double val2); - #if GTEST_OS_WINDOWS // Macros that test for HRESULT failure and success, these are only useful @@ -2189,17 +1992,17 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, // expected result and the actual result with both a human-readable // string representation of the error, if available, as well as the // hex result code. -# define EXPECT_HRESULT_SUCCEEDED(expr) \ - EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) +#define EXPECT_HRESULT_SUCCEEDED(expr) \ + EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) -# define ASSERT_HRESULT_SUCCEEDED(expr) \ - ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) +#define ASSERT_HRESULT_SUCCEEDED(expr) \ + ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) -# define EXPECT_HRESULT_FAILED(expr) \ - EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) +#define EXPECT_HRESULT_FAILED(expr) \ + EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) -# define ASSERT_HRESULT_FAILED(expr) \ - ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) +#define ASSERT_HRESULT_FAILED(expr) \ + ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) #endif // GTEST_OS_WINDOWS @@ -2214,9 +2017,9 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, // ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed"; // #define ASSERT_NO_FATAL_FAILURE(statement) \ - GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_) + GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_) #define EXPECT_NO_FATAL_FAILURE(statement) \ - GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_) + GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_) // Causes a trace (including the 
given source file path and line number, // and the given message) to be included in every test failure message generated @@ -2258,7 +2061,8 @@ class GTEST_API_ ScopedTrace { private: void PushTrace(const char* file, int line, std::string message); - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace); + ScopedTrace(const ScopedTrace&) = delete; + ScopedTrace& operator=(const ScopedTrace&) = delete; } GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its // c'tor and d'tor. Therefore it doesn't // need to be used otherwise. @@ -2278,9 +2082,9 @@ class GTEST_API_ ScopedTrace { // Assuming that each thread maintains its own stack of traces. // Therefore, a SCOPED_TRACE() would (correctly) only affect the // assertions in its own thread. -#define SCOPED_TRACE(message) \ - ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\ - __FILE__, __LINE__, (message)) +#define SCOPED_TRACE(message) \ + ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \ + __FILE__, __LINE__, (message)) // Compile-time assertion for type equality. // StaticAssertTypeEq<type1, type2>() compiles if and only if type1 and type2 @@ -2378,20 +2182,19 @@ constexpr bool StaticAssertTypeEq() noexcept { // EXPECT_EQ(a_.size(), 0); // EXPECT_EQ(b_.size(), 1); // } -// -// GOOGLETEST_CM0011 DO NOT DELETE -#if !GTEST_DONT_DEFINE_TEST -#define TEST_F(test_fixture, test_name)\ +#define GTEST_TEST_F(test_fixture, test_name) \ GTEST_TEST_(test_fixture, test_name, test_fixture, \ ::testing::internal::GetTypeId<test_fixture>()) -#endif // !GTEST_DONT_DEFINE_TEST +#if !GTEST_DONT_DEFINE_TEST_F +#define TEST_F(test_fixture, test_name) GTEST_TEST_F(test_fixture, test_name) +#endif // Returns a path to temporary directory. // Tries to determine an appropriate directory for the platform. GTEST_API_ std::string TempDir(); #ifdef _MSC_VER -# pragma warning(pop) +#pragma warning(pop) #endif // Dynamically registers a test with the framework. 
@@ -2445,6 +2248,7 @@ GTEST_API_ std::string TempDir(); // } // ... // int main(int argc, char** argv) { +// ::testing::InitGoogleTest(&argc, argv); // std::vector<int> values_to_test = LoadValuesFromConfig(); // RegisterMyTests(values_to_test); // ... @@ -2486,9 +2290,7 @@ TestInfo* RegisterTest(const char* test_suite_name, const char* test_name, // namespace and has an all-caps name. int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_; -inline int RUN_ALL_TESTS() { - return ::testing::UnitTest::GetInstance()->Run(); -} +inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); } GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h index 5029a9bb0..47a24aa68 100644 --- a/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h +++ b/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h @@ -26,17 +26,19 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// This file is AUTOMATICALLY GENERATED on 01/02/2019 by command -// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! // // Implements a family of generic predicate assertion macros. -// GOOGLETEST_CM0001 DO NOT DELETE + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ -#include "gtest/gtest.h" +#include "gtest/gtest-assertion-result.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" namespace testing { @@ -72,22 +74,18 @@ namespace testing { // GTEST_ASSERT_ is the basic statement to which all of the assertions // in this file reduce. Don't use this in your code. 
-#define GTEST_ASSERT_(expression, on_failure) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ +#define GTEST_ASSERT_(expression, on_failure) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ if (const ::testing::AssertionResult gtest_ar = (expression)) \ - ; \ - else \ + ; \ + else \ on_failure(gtest_ar.failure_message()) - // Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use // this in your code. -template <typename Pred, - typename T1> -AssertionResult AssertPred1Helper(const char* pred_text, - const char* e1, - Pred pred, - const T1& v1) { +template <typename Pred, typename T1> +AssertionResult AssertPred1Helper(const char* pred_text, const char* e1, + Pred pred, const T1& v1) { if (pred(v1)) return AssertionSuccess(); return AssertionFailure() @@ -98,40 +96,27 @@ AssertionResult AssertPred1Helper(const char* pred_text, // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1. // Don't use this in your code. -#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, v1), \ - on_failure) +#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, v1), on_failure) // Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use // this in your code. -#define GTEST_PRED1_(pred, v1, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \ - #v1, \ - pred, \ - v1), on_failure) +#define GTEST_PRED1_(pred, v1, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure) // Unary predicate assertion macros. 
#define EXPECT_PRED_FORMAT1(pred_format, v1) \ GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED1(pred, v1) \ - GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_) #define ASSERT_PRED_FORMAT1(pred_format, v1) \ GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED1(pred, v1) \ - GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_) - - +#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_) // Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use // this in your code. -template <typename Pred, - typename T1, - typename T2> -AssertionResult AssertPred2Helper(const char* pred_text, - const char* e1, - const char* e2, - Pred pred, - const T1& v1, +template <typename Pred, typename T1, typename T2> +AssertionResult AssertPred2Helper(const char* pred_text, const char* e1, + const char* e2, Pred pred, const T1& v1, const T2& v2) { if (pred(v1, v2)) return AssertionSuccess(); @@ -145,19 +130,14 @@ AssertionResult AssertPred2Helper(const char* pred_text, // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2. // Don't use this in your code. -#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \ - on_failure) +#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure) // Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use // this in your code. -#define GTEST_PRED2_(pred, v1, v2, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \ - #v1, \ - #v2, \ - pred, \ - v1, \ - v2), on_failure) +#define GTEST_PRED2_(pred, v1, v2, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \ + on_failure) // Binary predicate assertion macros. 
#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \ @@ -169,22 +149,12 @@ AssertionResult AssertPred2Helper(const char* pred_text, #define ASSERT_PRED2(pred, v1, v2) \ GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_) - - // Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use // this in your code. -template <typename Pred, - typename T1, - typename T2, - typename T3> -AssertionResult AssertPred3Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3) { +template <typename Pred, typename T1, typename T2, typename T3> +AssertionResult AssertPred3Helper(const char* pred_text, const char* e1, + const char* e2, const char* e3, Pred pred, + const T1& v1, const T2& v2, const T3& v3) { if (pred(v1, v2, v3)) return AssertionSuccess(); return AssertionFailure() @@ -198,21 +168,15 @@ AssertionResult AssertPred3Helper(const char* pred_text, // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3. // Don't use this in your code. -#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \ - on_failure) +#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure) // Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use // this in your code. -#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - pred, \ - v1, \ - v2, \ - v3), on_failure) +#define GTEST_PRED3_(pred, v1, v2, v3, on_failure) \ + GTEST_ASSERT_( \ + ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \ + on_failure) // Ternary predicate assertion macros. 
#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \ @@ -224,25 +188,13 @@ AssertionResult AssertPred3Helper(const char* pred_text, #define ASSERT_PRED3(pred, v1, v2, v3) \ GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_) - - // Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use // this in your code. -template <typename Pred, - typename T1, - typename T2, - typename T3, - typename T4> -AssertionResult AssertPred4Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - const char* e4, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3, - const T4& v4) { +template <typename Pred, typename T1, typename T2, typename T3, typename T4> +AssertionResult AssertPred4Helper(const char* pred_text, const char* e1, + const char* e2, const char* e3, + const char* e4, Pred pred, const T1& v1, + const T2& v2, const T3& v3, const T4& v4) { if (pred(v1, v2, v3, v4)) return AssertionSuccess(); return AssertionFailure() @@ -257,23 +209,15 @@ AssertionResult AssertPred4Helper(const char* pred_text, // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4. // Don't use this in your code. -#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \ - on_failure) +#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure) // Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use // this in your code. -#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - #v4, \ - pred, \ - v1, \ - v2, \ - v3, \ - v4), on_failure) +#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \ + v1, v2, v3, v4), \ + on_failure) // 4-ary predicate assertion macros. 
#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ @@ -285,28 +229,15 @@ AssertionResult AssertPred4Helper(const char* pred_text, #define ASSERT_PRED4(pred, v1, v2, v3, v4) \ GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) - - // Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use // this in your code. -template <typename Pred, - typename T1, - typename T2, - typename T3, - typename T4, +template <typename Pred, typename T1, typename T2, typename T3, typename T4, typename T5> -AssertionResult AssertPred5Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - const char* e4, - const char* e5, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3, - const T4& v4, - const T5& v5) { +AssertionResult AssertPred5Helper(const char* pred_text, const char* e1, + const char* e2, const char* e3, + const char* e4, const char* e5, Pred pred, + const T1& v1, const T2& v2, const T3& v3, + const T4& v4, const T5& v5) { if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess(); return AssertionFailure() @@ -322,25 +253,16 @@ AssertionResult AssertPred5Helper(const char* pred_text, // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5. // Don't use this in your code. -#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\ +#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure) \ GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \ on_failure) // Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use // this in your code. 
-#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - #v4, \ - #v5, \ - pred, \ - v1, \ - v2, \ - v3, \ - v4, \ - v5), on_failure) +#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \ + pred, v1, v2, v3, v4, v5), \ + on_failure) // 5-ary predicate assertion macros. #define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ @@ -352,8 +274,6 @@ AssertionResult AssertPred5Helper(const char* pred_text, #define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \ GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) - - } // namespace testing #endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h b/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h index 38b9d85a5..1f37dc31c 100644 --- a/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h +++ b/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h @@ -27,9 +27,8 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Google C++ Testing and Mocking Framework definitions useful in production code. -// GOOGLETEST_CM0003 DO NOT DELETE +// Google C++ Testing and Mocking Framework definitions useful in production +// code. #ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ #define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ @@ -55,7 +54,7 @@ // Note: The test class must be in the same namespace as the class being tested. // For example, putting MyClassTest in an anonymous namespace will not work. 
-#define FRIEND_TEST(test_case_name, test_name)\ -friend class test_case_name##_##test_name##_Test +#define FRIEND_TEST(test_case_name, test_name) \ + friend class test_case_name##_##test_name##_Test #endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md b/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md index ff391fb4e..cb49e2c75 100644 --- a/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md +++ b/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md @@ -15,18 +15,6 @@ The custom directory is an injection point for custom user configurations. The following macros can be defined: -### Flag related macros: - -* `GTEST_FLAG(flag_name)` -* `GTEST_USE_OWN_FLAGFILE_FLAG_` - Define to 0 when the system provides its - own flagfile flag parsing. -* `GTEST_DECLARE_bool_(name)` -* `GTEST_DECLARE_int32_(name)` -* `GTEST_DECLARE_string_(name)` -* `GTEST_DEFINE_bool_(name, default_val, doc)` -* `GTEST_DEFINE_int32_(name, default_val, doc)` -* `GTEST_DEFINE_string_(name, default_val, doc)` - ### Logging: * `GTEST_LOG_(severity)` diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h index db02881c0..9b7fb4261 100644 --- a/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h +++ b/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h @@ -34,4 +34,35 @@ #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +// Use a stub Notification class. +// +// The built-in Notification class in GoogleTest v1.12.1 uses std::mutex and +// std::condition_variable. 
The <mutex> and <condition_variable> headers of +// mingw32 g++ (GNU 10.0.0) define std::mutex and std::condition_variable only +// when configured with the posix threads option but don't define them when +// configured with the win32 threads option. The Notification class is only +// used in GoogleTest's internal tests. Since we don't build GoogleTest's +// internal tests, we don't need a working Notification class. Although it's +// not hard to fix the mingw32 g++ compilation errors by implementing the +// Notification class using Windows CRITICAL_SECTION and CONDITION_VARIABLE, +// it's simpler to just use a stub Notification class on all platforms. +// +// The default constructor of the stub class is deleted and the declaration of +// the Notify() method is commented out, so that compilation will fail if any +// code actually uses the Notification class. + +#define GTEST_HAS_NOTIFICATION_ 1 +namespace testing { +namespace internal { +class Notification { + public: + Notification() = delete; + Notification(const Notification&) = delete; + Notification& operator=(const Notification&) = delete; + // void Notify(); + void WaitForNotification() {} +}; +} // namespace internal +} // namespace testing + #endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h index 490296dfa..45580ae80 100644 --- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h +++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h @@ -26,27 +26,31 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// + // The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines internal utilities needed for implementing // death tests. They are subject to change without notice. -// GOOGLETEST_CM0001 DO NOT DELETE + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#include <stdio.h> + +#include <memory> + #include "gtest/gtest-matchers.h" #include "gtest/internal/gtest-internal.h" -#include <stdio.h> -#include <memory> +GTEST_DECLARE_string_(internal_run_death_test); namespace testing { namespace internal { -GTEST_DECLARE_string_(internal_run_death_test); - // Names of the flags (needed for parsing Google Test flags). const char kDeathTestStyleFlag[] = "death_test_style"; const char kDeathTestUseFork[] = "death_test_use_fork"; @@ -83,16 +87,18 @@ class GTEST_API_ DeathTest { static bool Create(const char* statement, Matcher<const std::string&> matcher, const char* file, int line, DeathTest** test); DeathTest(); - virtual ~DeathTest() { } + virtual ~DeathTest() {} // A helper class that aborts a death test when it's deleted. class ReturnSentinel { public: - explicit ReturnSentinel(DeathTest* test) : test_(test) { } + explicit ReturnSentinel(DeathTest* test) : test_(test) {} ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); } + private: DeathTest* const test_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel); + ReturnSentinel(const ReturnSentinel&) = delete; + ReturnSentinel& operator=(const ReturnSentinel&) = delete; } GTEST_ATTRIBUTE_UNUSED_; // An enumeration of possible roles that may be taken when a death @@ -137,7 +143,8 @@ class GTEST_API_ DeathTest { // A string containing a description of the outcome of the last death test. 
static std::string last_death_test_message_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest); + DeathTest(const DeathTest&) = delete; + DeathTest& operator=(const DeathTest&) = delete; }; GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 @@ -145,7 +152,7 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 // Factory interface for death tests. May be mocked out for testing. class DeathTestFactory { public: - virtual ~DeathTestFactory() { } + virtual ~DeathTestFactory() {} virtual bool Create(const char* statement, Matcher<const std::string&> matcher, const char* file, int line, DeathTest** test) = 0; @@ -186,28 +193,28 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher( // Traps C++ exceptions escaping statement and reports them as test // failures. Note that trapping SEH exceptions is not implemented here. -# if GTEST_HAS_EXCEPTIONS -# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } catch (const ::std::exception& gtest_exception) { \ - fprintf(\ - stderr, \ - "\n%s: Caught std::exception-derived exception escaping the " \ - "death test statement. Exception message: %s\n", \ +#if GTEST_HAS_EXCEPTIONS +#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (const ::std::exception& gtest_exception) { \ + fprintf( \ + stderr, \ + "\n%s: Caught std::exception-derived exception escaping the " \ + "death test statement. Exception message: %s\n", \ ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \ - gtest_exception.what()); \ - fflush(stderr); \ + gtest_exception.what()); \ + fflush(stderr); \ death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ - } catch (...) { \ + } catch (...) 
{ \ death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ } -# else -# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ +#else +#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) -# endif +#endif // This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, // ASSERT_EXIT*, and EXPECT_EXIT*. @@ -236,8 +243,6 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher( gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ break; \ } \ - default: \ - break; \ } \ } \ } else \ @@ -265,16 +270,12 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher( // RUN_ALL_TESTS was called. class InternalRunDeathTestFlag { public: - InternalRunDeathTestFlag(const std::string& a_file, - int a_line, - int an_index, + InternalRunDeathTestFlag(const std::string& a_file, int a_line, int an_index, int a_write_fd) - : file_(a_file), line_(a_line), index_(an_index), - write_fd_(a_write_fd) {} + : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {} ~InternalRunDeathTestFlag() { - if (write_fd_ >= 0) - posix::Close(write_fd_); + if (write_fd_ >= 0) posix::Close(write_fd_); } const std::string& file() const { return file_; } @@ -288,7 +289,8 @@ class InternalRunDeathTestFlag { int index_; int write_fd_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag); + InternalRunDeathTestFlag(const InternalRunDeathTestFlag&) = delete; + InternalRunDeathTestFlag& operator=(const InternalRunDeathTestFlag&) = delete; }; // Returns a newly created InternalRunDeathTestFlag object with fields diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h index 0c033abc3..a2a60a962 100644 --- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h +++ 
b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h @@ -26,7 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // Google Test filepath utilities // // This header file declares classes and functions used internally by @@ -35,7 +35,9 @@ // This file is #included in gtest/internal/gtest-internal.h. // Do not include this header file separately! -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ @@ -61,8 +63,8 @@ namespace internal { class GTEST_API_ FilePath { public: - FilePath() : pathname_("") { } - FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { } + FilePath() : pathname_("") {} + FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) {} explicit FilePath(const std::string& pathname) : pathname_(pathname) { Normalize(); @@ -73,9 +75,7 @@ class GTEST_API_ FilePath { return *this; } - void Set(const FilePath& rhs) { - pathname_ = rhs.pathname_; - } + void Set(const FilePath& rhs) { pathname_ = rhs.pathname_; } const std::string& string() const { return pathname_; } const char* c_str() const { return pathname_.c_str(); } @@ -88,8 +88,7 @@ class GTEST_API_ FilePath { // than zero (e.g., 12), returns "dir/test_12.xml". // On Windows platform, uses \ as the separator rather than /. 
static FilePath MakeFileName(const FilePath& directory, - const FilePath& base_name, - int number, + const FilePath& base_name, int number, const char* extension); // Given directory = "dir", relative_path = "test.xml", diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h index f8cbdbd81..9b04e4c85 100644 --- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h +++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h @@ -26,13 +26,15 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // The Google C++ Testing and Mocking Framework (Google Test) // // This header file declares functions and macros used internally by // Google Test. They are subject to change without notice. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ @@ -40,19 +42,20 @@ #include "gtest/internal/gtest-port.h" #if GTEST_OS_LINUX -# include <stdlib.h> -# include <sys/types.h> -# include <sys/wait.h> -# include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> #endif // GTEST_OS_LINUX #if GTEST_HAS_EXCEPTIONS -# include <stdexcept> +#include <stdexcept> #endif #include <ctype.h> #include <float.h> #include <string.h> + #include <cstdint> #include <iomanip> #include <limits> @@ -76,7 +79,7 @@ // the current line number. 
For more details, see // http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6 #define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar) -#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar +#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar // Stringifies its argument. // Work around a bug in visual studio which doesn't accept code like this: @@ -98,21 +101,21 @@ namespace testing { // Forward declarations. -class AssertionResult; // Result of an assertion. -class Message; // Represents a failure message. -class Test; // Represents a test. -class TestInfo; // Information about a test. -class TestPartResult; // Result of a test part. -class UnitTest; // A collection of test suites. +class AssertionResult; // Result of an assertion. +class Message; // Represents a failure message. +class Test; // Represents a test. +class TestInfo; // Information about a test. +class TestPartResult; // Result of a test part. +class UnitTest; // A collection of test suites. template <typename T> ::std::string PrintToString(const T& value); namespace internal { -struct TraceInfo; // Information about a trace point. -class TestInfoImpl; // Opaque implementation of TestInfo -class UnitTestImpl; // Opaque implementation of UnitTest +struct TraceInfo; // Information about a trace point. +class TestInfoImpl; // Opaque implementation of TestInfo +class UnitTestImpl; // Opaque implementation of UnitTest // The text used in failure messages to indicate the start of the // stack trace. @@ -121,6 +124,7 @@ GTEST_API_ extern const char kStackTraceMarker[]; // An IgnoredValue object can be implicitly constructed from ANY value. class IgnoredValue { struct Sink {}; + public: // This constructor template allows any value to be implicitly // converted to IgnoredValue. The object has no data member and @@ -136,13 +140,13 @@ class IgnoredValue { }; // Appends the user-supplied message to the Google-Test-generated message. 
-GTEST_API_ std::string AppendUserMessage( - const std::string& gtest_msg, const Message& user_msg); +GTEST_API_ std::string AppendUserMessage(const std::string& gtest_msg, + const Message& user_msg); #if GTEST_HAS_EXCEPTIONS -GTEST_DISABLE_MSC_WARNINGS_PUSH_(4275 \ -/* an exported class was derived from a class that was not exported */) +GTEST_DISABLE_MSC_WARNINGS_PUSH_( + 4275 /* an exported class was derived from a class that was not exported */) // This exception is thrown by (and only by) a failed Google Test // assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions @@ -181,14 +185,6 @@ GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string>& left, } // namespace edit_distance -// Calculate the diff between 'left' and 'right' and return it in unified diff -// format. -// If not null, stores in 'total_line_count' the total number of lines found -// in left + right. -GTEST_API_ std::string DiffStrings(const std::string& left, - const std::string& right, - size_t* total_line_count); - // Constructs and returns the message for an equality assertion // (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. // @@ -212,10 +208,8 @@ GTEST_API_ AssertionResult EqFailure(const char* expected_expression, // Constructs a failure message for Boolean assertions such as EXPECT_TRUE. GTEST_API_ std::string GetBoolAssertionFailureMessage( - const AssertionResult& assertion_result, - const char* expression_text, - const char* actual_predicate_value, - const char* expected_predicate_value); + const AssertionResult& assertion_result, const char* expression_text, + const char* actual_predicate_value, const char* expected_predicate_value); // This template class represents an IEEE floating-point number // (either single-precision or double-precision, depending on the @@ -256,11 +250,11 @@ class FloatingPoint { // Constants. // # of bits in a number. 
- static const size_t kBitCount = 8*sizeof(RawType); + static const size_t kBitCount = 8 * sizeof(RawType); // # of fraction bits in a number. static const size_t kFractionBitCount = - std::numeric_limits<RawType>::digits - 1; + std::numeric_limits<RawType>::digits - 1; // # of exponent bits in a number. static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; @@ -269,8 +263,8 @@ class FloatingPoint { static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1); // The mask for the fraction bits. - static const Bits kFractionBitMask = - ~static_cast<Bits>(0) >> (kExponentBitCount + 1); + static const Bits kFractionBitMask = ~static_cast<Bits>(0) >> + (kExponentBitCount + 1); // The mask for the exponent bits. static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); @@ -309,9 +303,7 @@ class FloatingPoint { } // Returns the floating-point number that represent positive infinity. - static RawType Infinity() { - return ReinterpretBits(kExponentBitMask); - } + static RawType Infinity() { return ReinterpretBits(kExponentBitMask); } // Returns the maximum representable finite floating-point number. static RawType Max(); @@ -319,7 +311,7 @@ class FloatingPoint { // Non-static methods // Returns the bits that represents this number. - const Bits &bits() const { return u_.bits_; } + const Bits& bits() const { return u_.bits_; } // Returns the exponent bits of this number. Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } @@ -348,8 +340,8 @@ class FloatingPoint { // a NAN must return false. if (is_nan() || rhs.is_nan()) return false; - return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) - <= kMaxUlps; + return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <= + kMaxUlps; } private: @@ -374,7 +366,7 @@ class FloatingPoint { // // Read http://en.wikipedia.org/wiki/Signed_number_representations // for more details on signed number representations. 
- static Bits SignAndMagnitudeToBiased(const Bits &sam) { + static Bits SignAndMagnitudeToBiased(const Bits& sam) { if (kSignBitMask & sam) { // sam represents a negative number. return ~sam + 1; @@ -386,8 +378,8 @@ class FloatingPoint { // Given two numbers in the sign-and-magnitude representation, // returns the distance between them as an unsigned number. - static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1, - const Bits &sam2) { + static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits& sam1, + const Bits& sam2) { const Bits biased1 = SignAndMagnitudeToBiased(sam1); const Bits biased2 = SignAndMagnitudeToBiased(sam2); return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1); @@ -399,9 +391,13 @@ class FloatingPoint { // We cannot use std::numeric_limits<T>::max() as it clashes with the max() // macro defined by <windows.h>. template <> -inline float FloatingPoint<float>::Max() { return FLT_MAX; } +inline float FloatingPoint<float>::Max() { + return FLT_MAX; +} template <> -inline double FloatingPoint<double>::Max() { return DBL_MAX; } +inline double FloatingPoint<double>::Max() { + return DBL_MAX; +} // Typedefs the instances of the FloatingPoint template class that we // care to use. @@ -461,7 +457,8 @@ class TestFactoryBase { TestFactoryBase() {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase); + TestFactoryBase(const TestFactoryBase&) = delete; + TestFactoryBase& operator=(const TestFactoryBase&) = delete; }; // This class provides implementation of TeastFactoryBase interface. @@ -510,11 +507,11 @@ inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull( template <typename T> // Note that SuiteApiResolver inherits from T because -// SetUpTestSuite()/TearDownTestSuite() could be protected. Ths way +// SetUpTestSuite()/TearDownTestSuite() could be protected. This way // SuiteApiResolver can access them. struct SuiteApiResolver : T { // testing::Test is only forward declared at this point. 
So we make it a - // dependend class for the compiler to be OK with it. + // dependent class for the compiler to be OK with it. using Test = typename std::conditional<sizeof(T) != 0, ::testing::Test, void>::type; @@ -654,7 +651,8 @@ inline const char* SkipComma(const char* str) { if (comma == nullptr) { return nullptr; } - while (IsSpace(*(++comma))) {} + while (IsSpace(*(++comma))) { + } return comma; } @@ -668,7 +666,7 @@ inline std::string GetPrefixUntilComma(const char* str) { // Splits a given string on a given delimiter, populating a given // vector with the fields. void SplitString(const ::std::string& str, char delimiter, - ::std::vector< ::std::string>* dest); + ::std::vector<::std::string>* dest); // The default argument to the template below for the case when the user does // not provide a name generator. @@ -781,13 +779,13 @@ class TypeParameterizedTestSuite { const std::vector<std::string>& type_names = GenerateNames<DefaultNameGenerator, Types>()) { RegisterTypeParameterizedTestSuiteInstantiation(case_name); - std::string test_name = StripTrailingSpaces( - GetPrefixUntilComma(test_names)); + std::string test_name = + StripTrailingSpaces(GetPrefixUntilComma(test_names)); if (!state->TestExists(test_name)) { fprintf(stderr, "Failed to get code location for test %s.%s at %s.", case_name, test_name.c_str(), - FormatFileLocation(code_location.file.c_str(), - code_location.line).c_str()); + FormatFileLocation(code_location.file.c_str(), code_location.line) + .c_str()); fflush(stderr); posix::Abort(); } @@ -831,8 +829,8 @@ class TypeParameterizedTestSuite<Fixture, internal::None, Types> { // For example, if Foo() calls Bar(), which in turn calls // GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in // the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. 
-GTEST_API_ std::string GetCurrentOsStackTraceExceptTop( - UnitTest* unit_test, int skip_count); +GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest* unit_test, + int skip_count); // Helpers for suppressing warnings on unreachable code or constant // condition. @@ -881,7 +879,8 @@ class GTEST_API_ Random { private: uint32_t state_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); + Random(const Random&) = delete; + Random& operator=(const Random&) = delete; }; // Turns const U&, U&, const U, and U all into U. @@ -954,7 +953,9 @@ IsContainer IsContainerTest(int /* dummy */) { typedef char IsNotContainer; template <class C> -IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; } +IsNotContainer IsContainerTest(long /* dummy */) { + return '\0'; +} // Trait to detect whether a type T is a hash table. // The heuristic used is that the type contains an inner type `hasher` and does @@ -1017,11 +1018,13 @@ bool ArrayEq(const T* lhs, size_t size, const U* rhs); // This generic version is used when k is 0. template <typename T, typename U> -inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; } +inline bool ArrayEq(const T& lhs, const U& rhs) { + return lhs == rhs; +} // This overload is used when k >= 1. 
template <typename T, typename U, size_t N> -inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) { +inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) { return internal::ArrayEq(lhs, N, rhs); } @@ -1031,8 +1034,7 @@ inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) { template <typename T, typename U> bool ArrayEq(const T* lhs, size_t size, const U* rhs) { for (size_t i = 0; i != size; i++) { - if (!internal::ArrayEq(lhs[i], rhs[i])) - return false; + if (!internal::ArrayEq(lhs[i], rhs[i])) return false; } return true; } @@ -1042,8 +1044,7 @@ bool ArrayEq(const T* lhs, size_t size, const U* rhs) { template <typename Iter, typename Element> Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) { for (Iter it = begin; it != end; ++it) { - if (internal::ArrayEq(*it, elem)) - return it; + if (internal::ArrayEq(*it, elem)) return it; } return end; } @@ -1057,11 +1058,13 @@ void CopyArray(const T* from, size_t size, U* to); // This generic version is used when k is 0. template <typename T, typename U> -inline void CopyArray(const T& from, U* to) { *to = from; } +inline void CopyArray(const T& from, U* to) { + *to = from; +} // This overload is used when k >= 1. template <typename T, typename U, size_t N> -inline void CopyArray(const T(&from)[N], U(*to)[N]) { +inline void CopyArray(const T (&from)[N], U (*to)[N]) { internal::CopyArray(from, N, *to); } @@ -1114,8 +1117,7 @@ class NativeArray { } ~NativeArray() { - if (clone_ != &NativeArray::InitRef) - delete[] array_; + if (clone_ != &NativeArray::InitRef) delete[] array_; } // STL-style container methods. 
@@ -1123,8 +1125,7 @@ class NativeArray { const_iterator begin() const { return array_; } const_iterator end() const { return array_ + size_; } bool operator==(const NativeArray& rhs) const { - return size() == rhs.size() && - ArrayEq(begin(), size(), rhs.begin()); + return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin()); } private: @@ -1335,9 +1336,9 @@ struct tuple_size<testing::internal::FlatTuple<Ts...>> #endif } // namespace std -#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ - ::testing::internal::AssertHelper(result_type, file, line, message) \ - = ::testing::Message() +#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ + ::testing::internal::AssertHelper(result_type, file, line, message) = \ + ::testing::Message() #define GTEST_MESSAGE_(message, result_type) \ GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) @@ -1458,103 +1459,112 @@ class NeverThrown { #endif // GTEST_HAS_EXCEPTIONS -#define GTEST_TEST_NO_THROW_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::TrueWithString gtest_msg{}) { \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ - catch (...) { \ - gtest_msg.value = "it throws."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \ - fail(("Expected: " #statement " doesn't throw an exception.\n" \ - " Actual: " + gtest_msg.value).c_str()) - -#define GTEST_TEST_ANY_THROW_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - bool gtest_caught_any = false; \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (...) 
{ \ - gtest_caught_any = true; \ - } \ - if (!gtest_caught_any) { \ +#define GTEST_TEST_NO_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::TrueWithString gtest_msg{}) { \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ + catch (...) { \ + gtest_msg.value = "it throws."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__) \ + : fail(("Expected: " #statement " doesn't throw an exception.\n" \ + " Actual: " + \ + gtest_msg.value) \ + .c_str()) + +#define GTEST_TEST_ANY_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + bool gtest_caught_any = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (...) { \ + gtest_caught_any = true; \ + } \ + if (!gtest_caught_any) { \ goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \ - fail("Expected: " #statement " throws an exception.\n" \ - " Actual: it doesn't.") - + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__) \ + : fail("Expected: " #statement \ + " throws an exception.\n" \ + " Actual: it doesn't.") // Implements Boolean test assertions such as EXPECT_TRUE. expression can be // either a boolean expression or an AssertionResult. text is a textual // representation of expression as it was passed into the EXPECT_TRUE. 
#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (const ::testing::AssertionResult gtest_ar_ = \ - ::testing::AssertionResult(expression)) \ - ; \ - else \ - fail(::testing::internal::GetBoolAssertionFailureMessage(\ - gtest_ar_, text, #actual, #expected).c_str()) - -#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar_ = \ + ::testing::AssertionResult(expression)) \ + ; \ + else \ + fail(::testing::internal::GetBoolAssertionFailureMessage( \ + gtest_ar_, text, #actual, #expected) \ + .c_str()) + +#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \ - fail("Expected: " #statement " doesn't generate new fatal " \ - "failures in the current thread.\n" \ - " Actual: it does.") + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__) \ + : fail("Expected: " #statement \ + " doesn't generate new fatal " \ + "failures in the current thread.\n" \ + " Actual: it does.") // Expands to the name of the class that implements the given test. #define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ test_suite_name##_##test_name##_Test // Helper macro for defining tests. 
-#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \ - static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \ - "test_suite_name must not be empty"); \ - static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \ - "test_name must not be empty"); \ - class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ - : public parent_class { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \ - ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ - test_name)); \ - GTEST_DISALLOW_MOVE_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ - test_name)); \ - \ - private: \ - void TestBody() override; \ - static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ - }; \ - \ - ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \ - test_name)::test_info_ = \ - ::testing::internal::MakeAndRegisterTestInfo( \ - #test_suite_name, #test_name, nullptr, nullptr, \ - ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \ - ::testing::internal::SuiteApiResolver< \ - parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \ - ::testing::internal::SuiteApiResolver< \ - parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \ - new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_( \ - test_suite_name, test_name)>); \ +#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \ + static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \ + "test_suite_name must not be empty"); \ + static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \ + "test_name must not be empty"); \ + class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + : public parent_class { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \ + ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, 
test_name) \ + (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &) = delete; /* NOLINT */ \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + (GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &&) noexcept = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &&) noexcept = delete; /* NOLINT */ \ + \ + private: \ + void TestBody() override; \ + static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ + }; \ + \ + ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)::test_info_ = \ + ::testing::internal::MakeAndRegisterTestInfo( \ + #test_suite_name, #test_name, nullptr, nullptr, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \ + new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_( \ + test_suite_name, test_name)>); \ void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() #endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h index c2ef6e312..e7af2f904 100644 --- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h +++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h @@ -27,10 +27,11 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - // Type and function utilities for implementing parameterized tests. 
-// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ @@ -46,19 +47,18 @@ #include <utility> #include <vector> -#include "gtest/internal/gtest-internal.h" -#include "gtest/internal/gtest-port.h" #include "gtest/gtest-printers.h" #include "gtest/gtest-test-part.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" namespace testing { // Input to a parameterized test name generator, describing a test parameter. // Consists of the parameter value and the integer parameter index. template <class ParamType> struct TestParamInfo { - TestParamInfo(const ParamType& a_param, size_t an_index) : - param(a_param), - index(an_index) {} + TestParamInfo(const ParamType& a_param, size_t an_index) + : param(a_param), index(an_index) {} ParamType param; size_t index; }; @@ -84,8 +84,10 @@ namespace internal { GTEST_API_ void ReportInvalidTestSuiteType(const char* test_suite_name, CodeLocation code_location); -template <typename> class ParamGeneratorInterface; -template <typename> class ParamGenerator; +template <typename> +class ParamGeneratorInterface; +template <typename> +class ParamGenerator; // Interface for iterating over elements provided by an implementation // of ParamGeneratorInterface<T>. @@ -129,8 +131,7 @@ class ParamIterator { // ParamIterator assumes ownership of the impl_ pointer. 
ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {} ParamIterator& operator=(const ParamIterator& other) { - if (this != &other) - impl_.reset(other.impl_->Clone()); + if (this != &other) impl_.reset(other.impl_->Clone()); return *this; } @@ -157,7 +158,7 @@ class ParamIterator { private: friend class ParamGenerator<T>; explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {} - std::unique_ptr<ParamIteratorInterface<T> > impl_; + std::unique_ptr<ParamIteratorInterface<T>> impl_; }; // ParamGeneratorInterface<T> is the binary interface to access generators @@ -179,7 +180,7 @@ class ParamGeneratorInterface { // This class implements copy initialization semantics and the contained // ParamGeneratorInterface<T> instance is shared among all copies // of the original object. This is possible because that instance is immutable. -template<typename T> +template <typename T> class ParamGenerator { public: typedef ParamIterator<T> iterator; @@ -196,7 +197,7 @@ class ParamGenerator { iterator end() const { return iterator(impl_->End()); } private: - std::shared_ptr<const ParamGeneratorInterface<T> > impl_; + std::shared_ptr<const ParamGeneratorInterface<T>> impl_; }; // Generates values from a range of two comparable values. 
Can be used to @@ -207,8 +208,10 @@ template <typename T, typename IncrementT> class RangeGenerator : public ParamGeneratorInterface<T> { public: RangeGenerator(T begin, T end, IncrementT step) - : begin_(begin), end_(end), - step_(step), end_index_(CalculateEndIndex(begin, end, step)) {} + : begin_(begin), + end_(end), + step_(step), + end_index_(CalculateEndIndex(begin, end, step)) {} ~RangeGenerator() override {} ParamIteratorInterface<T>* Begin() const override { @@ -251,7 +254,9 @@ class RangeGenerator : public ParamGeneratorInterface<T> { private: Iterator(const Iterator& other) : ParamIteratorInterface<T>(), - base_(other.base_), value_(other.value_), index_(other.index_), + base_(other.base_), + value_(other.value_), + index_(other.index_), step_(other.step_) {} // No implementation - assignment is unsupported. @@ -263,12 +268,10 @@ class RangeGenerator : public ParamGeneratorInterface<T> { const IncrementT step_; }; // class RangeGenerator::Iterator - static int CalculateEndIndex(const T& begin, - const T& end, + static int CalculateEndIndex(const T& begin, const T& end, const IncrementT& step) { int end_index = 0; - for (T i = begin; i < end; i = static_cast<T>(i + step)) - end_index++; + for (T i = begin; i < end; i = static_cast<T>(i + step)) end_index++; return end_index; } @@ -283,7 +286,6 @@ class RangeGenerator : public ParamGeneratorInterface<T> { const int end_index_; }; // class RangeGenerator - // Generates values from a pair of STL-style iterators. Used in the // ValuesIn() function. The elements are copied from the source range // since the source can be located on the stack, and the generator @@ -341,13 +343,13 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> { << "The program attempted to compare iterators " << "from different generators." 
<< std::endl; return iterator_ == - CheckedDowncastToActualType<const Iterator>(&other)->iterator_; + CheckedDowncastToActualType<const Iterator>(&other)->iterator_; } private: Iterator(const Iterator& other) - // The explicit constructor call suppresses a false warning - // emitted by gcc when supplied with the -Wextra option. + // The explicit constructor call suppresses a false warning + // emitted by gcc when supplied with the -Wextra option. : ParamIteratorInterface<T>(), base_(other.base_), iterator_(other.iterator_) {} @@ -394,8 +396,8 @@ template <class TestClass> class ParameterizedTestFactory : public TestFactoryBase { public: typedef typename TestClass::ParamType ParamType; - explicit ParameterizedTestFactory(ParamType parameter) : - parameter_(parameter) {} + explicit ParameterizedTestFactory(ParamType parameter) + : parameter_(parameter) {} Test* CreateTest() override { TestClass::SetParam(¶meter_); return new TestClass(); @@ -404,7 +406,8 @@ class ParameterizedTestFactory : public TestFactoryBase { private: const ParamType parameter_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory); + ParameterizedTestFactory(const ParameterizedTestFactory&) = delete; + ParameterizedTestFactory& operator=(const ParameterizedTestFactory&) = delete; }; // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. @@ -440,7 +443,8 @@ class TestMetaFactory } private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory); + TestMetaFactory(const TestMetaFactory&) = delete; + TestMetaFactory& operator=(const TestMetaFactory&) = delete; }; // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. 
@@ -471,7 +475,10 @@ class ParameterizedTestSuiteInfoBase { ParameterizedTestSuiteInfoBase() {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfoBase); + ParameterizedTestSuiteInfoBase(const ParameterizedTestSuiteInfoBase&) = + delete; + ParameterizedTestSuiteInfoBase& operator=( + const ParameterizedTestSuiteInfoBase&) = delete; }; // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. @@ -547,8 +554,8 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { test_it != tests_.end(); ++test_it) { std::shared_ptr<TestInfo> test_info = *test_it; for (typename InstantiationContainer::iterator gen_it = - instantiations_.begin(); gen_it != instantiations_.end(); - ++gen_it) { + instantiations_.begin(); + gen_it != instantiations_.end(); ++gen_it) { const std::string& instantiation_name = gen_it->name; ParamGenerator<ParamType> generator((*gen_it->generator)()); ParamNameGeneratorFunc* name_func = gen_it->name_func; @@ -556,7 +563,7 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { int line = gen_it->line; std::string test_suite_name; - if ( !instantiation_name.empty() ) + if (!instantiation_name.empty()) test_suite_name = instantiation_name + "/"; test_suite_name += test_info->test_suite_base_name; @@ -569,17 +576,16 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { Message test_name_stream; - std::string param_name = name_func( - TestParamInfo<ParamType>(*param_it, i)); + std::string param_name = + name_func(TestParamInfo<ParamType>(*param_it, i)); GTEST_CHECK_(IsValidParamName(param_name)) << "Parameterized test name '" << param_name - << "' is invalid, in " << file - << " line " << line << std::endl; + << "' is invalid, in " << file << " line " << line << std::endl; GTEST_CHECK_(test_param_names.count(param_name) == 0) - << "Duplicate parameterized test name '" << param_name - << "', in " << file << " line " << line << std::endl; + << "Duplicate parameterized test name '" 
<< param_name << "', in " + << file << " line " << line << std::endl; test_param_names.insert(param_name); @@ -596,15 +602,15 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { SuiteApiResolver<TestSuite>::GetTearDownCaseOrSuite(file, line), test_info->test_meta_factory->CreateTestFactory(*param_it)); } // for param_it - } // for gen_it - } // for test_it + } // for gen_it + } // for test_it if (!generated_instantiations) { // There are no generaotrs, or they all generate nothing ... InsertSyntheticTestCase(GetTestSuiteName(), code_location_, !tests_.empty()); } - } // RegisterTests + } // RegisterTests private: // LocalTestInfo structure keeps information about a single test registered @@ -620,42 +626,39 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { const std::string test_suite_base_name; const std::string test_base_name; - const std::unique_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory; + const std::unique_ptr<TestMetaFactoryBase<ParamType>> test_meta_factory; const CodeLocation code_location; }; - using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo> >; + using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo>>; // Records data received from INSTANTIATE_TEST_SUITE_P macros: // <Instantiation name, Sequence generator creation function, // Name generator function, Source file, Source line> struct InstantiationInfo { - InstantiationInfo(const std::string &name_in, - GeneratorCreationFunc* generator_in, - ParamNameGeneratorFunc* name_func_in, - const char* file_in, - int line_in) - : name(name_in), - generator(generator_in), - name_func(name_func_in), - file(file_in), - line(line_in) {} - - std::string name; - GeneratorCreationFunc* generator; - ParamNameGeneratorFunc* name_func; - const char* file; - int line; + InstantiationInfo(const std::string& name_in, + GeneratorCreationFunc* generator_in, + ParamNameGeneratorFunc* name_func_in, const char* file_in, + int line_in) + : 
name(name_in), + generator(generator_in), + name_func(name_func_in), + file(file_in), + line(line_in) {} + + std::string name; + GeneratorCreationFunc* generator; + ParamNameGeneratorFunc* name_func; + const char* file; + int line; }; typedef ::std::vector<InstantiationInfo> InstantiationContainer; static bool IsValidParamName(const std::string& name) { // Check for empty string - if (name.empty()) - return false; + if (name.empty()) return false; // Check for invalid characters for (std::string::size_type index = 0; index < name.size(); ++index) { - if (!IsAlNum(name[index]) && name[index] != '_') - return false; + if (!IsAlNum(name[index]) && name[index] != '_') return false; } return true; @@ -666,7 +669,9 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { TestInfoContainer tests_; InstantiationContainer instantiations_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfo); + ParameterizedTestSuiteInfo(const ParameterizedTestSuiteInfo&) = delete; + ParameterizedTestSuiteInfo& operator=(const ParameterizedTestSuiteInfo&) = + delete; }; // class ParameterizedTestSuiteInfo // Legacy API is deprecated but still available @@ -709,7 +714,7 @@ class ParameterizedTestSuiteRegistry { // type we are looking for, so we downcast it to that type // without further checks. 
typed_test_info = CheckedDowncastToActualType< - ParameterizedTestSuiteInfo<TestSuite> >(test_suite_info); + ParameterizedTestSuiteInfo<TestSuite>>(test_suite_info); } break; } @@ -741,7 +746,10 @@ class ParameterizedTestSuiteRegistry { TestSuiteInfoContainer test_suite_infos_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteRegistry); + ParameterizedTestSuiteRegistry(const ParameterizedTestSuiteRegistry&) = + delete; + ParameterizedTestSuiteRegistry& operator=( + const ParameterizedTestSuiteRegistry&) = delete; }; // Keep track of what type-parameterized test suite are defined and @@ -836,7 +844,8 @@ class CartesianProductGenerator : public ParamIteratorInterface<ParamType> { public: IteratorImpl(const ParamGeneratorInterface<ParamType>* base, - const std::tuple<ParamGenerator<T>...>& generators, bool is_end) + const std::tuple<ParamGenerator<T>...>& generators, + bool is_end) : base_(base), begin_(std::get<I>(generators).begin()...), end_(std::get<I>(generators).end()...), diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h index dd845915e..f025db76a 100644 --- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h +++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h @@ -26,7 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the GTEST_OS_* macro. @@ -37,70 +37,72 @@ // Determines the platform on which Google Test is compiled. 
#ifdef __CYGWIN__ -# define GTEST_OS_CYGWIN 1 -# elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__) -# define GTEST_OS_WINDOWS_MINGW 1 -# define GTEST_OS_WINDOWS 1 +#define GTEST_OS_CYGWIN 1 +#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__) +#define GTEST_OS_WINDOWS_MINGW 1 +#define GTEST_OS_WINDOWS 1 #elif defined _WIN32 -# define GTEST_OS_WINDOWS 1 -# ifdef _WIN32_WCE -# define GTEST_OS_WINDOWS_MOBILE 1 -# elif defined(WINAPI_FAMILY) -# include <winapifamily.h> -# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -# define GTEST_OS_WINDOWS_DESKTOP 1 -# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) -# define GTEST_OS_WINDOWS_PHONE 1 -# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) -# define GTEST_OS_WINDOWS_RT 1 -# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE) -# define GTEST_OS_WINDOWS_PHONE 1 -# define GTEST_OS_WINDOWS_TV_TITLE 1 -# else - // WINAPI_FAMILY defined but no known partition matched. - // Default to desktop. -# define GTEST_OS_WINDOWS_DESKTOP 1 -# endif -# else -# define GTEST_OS_WINDOWS_DESKTOP 1 -# endif // _WIN32_WCE +#define GTEST_OS_WINDOWS 1 +#ifdef _WIN32_WCE +#define GTEST_OS_WINDOWS_MOBILE 1 +#elif defined(WINAPI_FAMILY) +#include <winapifamily.h> +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define GTEST_OS_WINDOWS_DESKTOP 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) +#define GTEST_OS_WINDOWS_PHONE 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) +#define GTEST_OS_WINDOWS_RT 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE) +#define GTEST_OS_WINDOWS_PHONE 1 +#define GTEST_OS_WINDOWS_TV_TITLE 1 +#else +// WINAPI_FAMILY defined but no known partition matched. +// Default to desktop. 
+#define GTEST_OS_WINDOWS_DESKTOP 1 +#endif +#else +#define GTEST_OS_WINDOWS_DESKTOP 1 +#endif // _WIN32_WCE #elif defined __OS2__ -# define GTEST_OS_OS2 1 +#define GTEST_OS_OS2 1 #elif defined __APPLE__ -# define GTEST_OS_MAC 1 -# include <TargetConditionals.h> -# if TARGET_OS_IPHONE -# define GTEST_OS_IOS 1 -# endif +#define GTEST_OS_MAC 1 +#include <TargetConditionals.h> +#if TARGET_OS_IPHONE +#define GTEST_OS_IOS 1 +#endif #elif defined __DragonFly__ -# define GTEST_OS_DRAGONFLY 1 +#define GTEST_OS_DRAGONFLY 1 #elif defined __FreeBSD__ -# define GTEST_OS_FREEBSD 1 +#define GTEST_OS_FREEBSD 1 #elif defined __Fuchsia__ -# define GTEST_OS_FUCHSIA 1 +#define GTEST_OS_FUCHSIA 1 +#elif defined(__GNU__) +#define GTEST_OS_GNU_HURD 1 #elif defined(__GLIBC__) && defined(__FreeBSD_kernel__) -# define GTEST_OS_GNU_KFREEBSD 1 +#define GTEST_OS_GNU_KFREEBSD 1 #elif defined __linux__ -# define GTEST_OS_LINUX 1 -# if defined __ANDROID__ -# define GTEST_OS_LINUX_ANDROID 1 -# endif +#define GTEST_OS_LINUX 1 +#if defined __ANDROID__ +#define GTEST_OS_LINUX_ANDROID 1 +#endif #elif defined __MVS__ -# define GTEST_OS_ZOS 1 +#define GTEST_OS_ZOS 1 #elif defined(__sun) && defined(__SVR4) -# define GTEST_OS_SOLARIS 1 +#define GTEST_OS_SOLARIS 1 #elif defined(_AIX) -# define GTEST_OS_AIX 1 +#define GTEST_OS_AIX 1 #elif defined(__hpux) -# define GTEST_OS_HPUX 1 +#define GTEST_OS_HPUX 1 #elif defined __native_client__ -# define GTEST_OS_NACL 1 +#define GTEST_OS_NACL 1 #elif defined __NetBSD__ -# define GTEST_OS_NETBSD 1 +#define GTEST_OS_NETBSD 1 #elif defined __OpenBSD__ -# define GTEST_OS_OPENBSD 1 +#define GTEST_OS_OPENBSD 1 #elif defined __QNX__ -# define GTEST_OS_QNX 1 +#define GTEST_OS_QNX 1 #elif defined(__HAIKU__) #define GTEST_OS_HAIKU 1 #elif defined ESP8266 diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h index 0953a781c..0003d2765 100644 --- 
a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h +++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -26,7 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // Low-level types and utilities for porting Google Test to various // platforms. All macros ending with _ and symbols defined in an // internal namespace are subject to change without notice. Code @@ -38,7 +38,9 @@ // files are expected to #include this. Therefore, it cannot #include // any other Google Test header. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ @@ -116,6 +118,7 @@ // GTEST_OS_DRAGONFLY - DragonFlyBSD // GTEST_OS_FREEBSD - FreeBSD // GTEST_OS_FUCHSIA - Fuchsia +// GTEST_OS_GNU_HURD - GNU/Hurd // GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD // GTEST_OS_HAIKU - Haiku // GTEST_OS_HPUX - HP-UX @@ -167,7 +170,7 @@ // GTEST_HAS_TYPED_TEST - typed tests // GTEST_HAS_TYPED_TEST_P - type-parameterized tests // GTEST_IS_THREADSAFE - Google Test is thread-safe. -// GOOGLETEST_CM0007 DO NOT DELETE +// GTEST_USES_RE2 - the RE2 regular expression library is used // GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with // GTEST_HAS_POSIX_RE (see above) which users can // define themselves. @@ -190,10 +193,6 @@ // GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. // GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a // variable don't have to be used. -// GTEST_DISALLOW_ASSIGN_ - disables copy operator=. -// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. -// GTEST_DISALLOW_MOVE_ASSIGN_ - disables move operator=. 
-// GTEST_DISALLOW_MOVE_AND_ASSIGN_ - disables move ctor and operator=. // GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. // GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is // suppressed (constant conditional). @@ -217,11 +216,13 @@ // - synchronization primitives. // // Regular expressions: -// RE - a simple regular expression class using the POSIX -// Extended Regular Expression syntax on UNIX-like platforms -// GOOGLETEST_CM0008 DO NOT DELETE -// or a reduced regular exception syntax on other -// platforms, including Windows. +// RE - a simple regular expression class using +// 1) the RE2 syntax on all platforms when built with RE2 +// and Abseil as dependencies +// 2) the POSIX Extended Regular Expression syntax on +// UNIX-like platforms, +// 3) A reduced regular exception syntax on other platforms, +// including Windows. // Logging: // GTEST_LOG_() - logs messages at the specified severity level. // LogToStderr() - directs all log messages to stderr. @@ -241,8 +242,6 @@ // BiggestInt - the biggest signed integer type. // // Command-line utilities: -// GTEST_DECLARE_*() - declares a flag. -// GTEST_DEFINE_*() - defines a flag. // GetInjectableArgvs() - returns the command line as a vector of strings. 
// // Environment variable utilities: @@ -263,48 +262,55 @@ #include <string.h> #include <cerrno> +// #include <condition_variable> // Guarded by GTEST_IS_THREADSAFE below #include <cstdint> +#include <iostream> #include <limits> +#include <locale> +#include <memory> +#include <string> +// #include <mutex> // Guarded by GTEST_IS_THREADSAFE below +#include <tuple> #include <type_traits> +#include <vector> #ifndef _WIN32_WCE -# include <sys/types.h> -# include <sys/stat.h> +#include <sys/stat.h> +#include <sys/types.h> #endif // !_WIN32_WCE #if defined __APPLE__ -# include <AvailabilityMacros.h> -# include <TargetConditionals.h> +#include <AvailabilityMacros.h> +#include <TargetConditionals.h> #endif -#include <iostream> // NOLINT -#include <locale> -#include <memory> -#include <string> // NOLINT -#include <tuple> -#include <vector> // NOLINT - #include "gtest/internal/custom/gtest-port.h" #include "gtest/internal/gtest-port-arch.h" +#if GTEST_HAS_ABSL +#include "absl/flags/declare.h" +#include "absl/flags/flag.h" +#include "absl/flags/reflection.h" +#endif + #if !defined(GTEST_DEV_EMAIL_) -# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" -# define GTEST_FLAG_PREFIX_ "gtest_" -# define GTEST_FLAG_PREFIX_DASH_ "gtest-" -# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" -# define GTEST_NAME_ "Google Test" -# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/" +#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" +#define GTEST_FLAG_PREFIX_ "gtest_" +#define GTEST_FLAG_PREFIX_DASH_ "gtest-" +#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" +#define GTEST_NAME_ "Google Test" +#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/" #endif // !defined(GTEST_DEV_EMAIL_) #if !defined(GTEST_INIT_GOOGLE_TEST_NAME_) -# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest" +#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest" #endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_) // Determines the version of gcc that is used to 
compile this. #ifdef __GNUC__ // 40302 means version 4.3.2. -# define GTEST_GCC_VER_ \ - (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) +#define GTEST_GCC_VER_ \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) #endif // __GNUC__ // Macros for disabling Microsoft Visual C++ warnings. @@ -313,41 +319,37 @@ // /* code that triggers warnings C4800 and C4385 */ // GTEST_DISABLE_MSC_WARNINGS_POP_() #if defined(_MSC_VER) -# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ - __pragma(warning(push)) \ - __pragma(warning(disable: warnings)) -# define GTEST_DISABLE_MSC_WARNINGS_POP_() \ - __pragma(warning(pop)) +#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ + __pragma(warning(push)) __pragma(warning(disable : warnings)) +#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop)) #else // Not all compilers are MSVC -# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) -# define GTEST_DISABLE_MSC_WARNINGS_POP_() +#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) +#define GTEST_DISABLE_MSC_WARNINGS_POP_() #endif // Clang on Windows does not understand MSVC's pragma warning. // We need clang-specific way to disable function deprecation warning. 
#ifdef __clang__ -# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ - _Pragma("clang diagnostic push") \ - _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \ - _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"") -#define GTEST_DISABLE_MSC_DEPRECATED_POP_() \ - _Pragma("clang diagnostic pop") +#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"") +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop") #else -# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ - GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) -# define GTEST_DISABLE_MSC_DEPRECATED_POP_() \ - GTEST_DISABLE_MSC_WARNINGS_POP_() +#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_() #endif // Brings in definitions for functions used in the testing::internal::posix // namespace (read, write, close, chdir, isatty, stat). We do not currently // use them on Windows Mobile. #if GTEST_OS_WINDOWS -# if !GTEST_OS_WINDOWS_MOBILE -# include <direct.h> -# include <io.h> -# endif +#if !GTEST_OS_WINDOWS_MOBILE +#include <direct.h> +#include <io.h> +#endif // In order to avoid having to include <windows.h>, use forward declaration #if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR) // MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two @@ -367,68 +369,55 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // This assumes that non-Windows OSes provide unistd.h. For OSes where this // is not the case, we need to include headers that provide the functions // mentioned above. 
-# include <unistd.h> -# include <strings.h> +#include <strings.h> +#include <unistd.h> #endif // GTEST_OS_WINDOWS #if GTEST_OS_LINUX_ANDROID // Used to define __ANDROID_API__ matching the target NDK API level. -# include <android/api-level.h> // NOLINT +#include <android/api-level.h> // NOLINT #endif // Defines this to true if and only if Google Test can use POSIX regular // expressions. #ifndef GTEST_HAS_POSIX_RE -# if GTEST_OS_LINUX_ANDROID +#if GTEST_OS_LINUX_ANDROID // On Android, <regex.h> is only available starting with Gingerbread. -# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) -# else +#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) +#else #define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA) -# endif +#endif #endif -#if GTEST_USES_PCRE -// The appropriate headers have already been included. - +// Select the regular expression implementation. +#if GTEST_HAS_ABSL +// When using Abseil, RE2 is required. +#include "absl/strings/string_view.h" +#include "re2/re2.h" +#define GTEST_USES_RE2 1 #elif GTEST_HAS_POSIX_RE - -// On some platforms, <regex.h> needs someone to define size_t, and -// won't compile otherwise. We can #include it here as we already -// included <stdlib.h>, which is guaranteed to define size_t through -// <stddef.h>. -# include <regex.h> // NOLINT - -# define GTEST_USES_POSIX_RE 1 - -#elif GTEST_OS_WINDOWS - -// <regex.h> is not available on Windows. Use our own simple regex -// implementation instead. -# define GTEST_USES_SIMPLE_RE 1 - +#include <regex.h> // NOLINT +#define GTEST_USES_POSIX_RE 1 #else - -// <regex.h> may not be available on this platform. Use our own -// simple regex implementation instead. -# define GTEST_USES_SIMPLE_RE 1 - -#endif // GTEST_USES_PCRE +// Use our own simple regex implementation. +#define GTEST_USES_SIMPLE_RE 1 +#endif #ifndef GTEST_HAS_EXCEPTIONS // The user didn't tell us whether exceptions are enabled, so we need // to figure it out. 
-# if defined(_MSC_VER) && defined(_CPPUNWIND) +#if defined(_MSC_VER) && defined(_CPPUNWIND) // MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__BORLANDC__) +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__BORLANDC__) // C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS // macro to enable exceptions, so we'll do the same. // Assumes that exceptions are enabled by default. -# ifndef _HAS_EXCEPTIONS -# define _HAS_EXCEPTIONS 1 -# endif // _HAS_EXCEPTIONS -# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS -# elif defined(__clang__) +#ifndef _HAS_EXCEPTIONS +#define _HAS_EXCEPTIONS 1 +#endif // _HAS_EXCEPTIONS +#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS +#elif defined(__clang__) // clang defines __EXCEPTIONS if and only if exceptions are enabled before clang // 220714, but if and only if cleanups are enabled after that. In Obj-C++ files, // there can be cleanups for ObjC exceptions which also need cleanups, even if @@ -437,27 +426,27 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // cleanups prior to that. To reliably check for C++ exception availability with // clang, check for // __EXCEPTIONS && __has_feature(cxx_exceptions). -# define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions)) -# elif defined(__GNUC__) && __EXCEPTIONS +#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions)) +#elif defined(__GNUC__) && __EXCEPTIONS // gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__SUNPRO_CC) +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__SUNPRO_CC) // Sun Pro CC supports exceptions. However, there is no compile-time way of // detecting whether they are enabled or not. Therefore, we assume that // they are enabled unless the user tells us otherwise. 
-# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__IBMCPP__) && __EXCEPTIONS +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__IBMCPP__) && __EXCEPTIONS // xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__HP_aCC) +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__HP_aCC) // Exception handling is in effect by default in HP aCC compiler. It has to // be turned of by +noeh compiler option if desired. -# define GTEST_HAS_EXCEPTIONS 1 -# else +#define GTEST_HAS_EXCEPTIONS 1 +#else // For other compilers, we assume exceptions are disabled to be // conservative. -# define GTEST_HAS_EXCEPTIONS 0 -# endif // defined(_MSC_VER) || defined(__BORLANDC__) +#define GTEST_HAS_EXCEPTIONS 0 +#endif // defined(_MSC_VER) || defined(__BORLANDC__) #endif // GTEST_HAS_EXCEPTIONS #ifndef GTEST_HAS_STD_WSTRING @@ -477,63 +466,62 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // The user didn't tell us whether RTTI is enabled, so we need to // figure it out. -# ifdef _MSC_VER +#ifdef _MSC_VER #ifdef _CPPRTTI // MSVC defines this macro if and only if RTTI is enabled. -# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif +#define GTEST_HAS_RTTI 1 +#else +#define GTEST_HAS_RTTI 0 +#endif // Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI is // enabled. -# elif defined(__GNUC__) +#elif defined(__GNUC__) -# ifdef __GXX_RTTI +#ifdef __GXX_RTTI // When building against STLport with the Android NDK and with // -frtti -fno-exceptions, the build fails at link time with undefined // references to __cxa_bad_typeid. Note sure if STL or toolchain bug, // so disable RTTI when detected. 
-# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \ - !defined(__EXCEPTIONS) -# define GTEST_HAS_RTTI 0 -# else -# define GTEST_HAS_RTTI 1 -# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS -# else -# define GTEST_HAS_RTTI 0 -# endif // __GXX_RTTI +#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS) +#define GTEST_HAS_RTTI 0 +#else +#define GTEST_HAS_RTTI 1 +#endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS +#else +#define GTEST_HAS_RTTI 0 +#endif // __GXX_RTTI // Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends // using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the // first version with C++ support. -# elif defined(__clang__) +#elif defined(__clang__) -# define GTEST_HAS_RTTI __has_feature(cxx_rtti) +#define GTEST_HAS_RTTI __has_feature(cxx_rtti) // Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if // both the typeid and dynamic_cast features are present. -# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) +#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) -# ifdef __RTTI_ALL__ -# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif +#ifdef __RTTI_ALL__ +#define GTEST_HAS_RTTI 1 +#else +#define GTEST_HAS_RTTI 0 +#endif -# else +#else // For all other compilers, we assume RTTI is enabled. -# define GTEST_HAS_RTTI 1 +#define GTEST_HAS_RTTI 1 -# endif // _MSC_VER +#endif // _MSC_VER #endif // GTEST_HAS_RTTI // It's this header's responsibility to #include <typeinfo> when RTTI // is enabled. #if GTEST_HAS_RTTI -# include <typeinfo> +#include <typeinfo> #endif // Determines whether Google Test can use the pthreads library. 
@@ -547,16 +535,16 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \ GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD || \ - GTEST_OS_HAIKU) + GTEST_OS_HAIKU || GTEST_OS_GNU_HURD) #endif // GTEST_HAS_PTHREAD #if GTEST_HAS_PTHREAD // gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is // true. -# include <pthread.h> // NOLINT +#include <pthread.h> // NOLINT // For timespec and nanosleep, used below. -# include <time.h> // NOLINT +#include <time.h> // NOLINT #endif // Determines whether clone(2) is supported. @@ -566,24 +554,23 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; #ifndef GTEST_HAS_CLONE // The user didn't tell us, so we need to figure it out. -# if GTEST_OS_LINUX && !defined(__ia64__) -# if GTEST_OS_LINUX_ANDROID +#if GTEST_OS_LINUX && !defined(__ia64__) +#if GTEST_OS_LINUX_ANDROID // On Android, clone() became available at different API levels for each 32-bit // architecture. 
-# if defined(__LP64__) || \ - (defined(__arm__) && __ANDROID_API__ >= 9) || \ - (defined(__mips__) && __ANDROID_API__ >= 12) || \ - (defined(__i386__) && __ANDROID_API__ >= 17) -# define GTEST_HAS_CLONE 1 -# else -# define GTEST_HAS_CLONE 0 -# endif -# else -# define GTEST_HAS_CLONE 1 -# endif -# else -# define GTEST_HAS_CLONE 0 -# endif // GTEST_OS_LINUX && !defined(__ia64__) +#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \ + (defined(__mips__) && __ANDROID_API__ >= 12) || \ + (defined(__i386__) && __ANDROID_API__ >= 17) +#define GTEST_HAS_CLONE 1 +#else +#define GTEST_HAS_CLONE 0 +#endif +#else +#define GTEST_HAS_CLONE 1 +#endif +#else +#define GTEST_HAS_CLONE 0 +#endif // GTEST_OS_LINUX && !defined(__ia64__) #endif // GTEST_HAS_CLONE @@ -594,10 +581,10 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // platforms except known mobile ones. #if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA -# define GTEST_HAS_STREAM_REDIRECTION 0 -# else -# define GTEST_HAS_STREAM_REDIRECTION 1 -# endif // !GTEST_OS_WINDOWS_MOBILE +#define GTEST_HAS_STREAM_REDIRECTION 0 +#else +#define GTEST_HAS_STREAM_REDIRECTION 1 +#endif // !GTEST_OS_WINDOWS_MOBILE #endif // GTEST_HAS_STREAM_REDIRECTION // Determines whether to support death tests. @@ -607,8 +594,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW || \ GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \ GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ - GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU) -# define GTEST_HAS_DEATH_TEST 1 + GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU || \ + GTEST_OS_GNU_HURD) +#define GTEST_HAS_DEATH_TEST 1 #endif // Determines whether to support type-driven tests. 
@@ -617,8 +605,8 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // Sun Pro CC, IBM Visual Age, and HP aCC support. #if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \ defined(__IBMCPP__) || defined(__HP_aCC) -# define GTEST_HAS_TYPED_TEST 1 -# define GTEST_HAS_TYPED_TEST_P 1 +#define GTEST_HAS_TYPED_TEST 1 +#define GTEST_HAS_TYPED_TEST_P 1 #endif // Determines whether the system compiler uses UTF-16 for encoding wide strings. @@ -627,8 +615,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // Determines whether test results can be streamed to a socket. #if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \ - GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD -# define GTEST_CAN_STREAM_RESULTS_ 1 + GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD || \ + GTEST_OS_GNU_HURD +#define GTEST_CAN_STREAM_RESULTS_ 1 #endif // Defines some utility macros. @@ -642,9 +631,12 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // // The "switch (0) case 0:" idiom is used to suppress this. #ifdef __INTEL_COMPILER -# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ +#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ #else -# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT +#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + switch (0) \ + case 0: \ + default: // NOLINT #endif // Use this annotation at the end of a struct/class definition to @@ -659,55 +651,32 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // Also use it after a variable or parameter declaration to tell the // compiler the variable/parameter does not have to be used. 
#if defined(__GNUC__) && !defined(COMPILER_ICC) -# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) +#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused)) #elif defined(__clang__) -# if __has_attribute(unused) -# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) -# endif +#if __has_attribute(unused) +#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused)) +#endif #endif #ifndef GTEST_ATTRIBUTE_UNUSED_ -# define GTEST_ATTRIBUTE_UNUSED_ +#define GTEST_ATTRIBUTE_UNUSED_ #endif // Use this annotation before a function that takes a printf format string. #if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC) -# if defined(__MINGW_PRINTF_FORMAT) +#if defined(__MINGW_PRINTF_FORMAT) // MinGW has two different printf implementations. Ensure the format macro // matches the selected implementation. See // https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/. -# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ - __attribute__((__format__(__MINGW_PRINTF_FORMAT, string_index, \ - first_to_check))) -# else -# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ - __attribute__((__format__(__printf__, string_index, first_to_check))) -# endif +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__(( \ + __format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check))) #else -# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#else +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) #endif - - -// A macro to disallow copy operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_ASSIGN_(type) \ - type& operator=(type const &) = delete - -// A macro to disallow copy constructor and operator= -// This should be used in the private: declarations for a class. 
-#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \ - type(type const&) = delete; \ - type& operator=(type const&) = delete - -// A macro to disallow move operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_MOVE_ASSIGN_(type) \ - type& operator=(type &&) noexcept = delete - -// A macro to disallow move constructor and operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_MOVE_AND_ASSIGN_(type) \ - type(type&&) noexcept = delete; \ - type& operator=(type&&) noexcept = delete // Tell the compiler to warn about unused return values for functions declared // with this macro. The macro should be used on function declarations @@ -715,9 +684,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // // Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; #if defined(__GNUC__) && !defined(COMPILER_ICC) -# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result)) +#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result)) #else -# define GTEST_MUST_USE_RESULT_ +#define GTEST_MUST_USE_RESULT_ #endif // __GNUC__ && !COMPILER_ICC // MS C++ compiler emits warning when a conditional expression is compile time @@ -728,10 +697,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // while (true) { // GTEST_INTENTIONAL_CONST_COND_POP_() // } -# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \ - GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127) -# define GTEST_INTENTIONAL_CONST_COND_POP_() \ - GTEST_DISABLE_MSC_WARNINGS_POP_() +#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127) +#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_() // Determine whether the compiler supports Microsoft's Structured Exception // Handling. 
This is supported by several Windows compilers but generally @@ -739,13 +707,13 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; #ifndef GTEST_HAS_SEH // The user didn't tell us, so we need to figure it out. -# if defined(_MSC_VER) || defined(__BORLANDC__) +#if defined(_MSC_VER) || defined(__BORLANDC__) // These two compilers are known to support SEH. -# define GTEST_HAS_SEH 1 -# else +#define GTEST_HAS_SEH 1 +#else // Assume no SEH. -# define GTEST_HAS_SEH 0 -# endif +#define GTEST_HAS_SEH 0 +#endif #endif // GTEST_HAS_SEH @@ -758,94 +726,112 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; #endif // GTEST_IS_THREADSAFE +#if GTEST_IS_THREADSAFE +// Some platforms don't support including these threading related headers. +#include <condition_variable> // NOLINT +#include <mutex> // NOLINT +#endif // GTEST_IS_THREADSAFE + // GTEST_API_ qualifies all symbols that must be exported. The definitions below // are guarded by #ifndef to give embedders a chance to define GTEST_API_ in // gtest/internal/custom/gtest-port.h #ifndef GTEST_API_ #ifdef _MSC_VER -# if GTEST_LINKED_AS_SHARED_LIBRARY -# define GTEST_API_ __declspec(dllimport) -# elif GTEST_CREATE_SHARED_LIBRARY -# define GTEST_API_ __declspec(dllexport) -# endif +#if GTEST_LINKED_AS_SHARED_LIBRARY +#define GTEST_API_ __declspec(dllimport) +#elif GTEST_CREATE_SHARED_LIBRARY +#define GTEST_API_ __declspec(dllexport) +#endif #elif __GNUC__ >= 4 || defined(__clang__) -# define GTEST_API_ __attribute__((visibility ("default"))) +#define GTEST_API_ __attribute__((visibility("default"))) #endif // _MSC_VER #endif // GTEST_API_ #ifndef GTEST_API_ -# define GTEST_API_ +#define GTEST_API_ #endif // GTEST_API_ #ifndef GTEST_DEFAULT_DEATH_TEST_STYLE -# define GTEST_DEFAULT_DEATH_TEST_STYLE "fast" +#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast" #endif // GTEST_DEFAULT_DEATH_TEST_STYLE #ifdef __GNUC__ // Ask the compiler to never inline a given function. 
-# define GTEST_NO_INLINE_ __attribute__((noinline)) +#define GTEST_NO_INLINE_ __attribute__((noinline)) #else -# define GTEST_NO_INLINE_ +#define GTEST_NO_INLINE_ +#endif + +#if defined(__clang__) +// Nested ifs to avoid triggering MSVC warning. +#if __has_attribute(disable_tail_calls) +// Ask the compiler not to perform tail call optimization inside +// the marked function. +#define GTEST_NO_TAIL_CALL_ __attribute__((disable_tail_calls)) +#endif +#elif __GNUC__ +#define GTEST_NO_TAIL_CALL_ \ + __attribute__((optimize("no-optimize-sibling-calls"))) +#else +#define GTEST_NO_TAIL_CALL_ #endif // _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. #if !defined(GTEST_HAS_CXXABI_H_) -# if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) -# define GTEST_HAS_CXXABI_H_ 1 -# else -# define GTEST_HAS_CXXABI_H_ 0 -# endif +#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) +#define GTEST_HAS_CXXABI_H_ 1 +#else +#define GTEST_HAS_CXXABI_H_ 0 +#endif #endif // A function level attribute to disable checking for use of uninitialized // memory when built with MemorySanitizer. #if defined(__clang__) -# if __has_feature(memory_sanitizer) -# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \ - __attribute__((no_sanitize_memory)) -# else -# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ -# endif // __has_feature(memory_sanitizer) +#if __has_feature(memory_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#endif // __has_feature(memory_sanitizer) #else -# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ #endif // __clang__ // A function level attribute to disable AddressSanitizer instrumentation. 
#if defined(__clang__) -# if __has_feature(address_sanitizer) -# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \ - __attribute__((no_sanitize_address)) -# else -# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ -# endif // __has_feature(address_sanitizer) +#if __has_feature(address_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \ + __attribute__((no_sanitize_address)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#endif // __has_feature(address_sanitizer) #else -# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ #endif // __clang__ // A function level attribute to disable HWAddressSanitizer instrumentation. #if defined(__clang__) -# if __has_feature(hwaddress_sanitizer) -# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \ - __attribute__((no_sanitize("hwaddress"))) -# else -# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ -# endif // __has_feature(hwaddress_sanitizer) +#if __has_feature(hwaddress_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \ + __attribute__((no_sanitize("hwaddress"))) #else -# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#endif // __has_feature(hwaddress_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ #endif // __clang__ // A function level attribute to disable ThreadSanitizer instrumentation. 
#if defined(__clang__) -# if __has_feature(thread_sanitizer) -# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \ - __attribute__((no_sanitize_thread)) -# else -# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ -# endif // __has_feature(thread_sanitizer) +#if __has_feature(thread_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#endif // __has_feature(thread_sanitizer) #else -# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ #endif // __clang__ namespace testing { @@ -867,25 +853,37 @@ namespace internal { // Secret object, which is what we want. class Secret; -// The GTEST_COMPILE_ASSERT_ is a legacy macro used to verify that a compile -// time expression is true (in new code, use static_assert instead). For -// example, you could use it to verify the size of a static array: -// -// GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES, -// names_incorrect_size); -// -// The second argument to the macro must be a valid C++ identifier. If the -// expression is false, compiler will issue an error containing this identifier. -#define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg) - // A helper for suppressing warnings on constant condition. It just // returns 'condition'. GTEST_API_ bool IsTrue(bool condition); // Defines RE. -#if GTEST_USES_PCRE -// if used, PCRE is injected by custom/gtest-port.h +#if GTEST_USES_RE2 + +// This is almost `using RE = ::RE2`, except it is copy-constructible, and it +// needs to disambiguate the `std::string`, `absl::string_view`, and `const +// char*` constructors. 
+class GTEST_API_ RE { + public: + RE(absl::string_view regex) : regex_(regex) {} // NOLINT + RE(const char* regex) : RE(absl::string_view(regex)) {} // NOLINT + RE(const std::string& regex) : RE(absl::string_view(regex)) {} // NOLINT + RE(const RE& other) : RE(other.pattern()) {} + + const std::string& pattern() const { return regex_.pattern(); } + + static bool FullMatch(absl::string_view str, const RE& re) { + return RE2::FullMatch(str, re.regex_); + } + static bool PartialMatch(absl::string_view str, const RE& re) { + return RE2::PartialMatch(str, re.regex_); + } + + private: + RE2 regex_; +}; + #elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE // A simple C++ wrapper for <regex.h>. It uses the POSIX Extended @@ -924,19 +922,19 @@ class GTEST_API_ RE { const char* pattern_; bool is_valid_; -# if GTEST_USES_POSIX_RE +#if GTEST_USES_POSIX_RE regex_t full_regex_; // For FullMatch(). regex_t partial_regex_; // For PartialMatch(). -# else // GTEST_USES_SIMPLE_RE +#else // GTEST_USES_SIMPLE_RE const char* full_pattern_; // For FullMatch(); -# endif +#endif }; -#endif // GTEST_USES_PCRE +#endif // ::testing::internal::RE implementation // Formats a source file path and a line number as they would appear // in an error message from the compiler used to compile this code. @@ -954,12 +952,7 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, // LogToStderr() - directs all log messages to stderr. // FlushInfoLog() - flushes informational log messages. 
-enum GTestLogSeverity { - GTEST_INFO, - GTEST_WARNING, - GTEST_ERROR, - GTEST_FATAL -}; +enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL }; // Formats log entry severity, provides a stream object for streaming the // log message, and terminates the message with a newline when going out of @@ -976,14 +969,16 @@ class GTEST_API_ GTestLog { private: const GTestLogSeverity severity_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog); + GTestLog(const GTestLog&) = delete; + GTestLog& operator=(const GTestLog&) = delete; }; #if !defined(GTEST_LOG_) -# define GTEST_LOG_(severity) \ - ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ - __FILE__, __LINE__).GetStream() +#define GTEST_LOG_(severity) \ + ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ + __FILE__, __LINE__) \ + .GetStream() inline void LogToStderr() {} inline void FlushInfoLog() { fflush(nullptr); } @@ -995,7 +990,7 @@ inline void FlushInfoLog() { fflush(nullptr); } // // GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition // is not satisfied. -// Synopsys: +// Synopsis: // GTEST_CHECK_(boolean_condition); // or // GTEST_CHECK_(boolean_condition) << "Additional message"; @@ -1005,12 +1000,12 @@ inline void FlushInfoLog() { fflush(nullptr); } // condition itself, plus additional message streamed into it, if any, // and then it aborts the program. It aborts the program irrespective of // whether it is built in the debug mode or not. -# define GTEST_CHECK_(condition) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::IsTrue(condition)) \ - ; \ - else \ - GTEST_LOG_(FATAL) << "Condition " #condition " failed. " +#define GTEST_CHECK_(condition) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::IsTrue(condition)) \ + ; \ + else \ + GTEST_LOG_(FATAL) << "Condition " #condition " failed. 
" #endif // !defined(GTEST_CHECK_) // An all-mode assert to verify that the given POSIX-style function @@ -1019,9 +1014,8 @@ inline void FlushInfoLog() { fflush(nullptr); } // in {} if you need to use it as the only statement in an 'if' // branch. #define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \ - if (const int gtest_error = (posix_call)) \ - GTEST_LOG_(FATAL) << #posix_call << "failed with error " \ - << gtest_error + if (const int gtest_error = (posix_call)) \ + GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error // Transforms "T" into "const T&" according to standard reference collapsing // rules (this is only needed as a backport for C++98 compilers that do not @@ -1035,9 +1029,13 @@ inline void FlushInfoLog() { fflush(nullptr); } // Note that the non-const reference will not have "const" added. This is // standard, and necessary so that "T" can always bind to "const T&". template <typename T> -struct ConstRef { typedef const T& type; }; +struct ConstRef { + typedef const T& type; +}; template <typename T> -struct ConstRef<T&> { typedef T& type; }; +struct ConstRef<T&> { + typedef T& type; +}; // The argument T must depend on some template parameters. #define GTEST_REFERENCE_TO_CONST_(T) \ @@ -1050,7 +1048,7 @@ struct ConstRef<T&> { typedef T& type; }; // const Foo*). When you use ImplicitCast_, the compiler checks that // the cast is safe. Such explicit ImplicitCast_s are necessary in // surprisingly many situations where C++ demands an exact type match -// instead of an argument type convertable to a target type. +// instead of an argument type convertible to a target type. // // The syntax for using ImplicitCast_ is the same as for static_cast: // @@ -1063,8 +1061,10 @@ struct ConstRef<T&> { typedef T& type; }; // This relatively ugly name is intentional. It prevents clashes with // similar functions users may have (e.g., implicit_cast). The internal // namespace alone is not enough because the function can be found by ADL. 
-template<typename To> -inline To ImplicitCast_(To x) { return x; } +template <typename To> +inline To ImplicitCast_(To x) { + return x; +} // When you upcast (that is, cast a pointer from type Foo to type // SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts @@ -1087,17 +1087,17 @@ inline To ImplicitCast_(To x) { return x; } // This relatively ugly name is intentional. It prevents clashes with // similar functions users may have (e.g., down_cast). The internal // namespace alone is not enough because the function can be found by ADL. -template<typename To, typename From> // use like this: DownCast_<T*>(foo); -inline To DownCast_(From* f) { // so we only accept pointers +template <typename To, typename From> // use like this: DownCast_<T*>(foo); +inline To DownCast_(From* f) { // so we only accept pointers // Ensures that To is a sub-type of From *. This test is here only // for compile-time type checking, and has no overhead in an // optimized build at run-time, as it will be optimized away // completely. GTEST_INTENTIONAL_CONST_COND_PUSH_() if (false) { - GTEST_INTENTIONAL_CONST_COND_POP_() - const To to = nullptr; - ::testing::internal::ImplicitCast_<From*>(to); + GTEST_INTENTIONAL_CONST_COND_POP_() + const To to = nullptr; + ::testing::internal::ImplicitCast_<From*>(to); } #if GTEST_HAS_RTTI @@ -1162,71 +1162,8 @@ void ClearInjectableArgvs(); // Defines synchronization primitives. #if GTEST_IS_THREADSAFE -# if GTEST_HAS_PTHREAD -// Sleeps for (roughly) n milliseconds. This function is only for testing -// Google Test's own constructs. Don't use it in user tests, either -// directly or indirectly. -inline void SleepMilliseconds(int n) { - const timespec time = { - 0, // 0 seconds. - n * 1000L * 1000L, // And n ms. - }; - nanosleep(&time, nullptr); -} -# endif // GTEST_HAS_PTHREAD - -# if GTEST_HAS_NOTIFICATION_ -// Notification has already been imported into the namespace. -// Nothing to do here. 
- -# elif GTEST_HAS_PTHREAD -// Allows a controller thread to pause execution of newly created -// threads until notified. Instances of this class must be created -// and destroyed in the controller thread. -// -// This class is only for testing Google Test's own constructs. Do not -// use it in user tests, either directly or indirectly. -class Notification { - public: - Notification() : notified_(false) { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr)); - } - ~Notification() { - pthread_mutex_destroy(&mutex_); - } - - // Notifies all threads created with this notification to start. Must - // be called from the controller thread. - void Notify() { - pthread_mutex_lock(&mutex_); - notified_ = true; - pthread_mutex_unlock(&mutex_); - } - - // Blocks until the controller thread notifies. Must be called from a test - // thread. - void WaitForNotification() { - for (;;) { - pthread_mutex_lock(&mutex_); - const bool notified = notified_; - pthread_mutex_unlock(&mutex_); - if (notified) - break; - SleepMilliseconds(10); - } - } - - private: - pthread_mutex_t mutex_; - bool notified_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); -}; - -# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT - -GTEST_API_ void SleepMilliseconds(int n); +#if GTEST_OS_WINDOWS // Provides leak-safe Windows kernel handle ownership. // Used in death tests and in threading support. class GTEST_API_ AutoHandle { @@ -1253,8 +1190,18 @@ class GTEST_API_ AutoHandle { Handle handle_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle); + AutoHandle(const AutoHandle&) = delete; + AutoHandle& operator=(const AutoHandle&) = delete; }; +#endif + +#if GTEST_HAS_NOTIFICATION_ +// Notification has already been imported into the namespace. +// Nothing to do here. 
+ +#else +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) // Allows a controller thread to pause execution of newly created // threads until notified. Instances of this class must be created @@ -1262,23 +1209,40 @@ class GTEST_API_ AutoHandle { // // This class is only for testing Google Test's own constructs. Do not // use it in user tests, either directly or indirectly. +// TODO(b/203539622): Replace unconditionally with absl::Notification. class GTEST_API_ Notification { public: - Notification(); - void Notify(); - void WaitForNotification(); + Notification() : notified_(false) {} + Notification(const Notification&) = delete; + Notification& operator=(const Notification&) = delete; - private: - AutoHandle event_; + // Notifies all threads created with this notification to start. Must + // be called from the controller thread. + void Notify() { + std::lock_guard<std::mutex> lock(mu_); + notified_ = true; + cv_.notify_all(); + } - GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); + // Blocks until the controller thread notifies. Must be called from a test + // thread. + void WaitForNotification() { + std::unique_lock<std::mutex> lock(mu_); + cv_.wait(lock, [this]() { return notified_; }); + } + + private: + std::mutex mu_; + std::condition_variable cv_; + bool notified_; }; -# endif // GTEST_HAS_NOTIFICATION_ +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 +#endif // GTEST_HAS_NOTIFICATION_ // On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD // defined, but we don't want to use MinGW's pthreads implementation, which // has conformance problems with some versions of the POSIX standard. -# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW +#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW // As a C-function, ThreadFuncWithCLinkage cannot be templated itself. 
// Consequently, it cannot select a correct instantiation of ThreadWithParam @@ -1354,16 +1318,17 @@ class ThreadWithParam : public ThreadWithParamBase { // finished. pthread_t thread_; // The native thread object. - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); + ThreadWithParam(const ThreadWithParam&) = delete; + ThreadWithParam& operator=(const ThreadWithParam&) = delete; }; -# endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD || - // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +#endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD || + // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ -# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ // Mutex and ThreadLocal have already been imported into the namespace. // Nothing to do here. -# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT +#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT // Mutex implements mutex on Windows platforms. It is used in conjunction // with class MutexLock: @@ -1417,14 +1382,15 @@ class GTEST_API_ Mutex { long critical_section_init_phase_; // NOLINT GTEST_CRITICAL_SECTION* critical_section_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; }; -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ - extern ::testing::internal::Mutex mutex +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex -# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ - ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex) +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex) // We cannot name this class MutexLock because the ctor declaration would // conflict with a macro named MutexLock, which is defined on some @@ -1433,15 +1399,15 @@ class GTEST_API_ Mutex { // "MutexLock l(&mu)". Hence the typedef trick below. 
class GTestMutexLock { public: - explicit GTestMutexLock(Mutex* mutex) - : mutex_(mutex) { mutex_->Lock(); } + explicit GTestMutexLock(Mutex* mutex) : mutex_(mutex) { mutex_->Lock(); } ~GTestMutexLock() { mutex_->Unlock(); } private: Mutex* const mutex_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); + GTestMutexLock(const GTestMutexLock&) = delete; + GTestMutexLock& operator=(const GTestMutexLock&) = delete; }; typedef GTestMutexLock MutexLock; @@ -1468,7 +1434,8 @@ class ThreadLocalBase { virtual ~ThreadLocalBase() {} private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase); + ThreadLocalBase(const ThreadLocalBase&) = delete; + ThreadLocalBase& operator=(const ThreadLocalBase&) = delete; }; // Maps a thread to a set of ThreadLocals that have values instantiated on that @@ -1497,7 +1464,7 @@ class GTEST_API_ ThreadWithParamBase { virtual void Run() = 0; }; - ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start); + ThreadWithParamBase(Runnable* runnable, Notification* thread_can_start); virtual ~ThreadWithParamBase(); private: @@ -1511,30 +1478,26 @@ class ThreadWithParam : public ThreadWithParamBase { typedef void UserThreadFunc(T); ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start) - : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) { - } + : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {} virtual ~ThreadWithParam() {} private: class RunnableImpl : public Runnable { public: - RunnableImpl(UserThreadFunc* func, T param) - : func_(func), - param_(param) { - } + RunnableImpl(UserThreadFunc* func, T param) : func_(func), param_(param) {} virtual ~RunnableImpl() {} - virtual void Run() { - func_(param_); - } + virtual void Run() { func_(param_); } private: UserThreadFunc* const func_; const T param_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl); + RunnableImpl(const RunnableImpl&) = delete; + RunnableImpl& operator=(const RunnableImpl&) = delete; }; - 
GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); + ThreadWithParam(const ThreadWithParam&) = delete; + ThreadWithParam& operator=(const ThreadWithParam&) = delete; }; // Implements thread-local storage on Windows systems. @@ -1571,7 +1534,7 @@ class ThreadLocal : public ThreadLocalBase { explicit ThreadLocal(const T& value) : default_factory_(new InstanceValueHolderFactory(value)) {} - ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); } + ~ThreadLocal() override { ThreadLocalRegistry::OnThreadLocalDestroyed(this); } T* pointer() { return GetOrCreateValue(); } const T* pointer() const { return GetOrCreateValue(); } @@ -1590,16 +1553,17 @@ class ThreadLocal : public ThreadLocalBase { private: T value_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + ValueHolder(const ValueHolder&) = delete; + ValueHolder& operator=(const ValueHolder&) = delete; }; - T* GetOrCreateValue() const { return static_cast<ValueHolder*>( - ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer(); + ThreadLocalRegistry::GetValueOnCurrentThread(this)) + ->pointer(); } - virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const { + ThreadLocalValueHolderBase* NewValueForCurrentThread() const override { return default_factory_->MakeNewHolder(); } @@ -1610,7 +1574,8 @@ class ThreadLocal : public ThreadLocalBase { virtual ValueHolder* MakeNewHolder() const = 0; private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + ValueHolderFactory(const ValueHolderFactory&) = delete; + ValueHolderFactory& operator=(const ValueHolderFactory&) = delete; }; class DefaultValueHolderFactory : public ValueHolderFactory { @@ -1619,7 +1584,9 @@ class ThreadLocal : public ThreadLocalBase { ValueHolder* MakeNewHolder() const override { return new ValueHolder(); } private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete; + DefaultValueHolderFactory& operator=(const 
DefaultValueHolderFactory&) = + delete; }; class InstanceValueHolderFactory : public ValueHolderFactory { @@ -1632,15 +1599,18 @@ class ThreadLocal : public ThreadLocalBase { private: const T value_; // The value for each thread. - GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete; + InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) = + delete; }; std::unique_ptr<ValueHolderFactory> default_factory_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); + ThreadLocal(const ThreadLocal&) = delete; + ThreadLocal& operator=(const ThreadLocal&) = delete; }; -# elif GTEST_HAS_PTHREAD +#elif GTEST_HAS_PTHREAD // MutexBase and Mutex implement mutex on pthreads-based platforms. class MutexBase { @@ -1687,8 +1657,8 @@ class MutexBase { }; // Forward-declares a static mutex. -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ - extern ::testing::internal::MutexBase mutex +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::MutexBase mutex // Defines and statically (i.e. at link time) initializes a static mutex. // The initialization list here does not explicitly initialize each field, @@ -1707,12 +1677,11 @@ class Mutex : public MutexBase { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr)); has_owner_ = false; } - ~Mutex() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); - } + ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); } private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; }; // We cannot name this class MutexLock because the ctor declaration would @@ -1722,15 +1691,15 @@ class Mutex : public MutexBase { // "MutexLock l(&mu)". Hence the typedef trick below. 
class GTestMutexLock { public: - explicit GTestMutexLock(MutexBase* mutex) - : mutex_(mutex) { mutex_->Lock(); } + explicit GTestMutexLock(MutexBase* mutex) : mutex_(mutex) { mutex_->Lock(); } ~GTestMutexLock() { mutex_->Unlock(); } private: MutexBase* const mutex_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); + GTestMutexLock(const GTestMutexLock&) = delete; + GTestMutexLock& operator=(const GTestMutexLock&) = delete; }; typedef GTestMutexLock MutexLock; @@ -1787,7 +1756,8 @@ class GTEST_API_ ThreadLocal { private: T value_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + ValueHolder(const ValueHolder&) = delete; + ValueHolder& operator=(const ValueHolder&) = delete; }; static pthread_key_t CreateKey() { @@ -1819,7 +1789,8 @@ class GTEST_API_ ThreadLocal { virtual ValueHolder* MakeNewHolder() const = 0; private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + ValueHolderFactory(const ValueHolderFactory&) = delete; + ValueHolderFactory& operator=(const ValueHolderFactory&) = delete; }; class DefaultValueHolderFactory : public ValueHolderFactory { @@ -1828,7 +1799,9 @@ class GTEST_API_ ThreadLocal { ValueHolder* MakeNewHolder() const override { return new ValueHolder(); } private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete; + DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) = + delete; }; class InstanceValueHolderFactory : public ValueHolderFactory { @@ -1841,17 +1814,20 @@ class GTEST_API_ ThreadLocal { private: const T value_; // The value for each thread. - GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete; + InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) = + delete; }; // A key pthreads uses for looking up per-thread values. 
const pthread_key_t key_; std::unique_ptr<ValueHolderFactory> default_factory_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); + ThreadLocal(const ThreadLocal&) = delete; + ThreadLocal& operator=(const ThreadLocal&) = delete; }; -# endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +#endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ #else // GTEST_IS_THREADSAFE @@ -1868,10 +1844,10 @@ class Mutex { void AssertHeld() const {} }; -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ extern ::testing::internal::Mutex mutex -# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex // We cannot name this class MutexLock because the ctor declaration would // conflict with a macro named MutexLock, which is defined on some @@ -1894,6 +1870,7 @@ class GTEST_API_ ThreadLocal { const T* pointer() const { return &value_; } const T& get() const { return value_; } void set(const T& value) { value_ = value; } + private: T value_; }; @@ -1905,11 +1882,11 @@ class GTEST_API_ ThreadLocal { GTEST_API_ size_t GetThreadCount(); #if GTEST_OS_WINDOWS -# define GTEST_PATH_SEP_ "\\" -# define GTEST_HAS_ALT_PATH_SEP_ 1 +#define GTEST_PATH_SEP_ "\\" +#define GTEST_HAS_ALT_PATH_SEP_ 1 #else -# define GTEST_PATH_SEP_ "/" -# define GTEST_HAS_ALT_PATH_SEP_ 0 +#define GTEST_PATH_SEP_ "/" +#define GTEST_HAS_ALT_PATH_SEP_ 0 #endif // GTEST_OS_WINDOWS // Utilities for char. 
@@ -1967,8 +1944,7 @@ inline char ToUpper(char ch) { inline std::string StripTrailingSpaces(std::string str) { std::string::iterator it = str.end(); - while (it != str.begin() && IsSpace(*--it)) - it = str.erase(it); + while (it != str.begin() && IsSpace(*--it)) it = str.erase(it); return str; } @@ -1986,36 +1962,35 @@ namespace posix { typedef struct _stat StatStruct; -# ifdef __BORLANDC__ +#ifdef __BORLANDC__ inline int DoIsATTY(int fd) { return isatty(fd); } inline int StrCaseCmp(const char* s1, const char* s2) { return stricmp(s1, s2); } inline char* StrDup(const char* src) { return strdup(src); } -# else // !__BORLANDC__ -# if GTEST_OS_WINDOWS_MOBILE +#else // !__BORLANDC__ +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \ + GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM) inline int DoIsATTY(int /* fd */) { return 0; } -# else +#else inline int DoIsATTY(int fd) { return _isatty(fd); } -# endif // GTEST_OS_WINDOWS_MOBILE +#endif // GTEST_OS_WINDOWS_MOBILE inline int StrCaseCmp(const char* s1, const char* s2) { return _stricmp(s1, s2); } inline char* StrDup(const char* src) { return _strdup(src); } -# endif // __BORLANDC__ +#endif // __BORLANDC__ -# if GTEST_OS_WINDOWS_MOBILE +#if GTEST_OS_WINDOWS_MOBILE inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); } // Stat(), RmDir(), and IsDir() are not needed on Windows CE at this // time and thus not defined there. 
-# else +#else inline int FileNo(FILE* file) { return _fileno(file); } inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); } inline int RmDir(const char* dir) { return _rmdir(dir); } -inline bool IsDir(const StatStruct& st) { - return (_S_IFDIR & st.st_mode) != 0; -} -# endif // GTEST_OS_WINDOWS_MOBILE +inline bool IsDir(const StatStruct& st) { return (_S_IFDIR & st.st_mode) != 0; } +#endif // GTEST_OS_WINDOWS_MOBILE #elif GTEST_OS_ESP8266 typedef struct stat StatStruct; @@ -2079,12 +2054,12 @@ inline FILE* FOpen(const char* path, const char* mode) { std::wstring wide_path = converter.from_bytes(path); std::wstring wide_mode = converter.from_bytes(mode); return _wfopen(wide_path.c_str(), wide_mode.c_str()); -#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW +#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW return fopen(path, mode); #endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW } #if !GTEST_OS_WINDOWS_MOBILE -inline FILE *FReopen(const char* path, const char* mode, FILE* stream) { +inline FILE* FReopen(const char* path, const char* mode, FILE* stream) { return freopen(path, mode, stream); } inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); } @@ -2136,13 +2111,13 @@ GTEST_DISABLE_MSC_DEPRECATED_POP_() // snprintf is a variadic function. #if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE // MSVC 2005 and above support variadic macros. -# define GTEST_SNPRINTF_(buffer, size, format, ...) \ - _snprintf_s(buffer, size, size, format, __VA_ARGS__) +#define GTEST_SNPRINTF_(buffer, size, format, ...) \ + _snprintf_s(buffer, size, size, format, __VA_ARGS__) #elif defined(_MSC_VER) // Windows CE does not define _snprintf_s -# define GTEST_SNPRINTF_ _snprintf +#define GTEST_SNPRINTF_ _snprintf #else -# define GTEST_SNPRINTF_ snprintf +#define GTEST_SNPRINTF_ snprintf #endif // The biggest signed integer type the compiler supports. @@ -2202,37 +2177,84 @@ using TimeInMillis = int64_t; // Represents time in milliseconds. 
// Macro for referencing flags. #if !defined(GTEST_FLAG) -# define GTEST_FLAG(name) FLAGS_gtest_##name +#define GTEST_FLAG_NAME_(name) gtest_##name +#define GTEST_FLAG(name) FLAGS_gtest_##name #endif // !defined(GTEST_FLAG) -#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_) -# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1 -#endif // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_) +// Pick a command line flags implementation. +#if GTEST_HAS_ABSL -#if !defined(GTEST_DECLARE_bool_) -# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver +// Macros for defining flags. +#define GTEST_DEFINE_bool_(name, default_val, doc) \ + ABSL_FLAG(bool, GTEST_FLAG_NAME_(name), default_val, doc) +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + ABSL_FLAG(int32_t, GTEST_FLAG_NAME_(name), default_val, doc) +#define GTEST_DEFINE_string_(name, default_val, doc) \ + ABSL_FLAG(std::string, GTEST_FLAG_NAME_(name), default_val, doc) // Macros for declaring flags. -# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) -# define GTEST_DECLARE_int32_(name) \ - GTEST_API_ extern std::int32_t GTEST_FLAG(name) -# define GTEST_DECLARE_string_(name) \ - GTEST_API_ extern ::std::string GTEST_FLAG(name) +#define GTEST_DECLARE_bool_(name) \ + ABSL_DECLARE_FLAG(bool, GTEST_FLAG_NAME_(name)) +#define GTEST_DECLARE_int32_(name) \ + ABSL_DECLARE_FLAG(int32_t, GTEST_FLAG_NAME_(name)) +#define GTEST_DECLARE_string_(name) \ + ABSL_DECLARE_FLAG(std::string, GTEST_FLAG_NAME_(name)) + +#define GTEST_FLAG_SAVER_ ::absl::FlagSaver + +#define GTEST_FLAG_GET(name) ::absl::GetFlag(GTEST_FLAG(name)) +#define GTEST_FLAG_SET(name, value) \ + (void)(::absl::SetFlag(>EST_FLAG(name), value)) +#define GTEST_USE_OWN_FLAGFILE_FLAG_ 0 + +#else // GTEST_HAS_ABSL // Macros for defining flags. 
-# define GTEST_DEFINE_bool_(name, default_val, doc) \ - GTEST_API_ bool GTEST_FLAG(name) = (default_val) -# define GTEST_DEFINE_int32_(name, default_val, doc) \ - GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val) -# define GTEST_DEFINE_string_(name, default_val, doc) \ - GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_bool_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ bool GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DEFINE_string_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") -#endif // !defined(GTEST_DECLARE_bool_) +// Macros for declaring flags. 
+#define GTEST_DECLARE_bool_(name) \ + namespace testing { \ + GTEST_API_ extern bool GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DECLARE_int32_(name) \ + namespace testing { \ + GTEST_API_ extern std::int32_t GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DECLARE_string_(name) \ + namespace testing { \ + GTEST_API_ extern ::std::string GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") + +#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver + +#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name) +#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value) +#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1 + +#endif // GTEST_HAS_ABSL // Thread annotations #if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) -# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) -# define GTEST_LOCK_EXCLUDED_(locks) +#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) +#define GTEST_LOCK_EXCLUDED_(locks) #endif // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) // Parses 'str' for a 32-bit signed integer. If successful, writes the result @@ -2308,6 +2330,7 @@ namespace testing { namespace internal { template <typename T> using Optional = ::absl::optional<T>; +inline ::absl::nullopt_t Nullopt() { return ::absl::nullopt; } } // namespace internal } // namespace testing #else @@ -2321,6 +2344,7 @@ namespace testing { namespace internal { template <typename T> using Optional = ::std::optional<T>; +inline ::std::nullopt_t Nullopt() { return ::std::nullopt; } } // namespace internal } // namespace testing // The case where absl is configured NOT to alias std::optional is not @@ -2332,7 +2356,7 @@ using Optional = ::std::optional<T>; #if GTEST_HAS_ABSL // Always use absl::string_view for Matcher<> specializations if googletest // is built with absl support. 
-# define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#define GTEST_INTERNAL_HAS_STRING_VIEW 1 #include "absl/strings/string_view.h" namespace testing { namespace internal { @@ -2340,11 +2364,11 @@ using StringView = ::absl::string_view; } // namespace internal } // namespace testing #else -# ifdef __has_include -# if __has_include(<string_view>) && __cplusplus >= 201703L +#ifdef __has_include +#if __has_include(<string_view>) && __cplusplus >= 201703L // Otherwise for C++17 and higher use std::string_view for Matcher<> // specializations. -# define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#define GTEST_INTERNAL_HAS_STRING_VIEW 1 #include <string_view> namespace testing { namespace internal { @@ -2353,8 +2377,8 @@ using StringView = ::std::string_view; } // namespace testing // The case where absl is configured NOT to alias std::string_view is not // supported. -# endif // __has_include(<string_view>) && __cplusplus >= 201703L -# endif // __has_include +#endif // __has_include(<string_view>) && __cplusplus >= 201703L +#endif // __has_include #endif // GTEST_HAS_ABSL #if GTEST_HAS_ABSL diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h index 10f774f96..cca2e1f2a 100644 --- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h +++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h @@ -26,7 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// + // The Google C++ Testing and Mocking Framework (Google Test) // // This header file declares the String class and functions used internally by @@ -36,17 +36,20 @@ // This header file is #included by gtest-internal.h. // It should not be #included by other files. 
-// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ #ifdef __BORLANDC__ // string.h is not guaranteed to provide strcpy on C++ Builder. -# include <mem.h> +#include <mem.h> #endif #include <string.h> + #include <cstdint> #include <string> @@ -123,8 +126,7 @@ class GTEST_API_ String { // Unlike strcasecmp(), this function can handle NULL argument(s). // A NULL C string is considered different to any non-NULL C string, // including the empty string. - static bool CaseInsensitiveCStringEquals(const char* lhs, - const char* rhs); + static bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs); // Compares two wide C strings, ignoring case. Returns true if and only if // they have the same content. @@ -143,8 +145,8 @@ class GTEST_API_ String { // Returns true if and only if the given string ends with the given suffix, // ignoring case. Any string is considered to end with an empty suffix. - static bool EndsWithCaseInsensitive( - const std::string& str, const std::string& suffix); + static bool EndsWithCaseInsensitive(const std::string& str, + const std::string& suffix); // Formats an int value as "%02d". static std::string FormatIntWidth2(int value); // "%02d" for width == 2 @@ -163,7 +165,7 @@ class GTEST_API_ String { private: String(); // Not meant to be instantiated. -}; // class String +}; // class String // Gets the content of the stringstream's buffer as an std::string. Each '\0' // character in the buffer is replaced with "\\0". 
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h index b87a2e2ca..6bc02a7de 100644 --- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h +++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h @@ -30,7 +30,9 @@ // Type utilities needed for implementing typed and type-parameterized // tests. -// GOOGLETEST_CM0001 DO NOT DELETE +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* #ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ #define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ @@ -39,11 +41,11 @@ // #ifdef __GNUC__ is too general here. It is possible to use gcc without using // libstdc++ (which is where cxxabi.h comes from). -# if GTEST_HAS_CXXABI_H_ -# include <cxxabi.h> -# elif defined(__HP_aCC) -# include <acxx_demangle.h> -# endif // GTEST_HASH_CXXABI_H_ +#if GTEST_HAS_CXXABI_H_ +#include <cxxabi.h> +#elif defined(__HP_aCC) +#include <acxx_demangle.h> +#endif // GTEST_HASH_CXXABI_H_ namespace testing { namespace internal { @@ -101,7 +103,9 @@ std::string GetTypeName() { // A unique type indicating an empty node struct None {}; -# define GTEST_TEMPLATE_ template <typename T> class +#define GTEST_TEMPLATE_ \ + template <typename T> \ + class // The template "selector" struct TemplateSel<Tmpl> is used to // represent Tmpl, which must be a class template with one type @@ -119,8 +123,7 @@ struct TemplateSel { }; }; -# define GTEST_BIND_(TmplSel, T) \ - TmplSel::template Bind<T>::type +#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind<T>::type template <GTEST_TEMPLATE_ Head_, GTEST_TEMPLATE_... 
Tail_> struct Templates { diff --git a/libvpx/third_party/googletest/src/src/gtest-all.cc b/libvpx/third_party/googletest/src/src/gtest-all.cc index ad292905c..2a70ed88c 100644 --- a/libvpx/third_party/googletest/src/src/gtest-all.cc +++ b/libvpx/third_party/googletest/src/src/gtest-all.cc @@ -38,7 +38,7 @@ #include "gtest/gtest.h" // The following lines pull in the real gtest *.cc files. -#include "src/gtest.cc" +#include "src/gtest-assertion-result.cc" #include "src/gtest-death-test.cc" #include "src/gtest-filepath.cc" #include "src/gtest-matchers.cc" @@ -46,3 +46,4 @@ #include "src/gtest-printers.cc" #include "src/gtest-test-part.cc" #include "src/gtest-typed-test.cc" +#include "src/gtest.cc" diff --git a/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc b/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc new file mode 100644 index 000000000..f1c0b10dc --- /dev/null +++ b/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc @@ -0,0 +1,77 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file defines the AssertionResult type. + +#include "gtest/gtest-assertion-result.h" + +#include <string> +#include <utility> + +#include "gtest/gtest-message.h" + +namespace testing { + +// AssertionResult constructors. +// Used in EXPECT_TRUE/FALSE(assertion_result). +AssertionResult::AssertionResult(const AssertionResult& other) + : success_(other.success_), + message_(other.message_.get() != nullptr + ? new ::std::string(*other.message_) + : static_cast< ::std::string*>(nullptr)) {} + +// Swaps two AssertionResults. +void AssertionResult::swap(AssertionResult& other) { + using std::swap; + swap(success_, other.success_); + swap(message_, other.message_); +} + +// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. +AssertionResult AssertionResult::operator!() const { + AssertionResult negation(!success_); + if (message_.get() != nullptr) negation << *message_; + return negation; +} + +// Makes a successful assertion result. +AssertionResult AssertionSuccess() { return AssertionResult(true); } + +// Makes a failed assertion result. 
+AssertionResult AssertionFailure() { return AssertionResult(false); } + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << message. +AssertionResult AssertionFailure(const Message& message) { + return AssertionFailure() << message; +} + +} // namespace testing diff --git a/libvpx/third_party/googletest/src/src/gtest-death-test.cc b/libvpx/third_party/googletest/src/src/gtest-death-test.cc index bf4f6331d..e6abc6278 100644 --- a/libvpx/third_party/googletest/src/src/gtest-death-test.cc +++ b/libvpx/third_party/googletest/src/src/gtest-death-test.cc @@ -35,49 +35,49 @@ #include <functional> #include <utility> -#include "gtest/internal/gtest-port.h" #include "gtest/internal/custom/gtest.h" +#include "gtest/internal/gtest-port.h" #if GTEST_HAS_DEATH_TEST -# if GTEST_OS_MAC -# include <crt_externs.h> -# endif // GTEST_OS_MAC - -# include <errno.h> -# include <fcntl.h> -# include <limits.h> - -# if GTEST_OS_LINUX -# include <signal.h> -# endif // GTEST_OS_LINUX - -# include <stdarg.h> - -# if GTEST_OS_WINDOWS -# include <windows.h> -# else -# include <sys/mman.h> -# include <sys/wait.h> -# endif // GTEST_OS_WINDOWS - -# if GTEST_OS_QNX -# include <spawn.h> -# endif // GTEST_OS_QNX - -# if GTEST_OS_FUCHSIA -# include <lib/fdio/fd.h> -# include <lib/fdio/io.h> -# include <lib/fdio/spawn.h> -# include <lib/zx/channel.h> -# include <lib/zx/port.h> -# include <lib/zx/process.h> -# include <lib/zx/socket.h> -# include <zircon/processargs.h> -# include <zircon/syscalls.h> -# include <zircon/syscalls/policy.h> -# include <zircon/syscalls/port.h> -# endif // GTEST_OS_FUCHSIA +#if GTEST_OS_MAC +#include <crt_externs.h> +#endif // GTEST_OS_MAC + +#include <errno.h> +#include <fcntl.h> +#include <limits.h> + +#if GTEST_OS_LINUX +#include <signal.h> +#endif // GTEST_OS_LINUX + +#include <stdarg.h> + +#if GTEST_OS_WINDOWS +#include <windows.h> +#else +#include <sys/mman.h> +#include <sys/wait.h> +#endif // GTEST_OS_WINDOWS + 
+#if GTEST_OS_QNX +#include <spawn.h> +#endif // GTEST_OS_QNX + +#if GTEST_OS_FUCHSIA +#include <lib/fdio/fd.h> +#include <lib/fdio/io.h> +#include <lib/fdio/spawn.h> +#include <lib/zx/channel.h> +#include <lib/zx/port.h> +#include <lib/zx/process.h> +#include <lib/zx/socket.h> +#include <zircon/processargs.h> +#include <zircon/syscalls.h> +#include <zircon/syscalls/policy.h> +#include <zircon/syscalls/port.h> +#endif // GTEST_OS_FUCHSIA #endif // GTEST_HAS_DEATH_TEST @@ -96,9 +96,12 @@ namespace testing { // used internally at Google, is "threadsafe". static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE; +} // namespace testing + GTEST_DEFINE_string_( death_test_style, - internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle), + testing::internal::StringFromGTestEnv("death_test_style", + testing::kDefaultDeathTestStyle), "Indicates how to run a death test in a forked child process: " "\"threadsafe\" (child process re-executes the test binary " "from the beginning, running only the specific death test) or " @@ -107,7 +110,7 @@ GTEST_DEFINE_string_( GTEST_DEFINE_bool_( death_test_use_fork, - internal::BoolFromGTestEnv("death_test_use_fork", false), + testing::internal::BoolFromGTestEnv("death_test_use_fork", false), "Instructs to use fork()/_exit() instead of clone() in death tests. " "Ignored and always uses fork() on POSIX systems where clone() is not " "implemented. Useful when running under valgrind or similar tools if " @@ -117,7 +120,6 @@ GTEST_DEFINE_bool_( "work in 99% of the cases. Once valgrind is fixed, this flag will " "most likely be removed."); -namespace internal { GTEST_DEFINE_string_( internal_run_death_test, "", "Indicates the file, line number, temporal index of " @@ -126,7 +128,8 @@ GTEST_DEFINE_string_( "the '|' characters. This flag is specified if and only if the " "current process is a sub-process launched for running a thread-safe " "death test. 
FOR INTERNAL USE ONLY."); -} // namespace internal + +namespace testing { #if GTEST_HAS_DEATH_TEST @@ -134,9 +137,9 @@ namespace internal { // Valid only for fast death tests. Indicates the code is running in the // child process of a fast style death test. -# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA static bool g_in_fast_death_test_child = false; -# endif +#endif // Returns a Boolean value indicating whether the caller is currently // executing in the context of the death test child process. Tools such as @@ -144,16 +147,16 @@ static bool g_in_fast_death_test_child = false; // tests. IMPORTANT: This is an internal utility. Using it may break the // implementation of death tests. User code MUST NOT use it. bool InDeathTestChild() { -# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA // On Windows and Fuchsia, death tests are thread-safe regardless of the value // of the death_test_style flag. - return !GTEST_FLAG(internal_run_death_test).empty(); + return !GTEST_FLAG_GET(internal_run_death_test).empty(); -# else +#else - if (GTEST_FLAG(death_test_style) == "threadsafe") - return !GTEST_FLAG(internal_run_death_test).empty(); + if (GTEST_FLAG_GET(death_test_style) == "threadsafe") + return !GTEST_FLAG_GET(internal_run_death_test).empty(); else return g_in_fast_death_test_child; #endif @@ -162,40 +165,38 @@ bool InDeathTestChild() { } // namespace internal // ExitedWithCode constructor. -ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) { -} +ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {} // ExitedWithCode function-call operator. 
bool ExitedWithCode::operator()(int exit_status) const { -# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA return exit_status == exit_code_; -# else +#else return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; -# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA } -# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // KilledBySignal constructor. -KilledBySignal::KilledBySignal(int signum) : signum_(signum) { -} +KilledBySignal::KilledBySignal(int signum) : signum_(signum) {} // KilledBySignal function-call operator. bool KilledBySignal::operator()(int exit_status) const { -# if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) +#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) { bool result; if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) { return result; } } -# endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) +#endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; } -# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA namespace internal { @@ -206,23 +207,23 @@ namespace internal { static std::string ExitSummary(int exit_code) { Message m; -# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA m << "Exited with exit status " << exit_code; -# else +#else if (WIFEXITED(exit_code)) { m << "Exited with exit status " << WEXITSTATUS(exit_code); } else if (WIFSIGNALED(exit_code)) { m << "Terminated by signal " << WTERMSIG(exit_code); } -# ifdef WCOREDUMP +#ifdef WCOREDUMP if (WCOREDUMP(exit_code)) { m << " (core dumped)"; } -# endif -# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +#endif +#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA return m.GetString(); } @@ -233,7 +234,7 @@ bool ExitedUnsuccessfully(int exit_status) { return !ExitedWithCode(0)(exit_status); } -# if !GTEST_OS_WINDOWS 
&& !GTEST_OS_FUCHSIA +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Generates a textual failure message when a death test finds more than // one thread running, or cannot determine the number of threads, prior // to executing the given statement. It is the responsibility of the @@ -254,7 +255,7 @@ static std::string DeathTestThreadWarning(size_t thread_count) { << " this is the last message you see before your test times out."; return msg.GetString(); } -# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Flag characters for reporting a death test that did not die. static const char kDeathTestLived = 'L'; @@ -304,14 +305,14 @@ static void DeathTestAbort(const std::string& message) { // A replacement for CHECK that calls DeathTestAbort if the assertion // fails. -# define GTEST_DEATH_TEST_CHECK_(expression) \ - do { \ - if (!::testing::internal::IsTrue(expression)) { \ - DeathTestAbort( \ - ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ - + ::testing::internal::StreamableToString(__LINE__) + ": " \ - + #expression); \ - } \ +#define GTEST_DEATH_TEST_CHECK_(expression) \ + do { \ + if (!::testing::internal::IsTrue(expression)) { \ + DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \ + ", line " + \ + ::testing::internal::StreamableToString(__LINE__) + \ + ": " + #expression); \ + } \ } while (::testing::internal::AlwaysFalse()) // This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for @@ -321,23 +322,23 @@ static void DeathTestAbort(const std::string& message) { // evaluates the expression as long as it evaluates to -1 and sets // errno to EINTR. If the expression evaluates to -1 but errno is // something other than EINTR, DeathTestAbort is called. 
-# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ - do { \ - int gtest_retval; \ - do { \ - gtest_retval = (expression); \ - } while (gtest_retval == -1 && errno == EINTR); \ - if (gtest_retval == -1) { \ - DeathTestAbort( \ - ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ - + ::testing::internal::StreamableToString(__LINE__) + ": " \ - + #expression + " != -1"); \ - } \ +#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ + do { \ + int gtest_retval; \ + do { \ + gtest_retval = (expression); \ + } while (gtest_retval == -1 && errno == EINTR); \ + if (gtest_retval == -1) { \ + DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \ + ", line " + \ + ::testing::internal::StreamableToString(__LINE__) + \ + ": " + #expression + " != -1"); \ + } \ } while (::testing::internal::AlwaysFalse()) // Returns the message describing the last system error in errno. std::string GetLastErrnoDescription() { - return errno == 0 ? "" : posix::StrError(errno); + return errno == 0 ? "" : posix::StrError(errno); } // This is called from a death test parent process to read a failure @@ -370,8 +371,9 @@ static void FailFromInternalError(int fd) { DeathTest::DeathTest() { TestInfo* const info = GetUnitTestImpl()->current_test_info(); if (info == nullptr) { - DeathTestAbort("Cannot run a death test outside of a TEST or " - "TEST_F construct"); + DeathTestAbort( + "Cannot run a death test outside of a TEST or " + "TEST_F construct"); } } @@ -500,9 +502,7 @@ void DeathTestImpl::ReadAndInterpretStatusByte() { set_read_fd(-1); } -std::string DeathTestImpl::GetErrorLogs() { - return GetCapturedStderr(); -} +std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); } // Signals that the death test code which should have exited, didn't. // Should be called only in a death test child process. 
@@ -512,9 +512,9 @@ void DeathTestImpl::Abort(AbortReason reason) { // The parent process considers the death test to be a failure if // it finds any data in our pipe. So, here we write a single flag byte // to the pipe, then exit. - const char status_ch = - reason == TEST_DID_NOT_DIE ? kDeathTestLived : - reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned; + const char status_ch = reason == TEST_DID_NOT_DIE ? kDeathTestLived + : reason == TEST_THREW_EXCEPTION ? kDeathTestThrew + : kDeathTestReturned; GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1)); // We are leaking the descriptor here because on some platforms (i.e., @@ -533,7 +533,7 @@ void DeathTestImpl::Abort(AbortReason reason) { // much easier. static ::std::string FormatDeathTestOutput(const ::std::string& output) { ::std::string ret; - for (size_t at = 0; ; ) { + for (size_t at = 0;;) { const size_t line_end = output.find('\n', at); ret += "[ DEATH ] "; if (line_end == ::std::string::npos) { @@ -568,8 +568,7 @@ static ::std::string FormatDeathTestOutput(const ::std::string& output) { // the first failing condition, in the order given above, is the one that is // reported. Also sets the last death test message string. 
bool DeathTestImpl::Passed(bool status_ok) { - if (!spawned()) - return false; + if (!spawned()) return false; const std::string error_message = GetErrorLogs(); @@ -580,15 +579,18 @@ bool DeathTestImpl::Passed(bool status_ok) { switch (outcome()) { case LIVED: buffer << " Result: failed to die.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); + << " Error msg:\n" + << FormatDeathTestOutput(error_message); break; case THREW: buffer << " Result: threw an exception.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); + << " Error msg:\n" + << FormatDeathTestOutput(error_message); break; case RETURNED: buffer << " Result: illegal return in test statement.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); + << " Error msg:\n" + << FormatDeathTestOutput(error_message); break; case DIED: if (status_ok) { @@ -605,7 +607,8 @@ bool DeathTestImpl::Passed(bool status_ok) { } else { buffer << " Result: died but not with expected exit code:\n" << " " << ExitSummary(status()) << "\n" - << "Actual msg:\n" << FormatDeathTestOutput(error_message); + << "Actual msg:\n" + << FormatDeathTestOutput(error_message); } break; case IN_PROGRESS: @@ -618,7 +621,7 @@ bool DeathTestImpl::Passed(bool status_ok) { return success; } -# if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS // WindowsDeathTest implements death tests on Windows. Due to the // specifics of starting new processes on Windows, death tests there are // always threadsafe, and Google Test considers the @@ -679,14 +682,12 @@ class WindowsDeathTest : public DeathTestImpl { // status, or 0 if no child process exists. As a side effect, sets the // outcome data member. int WindowsDeathTest::Wait() { - if (!spawned()) - return 0; + if (!spawned()) return 0; // Wait until the child either signals that it has acquired the write end // of the pipe or it dies. 
- const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() }; - switch (::WaitForMultipleObjects(2, - wait_handles, + const HANDLE wait_handles[2] = {child_handle_.Get(), event_handle_.Get()}; + switch (::WaitForMultipleObjects(2, wait_handles, FALSE, // Waits for any of the handles. INFINITE)) { case WAIT_OBJECT_0: @@ -707,9 +708,8 @@ int WindowsDeathTest::Wait() { // returns immediately if the child has already exited, regardless of // whether previous calls to WaitForMultipleObjects synchronized on this // handle or not. - GTEST_DEATH_TEST_CHECK_( - WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(), - INFINITE)); + GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 == + ::WaitForSingleObject(child_handle_.Get(), INFINITE)); DWORD status_code; GTEST_DEATH_TEST_CHECK_( ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE); @@ -742,12 +742,12 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { SECURITY_ATTRIBUTES handles_are_inheritable = {sizeof(SECURITY_ATTRIBUTES), nullptr, TRUE}; HANDLE read_handle, write_handle; - GTEST_DEATH_TEST_CHECK_( - ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable, - 0) // Default buffer size. - != FALSE); - set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle), - O_RDONLY)); + GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle, + &handles_are_inheritable, + 0) // Default buffer size. + != FALSE); + set_read_fd( + ::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle), O_RDONLY)); write_handle_.Reset(write_handle); event_handle_.Reset(::CreateEvent( &handles_are_inheritable, @@ -756,27 +756,26 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { nullptr)); // The even is unnamed. GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr); const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + - kFilterFlag + "=" + info->test_suite_name() + - "." + info->name(); + "filter=" + info->test_suite_name() + "." 
+ + info->name(); const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + - "=" + file_ + "|" + StreamableToString(line_) + "|" + - StreamableToString(death_test_index) + "|" + + std::string("--") + GTEST_FLAG_PREFIX_ + + "internal_run_death_test=" + file_ + "|" + StreamableToString(line_) + + "|" + StreamableToString(death_test_index) + "|" + StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) + // size_t has the same width as pointers on both 32-bit and 64-bit // Windows platforms. // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx. - "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) + - "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get())); + "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) + "|" + + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get())); char executable_path[_MAX_PATH + 1]; // NOLINT GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr, executable_path, _MAX_PATH)); - std::string command_line = - std::string(::GetCommandLineA()) + " " + filter_flag + " \"" + - internal_flag + "\""; + std::string command_line = std::string(::GetCommandLineA()) + " " + + filter_flag + " \"" + internal_flag + "\""; DeathTest::set_last_death_test_message(""); @@ -796,8 +795,8 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { GTEST_DEATH_TEST_CHECK_( ::CreateProcessA( executable_path, const_cast<char*>(command_line.c_str()), - nullptr, // Retuned process handle is not inheritable. - nullptr, // Retuned thread handle is not inheritable. + nullptr, // Returned process handle is not inheritable. + nullptr, // Returned thread handle is not inheritable. TRUE, // Child inherits all inheritable handles (for write_handle_). 0x0, // Default creation flags. nullptr, // Inherit the parent's environment. 
@@ -809,7 +808,7 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { return OVERSEE_TEST; } -# elif GTEST_OS_FUCHSIA +#elif GTEST_OS_FUCHSIA class FuchsiaDeathTest : public DeathTestImpl { public: @@ -855,18 +854,13 @@ class Arguments { template <typename Str> void AddArguments(const ::std::vector<Str>& arguments) { for (typename ::std::vector<Str>::const_iterator i = arguments.begin(); - i != arguments.end(); - ++i) { + i != arguments.end(); ++i) { args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); } } - char* const* Argv() { - return &args_[0]; - } + char* const* Argv() { return &args_[0]; } - int size() { - return static_cast<int>(args_.size()) - 1; - } + int size() { return static_cast<int>(args_.size()) - 1; } private: std::vector<char*> args_; @@ -880,8 +874,7 @@ int FuchsiaDeathTest::Wait() { const int kSocketKey = 1; const int kExceptionKey = 2; - if (!spawned()) - return 0; + if (!spawned()) return 0; // Create a port to wait for socket/task/exception events. zx_status_t status_zx; @@ -890,8 +883,8 @@ int FuchsiaDeathTest::Wait() { GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); // Register to wait for the child process to terminate. - status_zx = child_process_.wait_async( - port, kProcessKey, ZX_PROCESS_TERMINATED, 0); + status_zx = + child_process_.wait_async(port, kProcessKey, ZX_PROCESS_TERMINATED, 0); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); // Register to wait for the socket to be readable or closed. @@ -900,8 +893,8 @@ int FuchsiaDeathTest::Wait() { GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); // Register to wait for an exception. 
- status_zx = exception_channel_.wait_async( - port, kExceptionKey, ZX_CHANNEL_READABLE, 0); + status_zx = exception_channel_.wait_async(port, kExceptionKey, + ZX_CHANNEL_READABLE, 0); GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); bool process_terminated = false; @@ -931,9 +924,9 @@ int FuchsiaDeathTest::Wait() { size_t old_length = captured_stderr_.length(); size_t bytes_read = 0; captured_stderr_.resize(old_length + kBufferSize); - status_zx = stderr_socket_.read( - 0, &captured_stderr_.front() + old_length, kBufferSize, - &bytes_read); + status_zx = + stderr_socket_.read(0, &captured_stderr_.front() + old_length, + kBufferSize, &bytes_read); captured_stderr_.resize(old_length + bytes_read); } while (status_zx == ZX_OK); if (status_zx == ZX_ERR_PEER_CLOSED) { @@ -987,13 +980,12 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { // Build the child process command line. const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + - kFilterFlag + "=" + info->test_suite_name() + - "." + info->name(); - const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" - + file_ + "|" - + StreamableToString(line_) + "|" - + StreamableToString(death_test_index); + "filter=" + info->test_suite_name() + "." + + info->name(); + const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + kInternalRunDeathTestFlag + "=" + file_ + + "|" + StreamableToString(line_) + "|" + + StreamableToString(death_test_index); Arguments args; args.AddArguments(GetInjectableArgvs()); args.AddArgument(filter_flag.c_str()); @@ -1016,8 +1008,7 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { // Create a socket pair will be used to receive the child process' stderr. 
zx::socket stderr_producer_socket; - status = - zx::socket::create(0, &stderr_producer_socket, &stderr_socket_); + status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_); GTEST_DEATH_TEST_CHECK_(status >= 0); int stderr_producer_fd = -1; status = @@ -1034,35 +1025,32 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { // Create a child job. zx_handle_t child_job = ZX_HANDLE_INVALID; - status = zx_job_create(zx_job_default(), 0, & child_job); + status = zx_job_create(zx_job_default(), 0, &child_job); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); zx_policy_basic_t policy; policy.condition = ZX_POL_NEW_ANY; policy.policy = ZX_POL_ACTION_ALLOW; - status = zx_job_set_policy( - child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, &policy, 1); + status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, + &policy, 1); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); // Create an exception channel attached to the |child_job|, to allow // us to suppress the system default exception handler from firing. - status = - zx_task_create_exception_channel( - child_job, 0, exception_channel_.reset_and_get_address()); + status = zx_task_create_exception_channel( + child_job, 0, exception_channel_.reset_and_get_address()); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); // Spawn the child process. - status = fdio_spawn_etc( - child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], args.Argv(), nullptr, - 2, spawn_actions, child_process_.reset_and_get_address(), nullptr); + status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], + args.Argv(), nullptr, 2, spawn_actions, + child_process_.reset_and_get_address(), nullptr); GTEST_DEATH_TEST_CHECK_(status == ZX_OK); set_spawned(true); return OVERSEE_TEST; } -std::string FuchsiaDeathTest::GetErrorLogs() { - return captured_stderr_; -} +std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; } #else // We are neither on Windows, nor on Fuchsia. 
@@ -1093,8 +1081,7 @@ ForkingDeathTest::ForkingDeathTest(const char* a_statement, // status, or 0 if no child process exists. As a side effect, sets the // outcome data member. int ForkingDeathTest::Wait() { - if (!spawned()) - return 0; + if (!spawned()) return 0; ReadAndInterpretStatusByte(); @@ -1173,11 +1160,11 @@ class ExecDeathTest : public ForkingDeathTest { private: static ::std::vector<std::string> GetArgvsForDeathTestChildProcess() { ::std::vector<std::string> args = GetInjectableArgvs(); -# if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) +#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) ::std::vector<std::string> extra_args = GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_(); args.insert(args.end(), extra_args.begin(), extra_args.end()); -# endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) +#endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) return args; } // The name of the file in which the death test is located. @@ -1204,14 +1191,11 @@ class Arguments { template <typename Str> void AddArguments(const ::std::vector<Str>& arguments) { for (typename ::std::vector<Str>::const_iterator i = arguments.begin(); - i != arguments.end(); - ++i) { + i != arguments.end(); ++i) { args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); } } - char* const* Argv() { - return &args_[0]; - } + char* const* Argv() { return &args_[0]; } private: std::vector<char*> args_; @@ -1224,9 +1208,9 @@ struct ExecDeathTestArgs { int close_fd; // File descriptor to close; the read end of a pipe }; -# if GTEST_OS_QNX +#if GTEST_OS_QNX extern "C" char** environ; -# else // GTEST_OS_QNX +#else // GTEST_OS_QNX // The main function for a threadsafe-style death test child process. // This function is called in a clone()-ed process and thus must avoid // any potentially unsafe operations like malloc or libc functions. 
@@ -1241,8 +1225,8 @@ static int ExecDeathTestChildMain(void* child_arg) { UnitTest::GetInstance()->original_working_dir(); // We can safely call chdir() as it's a direct system call. if (chdir(original_dir) != 0) { - DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + - GetLastErrnoDescription()); + DeathTestAbort(std::string("chdir(\"") + original_dir + + "\") failed: " + GetLastErrnoDescription()); return EXIT_FAILURE; } @@ -1253,13 +1237,12 @@ static int ExecDeathTestChildMain(void* child_arg) { // one path separator. execv(args->argv[0], args->argv); DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " + - original_dir + " failed: " + - GetLastErrnoDescription()); + original_dir + " failed: " + GetLastErrnoDescription()); return EXIT_FAILURE; } -# endif // GTEST_OS_QNX +#endif // GTEST_OS_QNX -# if GTEST_HAS_CLONE +#if GTEST_HAS_CLONE // Two utility routines that together determine the direction the stack // grows. // This could be accomplished more elegantly by a single recursive @@ -1293,7 +1276,7 @@ static bool StackGrowsDown() { StackLowerThanAddress(&dummy, &result); return result; } -# endif // GTEST_HAS_CLONE +#endif // GTEST_HAS_CLONE // Spawns a child process with the same executable as the current process in // a thread-safe manner and instructs it to run the death test. The @@ -1303,10 +1286,10 @@ static bool StackGrowsDown() { // spawn(2) there instead. The function dies with an error message if // anything goes wrong. static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { - ExecDeathTestArgs args = { argv, close_fd }; + ExecDeathTestArgs args = {argv, close_fd}; pid_t child_pid = -1; -# if GTEST_OS_QNX +#if GTEST_OS_QNX // Obtains the current directory and sets it to be closed in the child // process. 
const int cwd_fd = open(".", O_RDONLY); @@ -1319,16 +1302,16 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { UnitTest::GetInstance()->original_working_dir(); // We can safely call chdir() as it's a direct system call. if (chdir(original_dir) != 0) { - DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + - GetLastErrnoDescription()); + DeathTestAbort(std::string("chdir(\"") + original_dir + + "\") failed: " + GetLastErrnoDescription()); return EXIT_FAILURE; } int fd_flags; // Set close_fd to be closed after spawn. GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD)); - GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD, - fd_flags | FD_CLOEXEC)); + GTEST_DEATH_TEST_CHECK_SYSCALL_( + fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC)); struct inheritance inherit = {0}; // spawn is a system call. child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ); @@ -1336,8 +1319,8 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1); GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd)); -# else // GTEST_OS_QNX -# if GTEST_OS_LINUX +#else // GTEST_OS_QNX +#if GTEST_OS_LINUX // When a SIGPROF signal is received while fork() or clone() are executing, // the process may hang. To avoid this, we ignore SIGPROF here and re-enable // it after the call to fork()/clone() is complete. 
@@ -1346,12 +1329,12 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action)); sigemptyset(&ignore_sigprof_action.sa_mask); ignore_sigprof_action.sa_handler = SIG_IGN; - GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction( - SIGPROF, &ignore_sigprof_action, &saved_sigprof_action)); -# endif // GTEST_OS_LINUX + GTEST_DEATH_TEST_CHECK_SYSCALL_( + sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action)); +#endif // GTEST_OS_LINUX -# if GTEST_HAS_CLONE - const bool use_fork = GTEST_FLAG(death_test_use_fork); +#if GTEST_HAS_CLONE + const bool use_fork = GTEST_FLAG_GET(death_test_use_fork); if (!use_fork) { static const bool stack_grows_down = StackGrowsDown(); @@ -1370,7 +1353,7 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { const size_t kMaxStackAlignment = 64; void* const stack_top = static_cast<char*>(stack) + - (stack_grows_down ? stack_size - kMaxStackAlignment : 0); + (stack_grows_down ? 
stack_size - kMaxStackAlignment : 0); GTEST_DEATH_TEST_CHECK_( static_cast<size_t>(stack_size) > kMaxStackAlignment && reinterpret_cast<uintptr_t>(stack_top) % kMaxStackAlignment == 0); @@ -1379,19 +1362,19 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1); } -# else +#else const bool use_fork = true; -# endif // GTEST_HAS_CLONE +#endif // GTEST_HAS_CLONE if (use_fork && (child_pid = fork()) == 0) { - ExecDeathTestChildMain(&args); - _exit(0); + ExecDeathTestChildMain(&args); + _exit(0); } -# endif // GTEST_OS_QNX -# if GTEST_OS_LINUX +#endif // GTEST_OS_QNX +#if GTEST_OS_LINUX GTEST_DEATH_TEST_CHECK_SYSCALL_( sigaction(SIGPROF, &saved_sigprof_action, nullptr)); -# endif // GTEST_OS_LINUX +#endif // GTEST_OS_LINUX GTEST_DEATH_TEST_CHECK_(child_pid != -1); return child_pid; @@ -1420,13 +1403,13 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() { GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1); const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + - kFilterFlag + "=" + info->test_suite_name() + - "." + info->name(); - const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" - + file_ + "|" + StreamableToString(line_) + "|" - + StreamableToString(death_test_index) + "|" - + StreamableToString(pipe_fd[1]); + "filter=" + info->test_suite_name() + "." 
+ + info->name(); + const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + "internal_run_death_test=" + file_ + "|" + + StreamableToString(line_) + "|" + + StreamableToString(death_test_index) + "|" + + StreamableToString(pipe_fd[1]); Arguments args; args.AddArguments(GetArgvsForDeathTestChildProcess()); args.AddArgument(filter_flag.c_str()); @@ -1447,7 +1430,7 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() { return OVERSEE_TEST; } -# endif // !GTEST_OS_WINDOWS +#endif // !GTEST_OS_WINDOWS // Creates a concrete DeathTest-derived class that depends on the // --gtest_death_test_style flag, and sets the pointer pointed to @@ -1461,15 +1444,15 @@ bool DefaultDeathTestFactory::Create(const char* statement, UnitTestImpl* const impl = GetUnitTestImpl(); const InternalRunDeathTestFlag* const flag = impl->internal_run_death_test_flag(); - const int death_test_index = impl->current_test_info() - ->increment_death_test_count(); + const int death_test_index = + impl->current_test_info()->increment_death_test_count(); if (flag != nullptr) { if (death_test_index > flag->index()) { DeathTest::set_last_death_test_message( - "Death test count (" + StreamableToString(death_test_index) - + ") somehow exceeded expected maximum (" - + StreamableToString(flag->index()) + ")"); + "Death test count (" + StreamableToString(death_test_index) + + ") somehow exceeded expected maximum (" + + StreamableToString(flag->index()) + ")"); return false; } @@ -1480,50 +1463,50 @@ bool DefaultDeathTestFactory::Create(const char* statement, } } -# if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS - if (GTEST_FLAG(death_test_style) == "threadsafe" || - GTEST_FLAG(death_test_style) == "fast") { + if (GTEST_FLAG_GET(death_test_style) == "threadsafe" || + GTEST_FLAG_GET(death_test_style) == "fast") { *test = new WindowsDeathTest(statement, std::move(matcher), file, line); } -# elif GTEST_OS_FUCHSIA +#elif GTEST_OS_FUCHSIA - if (GTEST_FLAG(death_test_style) == "threadsafe" || - 
GTEST_FLAG(death_test_style) == "fast") { + if (GTEST_FLAG_GET(death_test_style) == "threadsafe" || + GTEST_FLAG_GET(death_test_style) == "fast") { *test = new FuchsiaDeathTest(statement, std::move(matcher), file, line); } -# else +#else - if (GTEST_FLAG(death_test_style) == "threadsafe") { + if (GTEST_FLAG_GET(death_test_style) == "threadsafe") { *test = new ExecDeathTest(statement, std::move(matcher), file, line); - } else if (GTEST_FLAG(death_test_style) == "fast") { + } else if (GTEST_FLAG_GET(death_test_style) == "fast") { *test = new NoExecDeathTest(statement, std::move(matcher)); } -# endif // GTEST_OS_WINDOWS +#endif // GTEST_OS_WINDOWS else { // NOLINT - this is more readable than unbalanced brackets inside #if. - DeathTest::set_last_death_test_message( - "Unknown death test style \"" + GTEST_FLAG(death_test_style) - + "\" encountered"); + DeathTest::set_last_death_test_message("Unknown death test style \"" + + GTEST_FLAG_GET(death_test_style) + + "\" encountered"); return false; } return true; } -# if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS // Recreates the pipe and event handles from the provided parameters, // signals the event, and returns a file descriptor wrapped around the pipe // handle. This function is called in the child process only. static int GetStatusFileDescriptor(unsigned int parent_process_id, - size_t write_handle_as_size_t, - size_t event_handle_as_size_t) { + size_t write_handle_as_size_t, + size_t event_handle_as_size_t) { AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE, - FALSE, // Non-inheritable. - parent_process_id)); + FALSE, // Non-inheritable. 
+ parent_process_id)); if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) { DeathTestAbort("Unable to open parent process " + StreamableToString(parent_process_id)); @@ -1531,8 +1514,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id, GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); - const HANDLE write_handle = - reinterpret_cast<HANDLE>(write_handle_as_size_t); + const HANDLE write_handle = reinterpret_cast<HANDLE>(write_handle_as_size_t); HANDLE dup_write_handle; // The newly initialized handle is accessible only in the parent @@ -1554,9 +1536,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id, HANDLE dup_event_handle; if (!::DuplicateHandle(parent_process_handle.Get(), event_handle, - ::GetCurrentProcess(), &dup_event_handle, - 0x0, - FALSE, + ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE, DUPLICATE_SAME_ACCESS)) { DeathTestAbort("Unable to duplicate the event handle " + StreamableToString(event_handle_as_size_t) + @@ -1578,61 +1558,57 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id, return write_fd; } -# endif // GTEST_OS_WINDOWS +#endif // GTEST_OS_WINDOWS // Returns a newly created InternalRunDeathTestFlag object with fields // initialized from the GTEST_FLAG(internal_run_death_test) flag if // the flag is specified; otherwise returns NULL. InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { - if (GTEST_FLAG(internal_run_death_test) == "") return nullptr; + if (GTEST_FLAG_GET(internal_run_death_test) == "") return nullptr; // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we // can use it here. 
int line = -1; int index = -1; ::std::vector< ::std::string> fields; - SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields); + SplitString(GTEST_FLAG_GET(internal_run_death_test), '|', &fields); int write_fd = -1; -# if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS unsigned int parent_process_id = 0; size_t write_handle_as_size_t = 0; size_t event_handle_as_size_t = 0; - if (fields.size() != 6 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index) - || !ParseNaturalNumber(fields[3], &parent_process_id) - || !ParseNaturalNumber(fields[4], &write_handle_as_size_t) - || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { + if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index) || + !ParseNaturalNumber(fields[3], &parent_process_id) || + !ParseNaturalNumber(fields[4], &write_handle_as_size_t) || + !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + - GTEST_FLAG(internal_run_death_test)); + GTEST_FLAG_GET(internal_run_death_test)); } - write_fd = GetStatusFileDescriptor(parent_process_id, - write_handle_as_size_t, + write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t, event_handle_as_size_t); -# elif GTEST_OS_FUCHSIA +#elif GTEST_OS_FUCHSIA - if (fields.size() != 3 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index)) { - DeathTestAbort("Bad --gtest_internal_run_death_test flag: " - + GTEST_FLAG(internal_run_death_test)); + if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG_GET(internal_run_death_test)); } -# else +#else - if (fields.size() != 4 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index) - || !ParseNaturalNumber(fields[3], &write_fd)) { - DeathTestAbort("Bad 
--gtest_internal_run_death_test flag: " - + GTEST_FLAG(internal_run_death_test)); + if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index) || + !ParseNaturalNumber(fields[3], &write_fd)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG_GET(internal_run_death_test)); } -# endif // GTEST_OS_WINDOWS +#endif // GTEST_OS_WINDOWS return new InternalRunDeathTestFlag(fields[0], line, index, write_fd); } diff --git a/libvpx/third_party/googletest/src/src/gtest-filepath.cc b/libvpx/third_party/googletest/src/src/gtest-filepath.cc index 0b5629401..f6ee90cdb 100644 --- a/libvpx/third_party/googletest/src/src/gtest-filepath.cc +++ b/libvpx/third_party/googletest/src/src/gtest-filepath.cc @@ -30,29 +30,31 @@ #include "gtest/internal/gtest-filepath.h" #include <stdlib.h> -#include "gtest/internal/gtest-port.h" + #include "gtest/gtest-message.h" +#include "gtest/internal/gtest-port.h" #if GTEST_OS_WINDOWS_MOBILE -# include <windows.h> +#include <windows.h> #elif GTEST_OS_WINDOWS -# include <direct.h> -# include <io.h> +#include <direct.h> +#include <io.h> #else -# include <limits.h> -# include <climits> // Some Linux distributions define PATH_MAX here. -#endif // GTEST_OS_WINDOWS_MOBILE +#include <limits.h> + +#include <climits> // Some Linux distributions define PATH_MAX here. 
+#endif // GTEST_OS_WINDOWS_MOBILE #include "gtest/internal/gtest-string.h" #if GTEST_OS_WINDOWS -# define GTEST_PATH_MAX_ _MAX_PATH +#define GTEST_PATH_MAX_ _MAX_PATH #elif defined(PATH_MAX) -# define GTEST_PATH_MAX_ PATH_MAX +#define GTEST_PATH_MAX_ PATH_MAX #elif defined(_XOPEN_PATH_MAX) -# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX +#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX #else -# define GTEST_PATH_MAX_ _POSIX_PATH_MAX +#define GTEST_PATH_MAX_ _POSIX_PATH_MAX #endif // GTEST_OS_WINDOWS namespace testing { @@ -66,16 +68,16 @@ namespace internal { const char kPathSeparator = '\\'; const char kAlternatePathSeparator = '/'; const char kAlternatePathSeparatorString[] = "/"; -# if GTEST_OS_WINDOWS_MOBILE +#if GTEST_OS_WINDOWS_MOBILE // Windows CE doesn't have a current directory. You should not use // the current directory in tests on Windows CE, but this at least // provides a reasonable fallback. const char kCurrentDirectoryString[] = "\\"; // Windows CE doesn't define INVALID_FILE_ATTRIBUTES const DWORD kInvalidFileAttributes = 0xffffffff; -# else +#else const char kCurrentDirectoryString[] = ".\\"; -# endif // GTEST_OS_WINDOWS_MOBILE +#endif // GTEST_OS_WINDOWS_MOBILE #else const char kPathSeparator = '/'; const char kCurrentDirectoryString[] = "./"; @@ -99,17 +101,17 @@ FilePath FilePath::GetCurrentDir() { // something reasonable. return FilePath(kCurrentDirectoryString); #elif GTEST_OS_WINDOWS - char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; + char cwd[GTEST_PATH_MAX_ + 1] = {'\0'}; return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd); #else - char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; + char cwd[GTEST_PATH_MAX_ + 1] = {'\0'}; char* result = getcwd(cwd, sizeof(cwd)); -# if GTEST_OS_NACL +#if GTEST_OS_NACL // getcwd will likely fail in NaCl due to the sandbox, so return something // reasonable. The user may have provided a shim implementation for getcwd, // however, so fallback only when failure is detected. return FilePath(result == nullptr ? 
kCurrentDirectoryString : cwd); -# endif // GTEST_OS_NACL +#endif // GTEST_OS_NACL return FilePath(result == nullptr ? "" : cwd); #endif // GTEST_OS_WINDOWS_MOBILE } @@ -121,8 +123,8 @@ FilePath FilePath::GetCurrentDir() { FilePath FilePath::RemoveExtension(const char* extension) const { const std::string dot_extension = std::string(".") + extension; if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) { - return FilePath(pathname_.substr( - 0, pathname_.length() - dot_extension.length())); + return FilePath( + pathname_.substr(0, pathname_.length() - dot_extension.length())); } return *this; } @@ -178,15 +180,14 @@ FilePath FilePath::RemoveFileName() const { // than zero (e.g., 12), returns "dir/test_12.xml". // On Windows platform, uses \ as the separator rather than /. FilePath FilePath::MakeFileName(const FilePath& directory, - const FilePath& base_name, - int number, + const FilePath& base_name, int number, const char* extension) { std::string file; if (number == 0) { file = base_name.string() + "." + extension; } else { - file = base_name.string() + "_" + StreamableToString(number) - + "." + extension; + file = + base_name.string() + "_" + StreamableToString(number) + "." + extension; } return ConcatPaths(directory, FilePath(file)); } @@ -195,8 +196,7 @@ FilePath FilePath::MakeFileName(const FilePath& directory, // On Windows, uses \ as the separator rather than /. 
FilePath FilePath::ConcatPaths(const FilePath& directory, const FilePath& relative_path) { - if (directory.IsEmpty()) - return relative_path; + if (directory.IsEmpty()) return relative_path; const FilePath dir(directory.RemoveTrailingPathSeparator()); return FilePath(dir.string() + kPathSeparator + relative_path.string()); } @@ -207,7 +207,7 @@ bool FilePath::FileOrDirectoryExists() const { #if GTEST_OS_WINDOWS_MOBILE LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str()); const DWORD attributes = GetFileAttributes(unicode); - delete [] unicode; + delete[] unicode; return attributes != kInvalidFileAttributes; #else posix::StatStruct file_stat{}; @@ -222,8 +222,8 @@ bool FilePath::DirectoryExists() const { #if GTEST_OS_WINDOWS // Don't strip off trailing separator if path is a root directory on // Windows (like "C:\\"). - const FilePath& path(IsRootDirectory() ? *this : - RemoveTrailingPathSeparator()); + const FilePath& path(IsRootDirectory() ? *this + : RemoveTrailingPathSeparator()); #else const FilePath& path(*this); #endif @@ -231,15 +231,15 @@ bool FilePath::DirectoryExists() const { #if GTEST_OS_WINDOWS_MOBILE LPCWSTR unicode = String::AnsiToUtf16(path.c_str()); const DWORD attributes = GetFileAttributes(unicode); - delete [] unicode; + delete[] unicode; if ((attributes != kInvalidFileAttributes) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) { result = true; } #else posix::StatStruct file_stat{}; - result = posix::Stat(path.c_str(), &file_stat) == 0 && - posix::IsDir(file_stat); + result = + posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat); #endif // GTEST_OS_WINDOWS_MOBILE return result; @@ -260,10 +260,9 @@ bool FilePath::IsAbsolutePath() const { const char* const name = pathname_.c_str(); #if GTEST_OS_WINDOWS return pathname_.length() >= 3 && - ((name[0] >= 'a' && name[0] <= 'z') || - (name[0] >= 'A' && name[0] <= 'Z')) && - name[1] == ':' && - IsPathSeparator(name[2]); + ((name[0] >= 'a' && name[0] <= 'z') || + (name[0] >= 'A' && 
name[0] <= 'Z')) && + name[1] == ':' && IsPathSeparator(name[2]); #else return IsPathSeparator(name[0]); #endif @@ -321,7 +320,7 @@ bool FilePath::CreateFolder() const { FilePath removed_sep(this->RemoveTrailingPathSeparator()); LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str()); int result = CreateDirectory(unicode, nullptr) ? 0 : -1; - delete [] unicode; + delete[] unicode; #elif GTEST_OS_WINDOWS int result = _mkdir(pathname_.c_str()); #elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA @@ -341,9 +340,8 @@ bool FilePath::CreateFolder() const { // name, otherwise return the name string unmodified. // On Windows platform, uses \ as the separator, other platforms use /. FilePath FilePath::RemoveTrailingPathSeparator() const { - return IsDirectory() - ? FilePath(pathname_.substr(0, pathname_.length() - 1)) - : *this; + return IsDirectory() ? FilePath(pathname_.substr(0, pathname_.length() - 1)) + : *this; } // Removes any redundant separators that might be in the pathname. diff --git a/libvpx/third_party/googletest/src/src/gtest-internal-inl.h b/libvpx/third_party/googletest/src/src/gtest-internal-inl.h index 6d8cecbbb..0b9e929c6 100644 --- a/libvpx/third_party/googletest/src/src/gtest-internal-inl.h +++ b/libvpx/third_party/googletest/src/src/gtest-internal-inl.h @@ -35,7 +35,7 @@ #define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_ #ifndef _WIN32_WCE -# include <errno.h> +#include <errno.h> #endif // !_WIN32_WCE #include <stddef.h> #include <stdlib.h> // For strtoll/_strtoul64/malloc/free. 
@@ -50,22 +50,20 @@ #include "gtest/internal/gtest-port.h" #if GTEST_CAN_STREAM_RESULTS_ -# include <arpa/inet.h> // NOLINT -# include <netdb.h> // NOLINT +#include <arpa/inet.h> // NOLINT +#include <netdb.h> // NOLINT #endif #if GTEST_OS_WINDOWS -# include <windows.h> // NOLINT -#endif // GTEST_OS_WINDOWS +#include <windows.h> // NOLINT +#endif // GTEST_OS_WINDOWS -#include "gtest/gtest.h" #include "gtest/gtest-spi.h" +#include "gtest/gtest.h" GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ /* class A needs to have dll-interface to be used by clients of class B */) -namespace testing { - // Declares the flags. // // We don't want the users to modify this flag in the code, but want @@ -73,32 +71,13 @@ namespace testing { // declare it here as opposed to in gtest.h. GTEST_DECLARE_bool_(death_test_use_fork); +namespace testing { namespace internal { // The value of GetTestTypeId() as seen from within the Google Test // library. This is solely for testing GetTestTypeId(). GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest; -// Names of the flags (needed for parsing Google Test flags). -const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests"; -const char kBreakOnFailureFlag[] = "break_on_failure"; -const char kCatchExceptionsFlag[] = "catch_exceptions"; -const char kColorFlag[] = "color"; -const char kFailFast[] = "fail_fast"; -const char kFilterFlag[] = "filter"; -const char kListTestsFlag[] = "list_tests"; -const char kOutputFlag[] = "output"; -const char kBriefFlag[] = "brief"; -const char kPrintTimeFlag[] = "print_time"; -const char kPrintUTF8Flag[] = "print_utf8"; -const char kRandomSeedFlag[] = "random_seed"; -const char kRepeatFlag[] = "repeat"; -const char kShuffleFlag[] = "shuffle"; -const char kStackTraceDepthFlag[] = "stack_trace_depth"; -const char kStreamResultToFlag[] = "stream_result_to"; -const char kThrowOnFailureFlag[] = "throw_on_failure"; -const char kFlagfileFlag[] = "flagfile"; - // A valid random seed must be in [1, kMaxRandomSeed]. 
const int kMaxRandomSeed = 99999; @@ -125,21 +104,21 @@ GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms); // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. -GTEST_API_ bool ParseInt32Flag( - const char* str, const char* flag, int32_t* value); +GTEST_API_ bool ParseFlag(const char* str, const char* flag, int32_t* value); // Returns a random seed in range [1, kMaxRandomSeed] based on the // given --gtest_random_seed flag value. inline int GetRandomSeedFromFlag(int32_t random_seed_flag) { - const unsigned int raw_seed = (random_seed_flag == 0) ? - static_cast<unsigned int>(GetTimeInMillis()) : - static_cast<unsigned int>(random_seed_flag); + const unsigned int raw_seed = + (random_seed_flag == 0) ? static_cast<unsigned int>(GetTimeInMillis()) + : static_cast<unsigned int>(random_seed_flag); // Normalizes the actual seed to range [1, kMaxRandomSeed] such that // it's easy to type. const int normalized_seed = static_cast<int>((raw_seed - 1U) % - static_cast<unsigned int>(kMaxRandomSeed)) + 1; + static_cast<unsigned int>(kMaxRandomSeed)) + + 1; return normalized_seed; } @@ -160,50 +139,54 @@ class GTestFlagSaver { public: // The c'tor. 
GTestFlagSaver() { - also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests); - break_on_failure_ = GTEST_FLAG(break_on_failure); - catch_exceptions_ = GTEST_FLAG(catch_exceptions); - color_ = GTEST_FLAG(color); - death_test_style_ = GTEST_FLAG(death_test_style); - death_test_use_fork_ = GTEST_FLAG(death_test_use_fork); - fail_fast_ = GTEST_FLAG(fail_fast); - filter_ = GTEST_FLAG(filter); - internal_run_death_test_ = GTEST_FLAG(internal_run_death_test); - list_tests_ = GTEST_FLAG(list_tests); - output_ = GTEST_FLAG(output); - brief_ = GTEST_FLAG(brief); - print_time_ = GTEST_FLAG(print_time); - print_utf8_ = GTEST_FLAG(print_utf8); - random_seed_ = GTEST_FLAG(random_seed); - repeat_ = GTEST_FLAG(repeat); - shuffle_ = GTEST_FLAG(shuffle); - stack_trace_depth_ = GTEST_FLAG(stack_trace_depth); - stream_result_to_ = GTEST_FLAG(stream_result_to); - throw_on_failure_ = GTEST_FLAG(throw_on_failure); + also_run_disabled_tests_ = GTEST_FLAG_GET(also_run_disabled_tests); + break_on_failure_ = GTEST_FLAG_GET(break_on_failure); + catch_exceptions_ = GTEST_FLAG_GET(catch_exceptions); + color_ = GTEST_FLAG_GET(color); + death_test_style_ = GTEST_FLAG_GET(death_test_style); + death_test_use_fork_ = GTEST_FLAG_GET(death_test_use_fork); + fail_fast_ = GTEST_FLAG_GET(fail_fast); + filter_ = GTEST_FLAG_GET(filter); + internal_run_death_test_ = GTEST_FLAG_GET(internal_run_death_test); + list_tests_ = GTEST_FLAG_GET(list_tests); + output_ = GTEST_FLAG_GET(output); + brief_ = GTEST_FLAG_GET(brief); + print_time_ = GTEST_FLAG_GET(print_time); + print_utf8_ = GTEST_FLAG_GET(print_utf8); + random_seed_ = GTEST_FLAG_GET(random_seed); + repeat_ = GTEST_FLAG_GET(repeat); + recreate_environments_when_repeating_ = + GTEST_FLAG_GET(recreate_environments_when_repeating); + shuffle_ = GTEST_FLAG_GET(shuffle); + stack_trace_depth_ = GTEST_FLAG_GET(stack_trace_depth); + stream_result_to_ = GTEST_FLAG_GET(stream_result_to); + throw_on_failure_ = GTEST_FLAG_GET(throw_on_failure); } // The d'tor 
is not virtual. DO NOT INHERIT FROM THIS CLASS. ~GTestFlagSaver() { - GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_; - GTEST_FLAG(break_on_failure) = break_on_failure_; - GTEST_FLAG(catch_exceptions) = catch_exceptions_; - GTEST_FLAG(color) = color_; - GTEST_FLAG(death_test_style) = death_test_style_; - GTEST_FLAG(death_test_use_fork) = death_test_use_fork_; - GTEST_FLAG(filter) = filter_; - GTEST_FLAG(fail_fast) = fail_fast_; - GTEST_FLAG(internal_run_death_test) = internal_run_death_test_; - GTEST_FLAG(list_tests) = list_tests_; - GTEST_FLAG(output) = output_; - GTEST_FLAG(brief) = brief_; - GTEST_FLAG(print_time) = print_time_; - GTEST_FLAG(print_utf8) = print_utf8_; - GTEST_FLAG(random_seed) = random_seed_; - GTEST_FLAG(repeat) = repeat_; - GTEST_FLAG(shuffle) = shuffle_; - GTEST_FLAG(stack_trace_depth) = stack_trace_depth_; - GTEST_FLAG(stream_result_to) = stream_result_to_; - GTEST_FLAG(throw_on_failure) = throw_on_failure_; + GTEST_FLAG_SET(also_run_disabled_tests, also_run_disabled_tests_); + GTEST_FLAG_SET(break_on_failure, break_on_failure_); + GTEST_FLAG_SET(catch_exceptions, catch_exceptions_); + GTEST_FLAG_SET(color, color_); + GTEST_FLAG_SET(death_test_style, death_test_style_); + GTEST_FLAG_SET(death_test_use_fork, death_test_use_fork_); + GTEST_FLAG_SET(filter, filter_); + GTEST_FLAG_SET(fail_fast, fail_fast_); + GTEST_FLAG_SET(internal_run_death_test, internal_run_death_test_); + GTEST_FLAG_SET(list_tests, list_tests_); + GTEST_FLAG_SET(output, output_); + GTEST_FLAG_SET(brief, brief_); + GTEST_FLAG_SET(print_time, print_time_); + GTEST_FLAG_SET(print_utf8, print_utf8_); + GTEST_FLAG_SET(random_seed, random_seed_); + GTEST_FLAG_SET(repeat, repeat_); + GTEST_FLAG_SET(recreate_environments_when_repeating, + recreate_environments_when_repeating_); + GTEST_FLAG_SET(shuffle, shuffle_); + GTEST_FLAG_SET(stack_trace_depth, stack_trace_depth_); + GTEST_FLAG_SET(stream_result_to, stream_result_to_); + GTEST_FLAG_SET(throw_on_failure, 
throw_on_failure_); } private: @@ -224,6 +207,7 @@ class GTestFlagSaver { bool print_utf8_; int32_t random_seed_; int32_t repeat_; + bool recreate_environments_when_repeating_; bool shuffle_; int32_t stack_trace_depth_; std::string stream_result_to_; @@ -278,8 +262,8 @@ GTEST_API_ int32_t Int32FromEnvOrDie(const char* env_var, int32_t default_val); // returns true if and only if the test should be run on this shard. The test id // is some arbitrary but unique non-negative integer assigned to each test // method. Assumes that 0 <= shard_index < total_shards. -GTEST_API_ bool ShouldRunTestOnShard( - int total_shards, int shard_index, int test_id); +GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index, + int test_id); // STL container utilities. @@ -290,9 +274,8 @@ inline int CountIf(const Container& c, Predicate predicate) { // Implemented as an explicit loop since std::count_if() in libCstd on // Solaris has a non-standard signature. int count = 0; - for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) { - if (predicate(*it)) - ++count; + for (auto it = c.begin(); it != c.end(); ++it) { + if (predicate(*it)) ++count; } return count; } @@ -441,7 +424,9 @@ class OsStackTraceGetterInterface { static const char* const kElidedFramesMarker; private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface); + OsStackTraceGetterInterface(const OsStackTraceGetterInterface&) = delete; + OsStackTraceGetterInterface& operator=(const OsStackTraceGetterInterface&) = + delete; }; // A working implementation of the OsStackTraceGetterInterface interface. @@ -463,7 +448,8 @@ class OsStackTraceGetter : public OsStackTraceGetterInterface { void* caller_frame_ = nullptr; #endif // GTEST_HAS_ABSL - GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter); + OsStackTraceGetter(const OsStackTraceGetter&) = delete; + OsStackTraceGetter& operator=(const OsStackTraceGetter&) = delete; }; // Information about a Google Test trace point. 
@@ -476,7 +462,7 @@ struct TraceInfo { // This is the default global test part result reporter used in UnitTestImpl. // This class should only be used by UnitTestImpl. class DefaultGlobalTestPartResultReporter - : public TestPartResultReporterInterface { + : public TestPartResultReporterInterface { public: explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test); // Implements the TestPartResultReporterInterface. Reports the test part @@ -486,7 +472,10 @@ class DefaultGlobalTestPartResultReporter private: UnitTestImpl* const unit_test_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter); + DefaultGlobalTestPartResultReporter( + const DefaultGlobalTestPartResultReporter&) = delete; + DefaultGlobalTestPartResultReporter& operator=( + const DefaultGlobalTestPartResultReporter&) = delete; }; // This is the default per thread test part result reporter used in @@ -502,7 +491,10 @@ class DefaultPerThreadTestPartResultReporter private: UnitTestImpl* const unit_test_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter); + DefaultPerThreadTestPartResultReporter( + const DefaultPerThreadTestPartResultReporter&) = delete; + DefaultPerThreadTestPartResultReporter& operator=( + const DefaultPerThreadTestPartResultReporter&) = delete; }; // The private implementation of the UnitTest class. We don't protect @@ -640,7 +632,8 @@ class GTEST_API_ UnitTestImpl { // For example, if Foo() calls Bar(), which in turn calls // CurrentOsStackTraceExceptTop(1), Foo() will be included in the // trace but Bar() and CurrentOsStackTraceExceptTop() won't. - std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_; + std::string CurrentOsStackTraceExceptTop(int skip_count) + GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_; // Finds and returns a TestSuite with the given name. If one doesn't // exist, creates one and returns it. @@ -744,9 +737,7 @@ class GTEST_API_ UnitTestImpl { } // Clears the results of ad-hoc test assertions. 
- void ClearAdHocTestResult() { - ad_hoc_test_result_.Clear(); - } + void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); } // Adds a TestProperty to the current TestResult object when invoked in a // context of a test or a test suite, or to the global property set. If the @@ -754,10 +745,7 @@ class GTEST_API_ UnitTestImpl { // updated. void RecordProperty(const TestProperty& test_property); - enum ReactionToSharding { - HONOR_SHARDING_PROTOCOL, - IGNORE_SHARDING_PROTOCOL - }; + enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL }; // Matches the full name of each test against the user-specified // filter to decide whether the test should run, then records the @@ -963,7 +951,8 @@ class GTEST_API_ UnitTestImpl { // starts. bool catch_exceptions_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl); + UnitTestImpl(const UnitTestImpl&) = delete; + UnitTestImpl& operator=(const UnitTestImpl&) = delete; }; // class UnitTestImpl // Convenience function for accessing the global UnitTest @@ -986,8 +975,9 @@ GTEST_API_ bool IsValidEscape(char ch); GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch); GTEST_API_ bool ValidateRegex(const char* regex); GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str); -GTEST_API_ bool MatchRepetitionAndRegexAtHead( - bool escaped, char ch, char repeat, const char* regex, const char* str); +GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch, + char repeat, const char* regex, + const char* str); GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str); #endif // GTEST_USES_SIMPLE_RE @@ -1089,8 +1079,7 @@ class StreamingListener : public EmptyTestEventListener { } ~SocketWriter() override { - if (sockfd_ != -1) - CloseConnection(); + if (sockfd_ != -1) CloseConnection(); } // Sends a string to the socket. 
@@ -1100,9 +1089,8 @@ class StreamingListener : public EmptyTestEventListener { const auto len = static_cast<size_t>(message.length()); if (write(sockfd_, message.c_str(), len) != static_cast<ssize_t>(len)) { - GTEST_LOG_(WARNING) - << "stream_result_to: failed to stream to " - << host_name_ << ":" << port_num_; + GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to " + << host_name_ << ":" << port_num_; } } @@ -1123,7 +1111,8 @@ class StreamingListener : public EmptyTestEventListener { const std::string host_name_; const std::string port_num_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter); + SocketWriter(const SocketWriter&) = delete; + SocketWriter& operator=(const SocketWriter&) = delete; }; // class SocketWriter // Escapes '=', '&', '%', and '\n' characters in str as "%xx". @@ -1135,7 +1124,9 @@ class StreamingListener : public EmptyTestEventListener { } explicit StreamingListener(AbstractSocketWriter* socket_writer) - : socket_writer_(socket_writer) { Start(); } + : socket_writer_(socket_writer) { + Start(); + } void OnTestProgramStart(const UnitTest& /* unit_test */) override { SendLn("event=TestProgramStart"); @@ -1158,22 +1149,22 @@ class StreamingListener : public EmptyTestEventListener { void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) override { - SendLn("event=TestIterationEnd&passed=" + - FormatBool(unit_test.Passed()) + "&elapsed_time=" + - StreamableToString(unit_test.elapsed_time()) + "ms"); + SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) + + "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) + + "ms"); } // Note that "event=TestCaseStart" is a wire format and has to remain // "case" for compatibility - void OnTestCaseStart(const TestCase& test_case) override { - SendLn(std::string("event=TestCaseStart&name=") + test_case.name()); + void OnTestSuiteStart(const TestSuite& test_suite) override { + SendLn(std::string("event=TestCaseStart&name=") + test_suite.name()); } // Note 
that "event=TestCaseEnd" is a wire format and has to remain // "case" for compatibility - void OnTestCaseEnd(const TestCase& test_case) override { - SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) + - "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) + + void OnTestSuiteEnd(const TestSuite& test_suite) override { + SendLn("event=TestCaseEnd&passed=" + FormatBool(test_suite.Passed()) + + "&elapsed_time=" + StreamableToString(test_suite.elapsed_time()) + "ms"); } @@ -1183,8 +1174,7 @@ class StreamingListener : public EmptyTestEventListener { void OnTestEnd(const TestInfo& test_info) override { SendLn("event=TestEnd&passed=" + - FormatBool((test_info.result())->Passed()) + - "&elapsed_time=" + + FormatBool((test_info.result())->Passed()) + "&elapsed_time=" + StreamableToString((test_info.result())->elapsed_time()) + "ms"); } @@ -1208,7 +1198,8 @@ class StreamingListener : public EmptyTestEventListener { const std::unique_ptr<AbstractSocketWriter> socket_writer_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener); + StreamingListener(const StreamingListener&) = delete; + StreamingListener& operator=(const StreamingListener&) = delete; }; // class StreamingListener #endif // GTEST_CAN_STREAM_RESULTS_ diff --git a/libvpx/third_party/googletest/src/src/gtest-matchers.cc b/libvpx/third_party/googletest/src/src/gtest-matchers.cc index 65104ebab..7e3bcc0cf 100644 --- a/libvpx/third_party/googletest/src/src/gtest-matchers.cc +++ b/libvpx/third_party/googletest/src/src/gtest-matchers.cc @@ -32,12 +32,13 @@ // This file implements just enough of the matcher interface to allow // EXPECT_DEATH and friends to accept a matcher argument. 
-#include "gtest/internal/gtest-internal.h" -#include "gtest/internal/gtest-port.h" #include "gtest/gtest-matchers.h" #include <string> +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + namespace testing { // Constructs a matcher that matches a const std::string& whose value is diff --git a/libvpx/third_party/googletest/src/src/gtest-port.cc b/libvpx/third_party/googletest/src/src/gtest-port.cc index 53a4d37f9..d797fe4d5 100644 --- a/libvpx/third_party/googletest/src/src/gtest-port.cc +++ b/libvpx/third_party/googletest/src/src/gtest-port.cc @@ -27,61 +27,62 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #include "gtest/internal/gtest-port.h" #include <limits.h> #include <stdio.h> #include <stdlib.h> #include <string.h> + #include <cstdint> #include <fstream> #include <memory> #if GTEST_OS_WINDOWS -# include <windows.h> -# include <io.h> -# include <sys/stat.h> -# include <map> // Used in ThreadLocal. -# ifdef _MSC_VER -# include <crtdbg.h> -# endif // _MSC_VER +#include <io.h> +#include <sys/stat.h> +#include <windows.h> + +#include <map> // Used in ThreadLocal. 
+#ifdef _MSC_VER +#include <crtdbg.h> +#endif // _MSC_VER #else -# include <unistd.h> +#include <unistd.h> #endif // GTEST_OS_WINDOWS #if GTEST_OS_MAC -# include <mach/mach_init.h> -# include <mach/task.h> -# include <mach/vm_map.h> +#include <mach/mach_init.h> +#include <mach/task.h> +#include <mach/vm_map.h> #endif // GTEST_OS_MAC #if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \ GTEST_OS_NETBSD || GTEST_OS_OPENBSD -# include <sys/sysctl.h> -# if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD -# include <sys/user.h> -# endif +#include <sys/sysctl.h> +#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD +#include <sys/user.h> +#endif #endif #if GTEST_OS_QNX -# include <devctl.h> -# include <fcntl.h> -# include <sys/procfs.h> +#include <devctl.h> +#include <fcntl.h> +#include <sys/procfs.h> #endif // GTEST_OS_QNX #if GTEST_OS_AIX -# include <procinfo.h> -# include <sys/types.h> +#include <procinfo.h> +#include <sys/types.h> #endif // GTEST_OS_AIX #if GTEST_OS_FUCHSIA -# include <zircon/process.h> -# include <zircon/syscalls.h> +#include <zircon/process.h> +#include <zircon/syscalls.h> #endif // GTEST_OS_FUCHSIA -#include "gtest/gtest-spi.h" #include "gtest/gtest-message.h" +#include "gtest/gtest-spi.h" #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-string.h" #include "src/gtest-internal-inl.h" @@ -89,16 +90,7 @@ namespace testing { namespace internal { -#if defined(_MSC_VER) || defined(__BORLANDC__) -// MSVC and C++Builder do not provide a definition of STDERR_FILENO. 
-const int kStdOutFileno = 1; -const int kStdErrFileno = 2; -#else -const int kStdOutFileno = STDOUT_FILENO; -const int kStdErrFileno = STDERR_FILENO; -#endif // _MSC_VER - -#if GTEST_OS_LINUX +#if GTEST_OS_LINUX || GTEST_OS_GNU_HURD namespace { template <typename T> @@ -131,8 +123,7 @@ size_t GetThreadCount() { if (status == KERN_SUCCESS) { // task_threads allocates resources in thread_list and we need to free them // to avoid leaks. - vm_deallocate(task, - reinterpret_cast<vm_address_t>(thread_list), + vm_deallocate(task, reinterpret_cast<vm_address_t>(thread_list), sizeof(thread_t) * thread_count); return static_cast<size_t>(thread_count); } else { @@ -141,7 +132,7 @@ size_t GetThreadCount() { } #elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \ - GTEST_OS_NETBSD + GTEST_OS_NETBSD #if GTEST_OS_NETBSD #undef KERN_PROC @@ -184,12 +175,12 @@ size_t GetThreadCount() { // we cannot detect it. size_t GetThreadCount() { int mib[] = { - CTL_KERN, - KERN_PROC, - KERN_PROC_PID | KERN_PROC_SHOW_THREADS, - getpid(), - sizeof(struct kinfo_proc), - 0, + CTL_KERN, + KERN_PROC, + KERN_PROC_PID | KERN_PROC_SHOW_THREADS, + getpid(), + sizeof(struct kinfo_proc), + 0, }; u_int miblen = sizeof(mib) / sizeof(mib[0]); @@ -210,8 +201,7 @@ size_t GetThreadCount() { // exclude empty members size_t nthreads = 0; for (size_t i = 0; i < size / static_cast<size_t>(mib[4]); i++) { - if (info[i].p_tid != -1) - nthreads++; + if (info[i].p_tid != -1) nthreads++; } return nthreads; } @@ -254,13 +244,9 @@ size_t GetThreadCount() { size_t GetThreadCount() { int dummy_buffer; size_t avail; - zx_status_t status = zx_object_get_info( - zx_process_self(), - ZX_INFO_PROCESS_THREADS, - &dummy_buffer, - 0, - nullptr, - &avail); + zx_status_t status = + zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS, + &dummy_buffer, 0, nullptr, &avail); if (status == ZX_OK) { return avail; } else { @@ -280,27 +266,15 @@ size_t GetThreadCount() { #if GTEST_IS_THREADSAFE && 
GTEST_OS_WINDOWS -void SleepMilliseconds(int n) { - ::Sleep(static_cast<DWORD>(n)); -} +AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {} -AutoHandle::AutoHandle() - : handle_(INVALID_HANDLE_VALUE) {} +AutoHandle::AutoHandle(Handle handle) : handle_(handle) {} -AutoHandle::AutoHandle(Handle handle) - : handle_(handle) {} +AutoHandle::~AutoHandle() { Reset(); } -AutoHandle::~AutoHandle() { - Reset(); -} - -AutoHandle::Handle AutoHandle::Get() const { - return handle_; -} +AutoHandle::Handle AutoHandle::Get() const { return handle_; } -void AutoHandle::Reset() { - Reset(INVALID_HANDLE_VALUE); -} +void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); } void AutoHandle::Reset(HANDLE handle) { // Resetting with the same handle we already own is invalid. @@ -312,7 +286,7 @@ void AutoHandle::Reset(HANDLE handle) { } else { GTEST_CHECK_(!IsCloseable()) << "Resetting a valid handle to itself is likely a programmer error " - "and thus not allowed."; + "and thus not allowed."; } } @@ -322,23 +296,6 @@ bool AutoHandle::IsCloseable() const { return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE; } -Notification::Notification() - : event_(::CreateEvent(nullptr, // Default security attributes. - TRUE, // Do not reset automatically. - FALSE, // Initially unset. - nullptr)) { // Anonymous event. 
- GTEST_CHECK_(event_.Get() != nullptr); -} - -void Notification::Notify() { - GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE); -} - -void Notification::WaitForNotification() { - GTEST_CHECK_( - ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0); -} - Mutex::Mutex() : owner_thread_id_(0), type_(kDynamic), @@ -391,25 +348,25 @@ namespace { // MemoryIsNotDeallocated memory_is_not_deallocated; // critical_section_ = new CRITICAL_SECTION; // -class MemoryIsNotDeallocated -{ +class MemoryIsNotDeallocated { public: MemoryIsNotDeallocated() : old_crtdbg_flag_(0) { old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG); // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT // doesn't report mem leak if there's no matching deallocation. - _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF); + (void)_CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF); } ~MemoryIsNotDeallocated() { // Restore the original _CRTDBG_ALLOC_MEM_DF flag - _CrtSetDbgFlag(old_crtdbg_flag_); + (void)_CrtSetDbgFlag(old_crtdbg_flag_); } private: int old_crtdbg_flag_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated); + MemoryIsNotDeallocated(const MemoryIsNotDeallocated&) = delete; + MemoryIsNotDeallocated& operator=(const MemoryIsNotDeallocated&) = delete; }; #endif // _MSC_VER @@ -435,15 +392,13 @@ void Mutex::ThreadSafeLazyInit() { ::InitializeCriticalSection(critical_section_); // Updates the critical_section_init_phase_ to 2 to signal // initialization complete. - GTEST_CHECK_(::InterlockedCompareExchange( - &critical_section_init_phase_, 2L, 1L) == - 1L); + GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_, + 2L, 1L) == 1L); break; case 1: // Somebody else is already initializing the mutex; spin until they // are done. 
- while (::InterlockedCompareExchange(&critical_section_init_phase_, - 2L, + while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L, 2L) != 2L) { // Possibly yields the rest of the thread's time slice to other // threads. @@ -488,9 +443,7 @@ class ThreadWithParamSupport : public ThreadWithParamBase { private: struct ThreadMainParam { ThreadMainParam(Runnable* runnable, Notification* thread_can_start) - : runnable_(runnable), - thread_can_start_(thread_can_start) { - } + : runnable_(runnable), thread_can_start_(thread_can_start) {} std::unique_ptr<Runnable> runnable_; // Does not own. Notification* thread_can_start_; @@ -508,20 +461,18 @@ class ThreadWithParamSupport : public ThreadWithParamBase { // Prohibit instantiation. ThreadWithParamSupport(); - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport); + ThreadWithParamSupport(const ThreadWithParamSupport&) = delete; + ThreadWithParamSupport& operator=(const ThreadWithParamSupport&) = delete; }; } // namespace -ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable, +ThreadWithParamBase::ThreadWithParamBase(Runnable* runnable, Notification* thread_can_start) - : thread_(ThreadWithParamSupport::CreateThread(runnable, - thread_can_start)) { -} + : thread_( + ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {} -ThreadWithParamBase::~ThreadWithParamBase() { - Join(); -} +ThreadWithParamBase::~ThreadWithParamBase() { Join(); } void ThreadWithParamBase::Join() { GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0) @@ -548,8 +499,10 @@ class ThreadLocalRegistryImpl { ThreadIdToThreadLocals::iterator thread_local_pos = thread_to_thread_locals->find(current_thread); if (thread_local_pos == thread_to_thread_locals->end()) { - thread_local_pos = thread_to_thread_locals->insert( - std::make_pair(current_thread, ThreadLocalValues())).first; + thread_local_pos = + thread_to_thread_locals + ->insert(std::make_pair(current_thread, ThreadLocalValues())) + .first; 
StartWatcherThreadFor(current_thread); } ThreadLocalValues& thread_local_values = thread_local_pos->second; @@ -577,9 +530,8 @@ class ThreadLocalRegistryImpl { ThreadIdToThreadLocals* const thread_to_thread_locals = GetThreadLocalsMapLocked(); for (ThreadIdToThreadLocals::iterator it = - thread_to_thread_locals->begin(); - it != thread_to_thread_locals->end(); - ++it) { + thread_to_thread_locals->begin(); + it != thread_to_thread_locals->end(); ++it) { ThreadLocalValues& thread_local_values = it->second; ThreadLocalValues::iterator value_pos = thread_local_values.find(thread_local_instance); @@ -609,9 +561,8 @@ class ThreadLocalRegistryImpl { if (thread_local_pos != thread_to_thread_locals->end()) { ThreadLocalValues& thread_local_values = thread_local_pos->second; for (ThreadLocalValues::iterator value_pos = - thread_local_values.begin(); - value_pos != thread_local_values.end(); - ++value_pos) { + thread_local_values.begin(); + value_pos != thread_local_values.end(); ++value_pos) { value_holders.push_back(value_pos->second); } thread_to_thread_locals->erase(thread_local_pos); @@ -637,9 +588,8 @@ class ThreadLocalRegistryImpl { static void StartWatcherThreadFor(DWORD thread_id) { // The returned handle will be kept in thread_map and closed by // watcher_thread in WatcherThreadFunc. - HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, - FALSE, - thread_id); + HANDLE thread = + ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id); GTEST_CHECK_(thread != nullptr); // We need to pass a valid thread ID pointer into CreateThread for it // to work correctly under Win98. 
@@ -650,7 +600,8 @@ class ThreadLocalRegistryImpl { &ThreadLocalRegistryImpl::WatcherThreadFunc, reinterpret_cast<LPVOID>(new ThreadIdAndHandle(thread_id, thread)), CREATE_SUSPENDED, &watcher_thread_id); - GTEST_CHECK_(watcher_thread != nullptr); + GTEST_CHECK_(watcher_thread != nullptr) + << "CreateThread failed with error " << ::GetLastError() << "."; // Give the watcher thread the same priority as ours to avoid being // blocked by it. ::SetThreadPriority(watcher_thread, @@ -664,8 +615,7 @@ class ThreadLocalRegistryImpl { static DWORD WINAPI WatcherThreadFunc(LPVOID param) { const ThreadIdAndHandle* tah = reinterpret_cast<const ThreadIdAndHandle*>(param); - GTEST_CHECK_( - ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0); + GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0); OnThreadExit(tah->first); ::CloseHandle(tah->second); delete tah; @@ -689,16 +639,17 @@ class ThreadLocalRegistryImpl { }; Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex); // NOLINT -Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex); // NOLINT +Mutex ThreadLocalRegistryImpl::thread_map_mutex_( + Mutex::kStaticMutex); // NOLINT ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread( - const ThreadLocalBase* thread_local_instance) { + const ThreadLocalBase* thread_local_instance) { return ThreadLocalRegistryImpl::GetValueOnCurrentThread( thread_local_instance); } void ThreadLocalRegistry::OnThreadLocalDestroyed( - const ThreadLocalBase* thread_local_instance) { + const ThreadLocalBase* thread_local_instance) { ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance); } @@ -786,7 +737,7 @@ bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); } bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); } bool IsAsciiWordChar(char ch) { return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || - ('0' <= ch && ch <= '9') || ch == '_'; + ('0' <= ch && ch <= '9') || ch == '_'; } // 
Returns true if and only if "\\c" is a supported escape sequence. @@ -799,17 +750,28 @@ bool IsValidEscape(char c) { bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { if (escaped) { // "\\p" where p is pattern_char. switch (pattern_char) { - case 'd': return IsAsciiDigit(ch); - case 'D': return !IsAsciiDigit(ch); - case 'f': return ch == '\f'; - case 'n': return ch == '\n'; - case 'r': return ch == '\r'; - case 's': return IsAsciiWhiteSpace(ch); - case 'S': return !IsAsciiWhiteSpace(ch); - case 't': return ch == '\t'; - case 'v': return ch == '\v'; - case 'w': return IsAsciiWordChar(ch); - case 'W': return !IsAsciiWordChar(ch); + case 'd': + return IsAsciiDigit(ch); + case 'D': + return !IsAsciiDigit(ch); + case 'f': + return ch == '\f'; + case 'n': + return ch == '\n'; + case 'r': + return ch == '\r'; + case 's': + return IsAsciiWhiteSpace(ch); + case 'S': + return !IsAsciiWhiteSpace(ch); + case 't': + return ch == '\t'; + case 'v': + return ch == '\v'; + case 'w': + return IsAsciiWordChar(ch); + case 'W': + return !IsAsciiWordChar(ch); } return IsAsciiPunct(pattern_char) && pattern_char == ch; } @@ -820,7 +782,8 @@ bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { // Helper function used by ValidateRegex() to format error messages. 
static std::string FormatRegexSyntaxError(const char* regex, int index) { return (Message() << "Syntax error at index " << index - << " in simple regular expression \"" << regex << "\": ").GetString(); + << " in simple regular expression \"" << regex << "\": ") + .GetString(); } // Generates non-fatal failures and returns false if regex is invalid; @@ -862,12 +825,12 @@ bool ValidateRegex(const char* regex) { << "'$' can only appear at the end."; is_valid = false; } else if (IsInSet(ch, "()[]{}|")) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'" << ch << "' is unsupported."; + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch + << "' is unsupported."; is_valid = false; } else if (IsRepeat(ch) && !prev_repeatable) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'" << ch << "' can only follow a repeatable token."; + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch + << "' can only follow a repeatable token."; is_valid = false; } @@ -885,12 +848,10 @@ bool ValidateRegex(const char* regex) { // characters to be indexable by size_t, in which case the test will // probably time out anyway. We are fine with this limitation as // std::string has it too. -bool MatchRepetitionAndRegexAtHead( - bool escaped, char c, char repeat, const char* regex, - const char* str) { +bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat, + const char* regex, const char* str) { const size_t min_count = (repeat == '+') ? 1 : 0; - const size_t max_count = (repeat == '?') ? 1 : - static_cast<size_t>(-1) - 1; + const size_t max_count = (repeat == '?') ? 1 : static_cast<size_t>(-1) - 1; // We cannot call numeric_limits::max() as it conflicts with the // max() macro on Windows. @@ -903,8 +864,7 @@ bool MatchRepetitionAndRegexAtHead( // greedy match. 
return true; } - if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) - return false; + if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) return false; } return false; } @@ -918,25 +878,23 @@ bool MatchRegexAtHead(const char* regex, const char* str) { // "$" only matches the end of a string. Note that regex being // valid guarantees that there's nothing after "$" in it. - if (*regex == '$') - return *str == '\0'; + if (*regex == '$') return *str == '\0'; // Is the first thing in regex an escape sequence? const bool escaped = *regex == '\\'; - if (escaped) - ++regex; + if (escaped) ++regex; if (IsRepeat(regex[1])) { // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so // here's an indirect recursion. It terminates as the regex gets // shorter in each recursion. - return MatchRepetitionAndRegexAtHead( - escaped, regex[0], regex[1], regex + 2, str); + return MatchRepetitionAndRegexAtHead(escaped, regex[0], regex[1], regex + 2, + str); } else { // regex isn't empty, isn't "$", and doesn't start with a // repetition. We match the first atom of regex with the first // character of str and recurse. return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) && - MatchRegexAtHead(regex + 1, str + 1); + MatchRegexAtHead(regex + 1, str + 1); } } @@ -951,13 +909,11 @@ bool MatchRegexAtHead(const char* regex, const char* str) { bool MatchRegexAnywhere(const char* regex, const char* str) { if (regex == nullptr || str == nullptr) return false; - if (*regex == '^') - return MatchRegexAtHead(regex + 1, str); + if (*regex == '^') return MatchRegexAtHead(regex + 1, str); // A successful match can be anywhere in str. do { - if (MatchRegexAtHead(regex, str)) - return true; + if (MatchRegexAtHead(regex, str)) return true; } while (*str++ != '\0'); return false; } @@ -1038,8 +994,8 @@ GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { // FormatFileLocation in order to contrast the two functions. 
// Note that FormatCompilerIndependentFileLocation() does NOT append colon // to the file location it produces, unlike FormatFileLocation(). -GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( - const char* file, int line) { +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, + int line) { const std::string file_name(file == nullptr ? kUnknownFile : file); if (line < 0) @@ -1050,12 +1006,13 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line) : severity_(severity) { - const char* const marker = - severity == GTEST_INFO ? "[ INFO ]" : - severity == GTEST_WARNING ? "[WARNING]" : - severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]"; - GetStream() << ::std::endl << marker << " " - << FormatFileLocation(file, line).c_str() << ": "; + const char* const marker = severity == GTEST_INFO ? "[ INFO ]" + : severity == GTEST_WARNING ? "[WARNING]" + : severity == GTEST_ERROR ? "[ ERROR ]" + : "[ FATAL ]"; + GetStream() << ::std::endl + << marker << " " << FormatFileLocation(file, line).c_str() + << ": "; } // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. @@ -1078,27 +1035,26 @@ class CapturedStream { public: // The ctor redirects the stream to a temporary file. explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) { -# if GTEST_OS_WINDOWS - char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT - char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT +#if GTEST_OS_WINDOWS + char temp_dir_path[MAX_PATH + 1] = {'\0'}; // NOLINT + char temp_file_path[MAX_PATH + 1] = {'\0'}; // NOLINT ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path); - const UINT success = ::GetTempFileNameA(temp_dir_path, - "gtest_redir", + const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir", 0, // Generate unique file name. 
temp_file_path); GTEST_CHECK_(success != 0) << "Unable to create a temporary file in " << temp_dir_path; const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE); - GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file " - << temp_file_path; + GTEST_CHECK_(captured_fd != -1) + << "Unable to open temporary file " << temp_file_path; filename_ = temp_file_path; -# else +#else // There's no guarantee that a test has write access to the current // directory, so we create the temporary file in a temporary directory. std::string name_template; -# if GTEST_OS_LINUX_ANDROID +#if GTEST_OS_LINUX_ANDROID // Note: Android applications are expected to call the framework's // Context.getExternalStorageDirectory() method through JNI to get // the location of the world-writable SD Card directory. However, @@ -1111,7 +1067,7 @@ class CapturedStream { // '/sdcard' and other variants cannot be relied on, as they are not // guaranteed to be mounted, or may have a delay in mounting. name_template = "/data/local/tmp/"; -# elif GTEST_OS_IOS +#elif GTEST_OS_IOS char user_temp_dir[PATH_MAX + 1]; // Documented alternative to NSTemporaryDirectory() (for obtaining creating @@ -1132,9 +1088,9 @@ class CapturedStream { name_template = user_temp_dir; if (name_template.back() != GTEST_PATH_SEP_[0]) name_template.push_back(GTEST_PATH_SEP_[0]); -# else +#else name_template = "/tmp/"; -# endif +#endif name_template.append("gtest_captured_stream.XXXXXX"); // mkstemp() modifies the string bytes in place, and does not go beyond the @@ -1150,15 +1106,13 @@ class CapturedStream { << " for test; does the test have access to the /tmp directory?"; } filename_ = std::move(name_template); -# endif // GTEST_OS_WINDOWS +#endif // GTEST_OS_WINDOWS fflush(nullptr); dup2(captured_fd, fd_); close(captured_fd); } - ~CapturedStream() { - remove(filename_.c_str()); - } + ~CapturedStream() { remove(filename_.c_str()); } std::string GetCapturedString() { if (uncaptured_fd_ != -1) { @@ -1185,7 
+1139,8 @@ class CapturedStream { // Name of the temporary file holding the stderr output. ::std::string filename_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream); + CapturedStream(const CapturedStream&) = delete; + CapturedStream& operator=(const CapturedStream&) = delete; }; GTEST_DISABLE_MSC_DEPRECATED_POP_() @@ -1213,6 +1168,15 @@ static std::string GetCapturedStream(CapturedStream** captured_stream) { return content; } +#if defined(_MSC_VER) || defined(__BORLANDC__) +// MSVC and C++Builder do not provide a definition of STDERR_FILENO. +const int kStdOutFileno = 1; +const int kStdErrFileno = 2; +#else +const int kStdOutFileno = STDOUT_FILENO; +const int kStdErrFileno = STDERR_FILENO; +#endif // defined(_MSC_VER) || defined(__BORLANDC__) + // Starts capturing stdout. void CaptureStdout() { CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout); @@ -1235,10 +1199,6 @@ std::string GetCapturedStderr() { #endif // GTEST_HAS_STREAM_REDIRECTION - - - - size_t GetFileSize(FILE* file) { fseek(file, 0, SEEK_END); return static_cast<size_t>(ftell(file)); @@ -1256,7 +1216,8 @@ std::string ReadEntireFile(FILE* file) { // Keeps reading the file until we cannot read further or the // pre-determined file size is reached. do { - bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file); + bytes_last_read = + fread(buffer + bytes_read, 1, file_size - bytes_read, file); bytes_read += bytes_last_read; } while (bytes_last_read > 0 && bytes_read < file_size); @@ -1344,7 +1305,7 @@ bool ParseInt32(const Message& src_text, const char* str, int32_t* value) { // LONG_MAX or LONG_MIN when the input overflows.) result != long_value // The parsed value overflows as an int32_t. 
- ) { + ) { Message msg; msg << "WARNING: " << src_text << " is expected to be a 32-bit integer, but actually" @@ -1388,8 +1349,8 @@ int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) { } int32_t result = default_value; - if (!ParseInt32(Message() << "Environment variable " << env_var, - string_value, &result)) { + if (!ParseInt32(Message() << "Environment variable " << env_var, string_value, + &result)) { printf("The default value %s is used.\n", (Message() << default_value).GetString().c_str()); fflush(stdout); @@ -1408,7 +1369,7 @@ int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) { // not check that the flag is 'output' // In essence this checks an env variable called XML_OUTPUT_FILE // and if it is set we prepend "xml:" to its value, if it not set we return "" -std::string OutputFlagAlsoCheckEnvVar(){ +std::string OutputFlagAlsoCheckEnvVar() { std::string default_value_for_output_flag = ""; const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE"); if (nullptr != xml_output_file_env) { diff --git a/libvpx/third_party/googletest/src/src/gtest-printers.cc b/libvpx/third_party/googletest/src/src/gtest-printers.cc index 1b68fcb50..f3976d230 100644 --- a/libvpx/third_party/googletest/src/src/gtest-printers.cc +++ b/libvpx/third_party/googletest/src/src/gtest-printers.cc @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - // Google Test - The Google C++ Testing and Mocking Framework // // This file implements a universal value printer that can print a @@ -101,7 +100,7 @@ void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os); *os << " ... "; // Rounds up to 2-byte boundary. 
- const size_t resume_pos = (count - kChunkSize + 1)/2*2; + const size_t resume_pos = (count - kChunkSize + 1) / 2 * 2; PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os); } *os << ">"; @@ -136,11 +135,7 @@ void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count, // - as is if it's a printable ASCII (e.g. 'a', '2', ' '), // - as a hexadecimal escape sequence (e.g. '\x7F'), or // - as a special escape sequence (e.g. '\r', '\n'). -enum CharFormat { - kAsIs, - kHexEscape, - kSpecialEscape -}; +enum CharFormat { kAsIs, kHexEscape, kSpecialEscape }; // Returns true if c is a printable ASCII character. We test the // value of c directly instead of calling isprint(), which is buggy on @@ -213,35 +208,21 @@ static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) { } } -static const char* GetCharWidthPrefix(char) { - return ""; -} +static const char* GetCharWidthPrefix(char) { return ""; } -static const char* GetCharWidthPrefix(signed char) { - return ""; -} +static const char* GetCharWidthPrefix(signed char) { return ""; } -static const char* GetCharWidthPrefix(unsigned char) { - return ""; -} +static const char* GetCharWidthPrefix(unsigned char) { return ""; } #ifdef __cpp_char8_t -static const char* GetCharWidthPrefix(char8_t) { - return "u8"; -} +static const char* GetCharWidthPrefix(char8_t) { return "u8"; } #endif -static const char* GetCharWidthPrefix(char16_t) { - return "u"; -} +static const char* GetCharWidthPrefix(char16_t) { return "u"; } -static const char* GetCharWidthPrefix(char32_t) { - return "U"; -} +static const char* GetCharWidthPrefix(char32_t) { return "U"; } -static const char* GetCharWidthPrefix(wchar_t) { - return "L"; -} +static const char* GetCharWidthPrefix(wchar_t) { return "L"; } // Prints a char c as if it's part of a string literal, escaping it when // necessary; returns how c was formatted. 
@@ -276,8 +257,7 @@ void PrintCharAndCodeTo(Char c, ostream* os) { // To aid user debugging, we also print c's code in decimal, unless // it's 0 (in which case c was printed as '\\0', making the code // obvious). - if (c == 0) - return; + if (c == 0) return; *os << " (" << static_cast<int>(c); // For more convenience, we print c's code again in hexadecimal, @@ -304,17 +284,60 @@ void PrintTo(char32_t c, ::std::ostream* os) { << static_cast<uint32_t>(c); } +// gcc/clang __{u,}int128_t +#if defined(__SIZEOF_INT128__) +void PrintTo(__uint128_t v, ::std::ostream* os) { + if (v == 0) { + *os << "0"; + return; + } + + // Buffer large enough for ceil(log10(2^128))==39 and the null terminator + char buf[40]; + char* p = buf + sizeof(buf); + + // Some configurations have a __uint128_t, but no support for built in + // division. Do manual long division instead. + + uint64_t high = static_cast<uint64_t>(v >> 64); + uint64_t low = static_cast<uint64_t>(v); + + *--p = 0; + while (high != 0 || low != 0) { + uint64_t high_mod = high % 10; + high = high / 10; + // This is the long division algorithm specialized for a divisor of 10 and + // only two elements. + // Notable values: + // 2^64 / 10 == 1844674407370955161 + // 2^64 % 10 == 6 + const uint64_t carry = 6 * high_mod + low % 10; + low = low / 10 + high_mod * 1844674407370955161 + carry / 10; + + char digit = static_cast<char>(carry % 10); + *--p = '0' + digit; + } + *os << p; +} +void PrintTo(__int128_t v, ::std::ostream* os) { + __uint128_t uv = static_cast<__uint128_t>(v); + if (v < 0) { + *os << "-"; + uv = -uv; + } + PrintTo(uv, os); +} +#endif // __SIZEOF_INT128__ + // Prints the given array of characters to the ostream. CharType must be either // char, char8_t, char16_t, char32_t, or wchar_t. // The array starts at begin, the length is len, it may include '\0' characters // and may not be NUL-terminated. 
template <typename CharType> -GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ -GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ -GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ -GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ -static CharFormat PrintCharsAsStringTo( - const CharType* begin, size_t len, ostream* os) { +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat + PrintCharsAsStringTo(const CharType* begin, size_t len, ostream* os) { const char* const quote_prefix = GetCharWidthPrefix(*begin); *os << quote_prefix << "\""; bool is_previous_hex = false; @@ -340,12 +363,11 @@ static CharFormat PrintCharsAsStringTo( // Prints a (const) char/wchar_t array of 'len' elements, starting at address // 'begin'. CharType must be either char or wchar_t. template <typename CharType> -GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ -GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ -GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ -GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ -static void UniversalPrintCharArray( - const CharType* begin, size_t len, ostream* os) { +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void + UniversalPrintCharArray(const CharType* begin, size_t len, + ostream* os) { // The code // const char kFoo[] = "foo"; // generates an array of 4, not 3, elements, with the last one being '\0'. 
@@ -436,28 +458,28 @@ void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); } namespace { bool ContainsUnprintableControlCodes(const char* str, size_t length) { - const unsigned char *s = reinterpret_cast<const unsigned char *>(str); + const unsigned char* s = reinterpret_cast<const unsigned char*>(str); for (size_t i = 0; i < length; i++) { unsigned char ch = *s++; if (std::iscntrl(ch)) { - switch (ch) { + switch (ch) { case '\t': case '\n': case '\r': break; default: return true; - } } + } } return false; } -bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t<= 0xbf; } +bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; } bool IsValidUTF8(const char* str, size_t length) { - const unsigned char *s = reinterpret_cast<const unsigned char *>(str); + const unsigned char* s = reinterpret_cast<const unsigned char*>(str); for (size_t i = 0; i < length;) { unsigned char lead = s[i++]; @@ -470,15 +492,13 @@ bool IsValidUTF8(const char* str, size_t length) { } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) { ++i; // 2-byte character } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length && - IsUTF8TrailByte(s[i]) && - IsUTF8TrailByte(s[i + 1]) && + IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) && // check for non-shortest form and surrogate (lead != 0xe0 || s[i] >= 0xa0) && (lead != 0xed || s[i] < 0xa0)) { i += 2; // 3-byte character } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length && - IsUTF8TrailByte(s[i]) && - IsUTF8TrailByte(s[i + 1]) && + IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) && IsUTF8TrailByte(s[i + 2]) && // check for non-shortest form (lead != 0xf0 || s[i] >= 0x90) && @@ -502,7 +522,7 @@ void ConditionalPrintAsText(const char* str, size_t length, ostream* os) { void PrintStringTo(const ::std::string& s, ostream* os) { if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) { - if (GTEST_FLAG(print_utf8)) { + if (GTEST_FLAG_GET(print_utf8)) { 
ConditionalPrintAsText(s.data(), s.size(), os); } } diff --git a/libvpx/third_party/googletest/src/src/gtest-test-part.cc b/libvpx/third_party/googletest/src/src/gtest-test-part.cc index a938683ce..eb7c8d1cf 100644 --- a/libvpx/third_party/googletest/src/src/gtest-test-part.cc +++ b/libvpx/third_party/googletest/src/src/gtest-test-part.cc @@ -51,13 +51,11 @@ std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { return os << internal::FormatFileLocation(result.file_name(), result.line_number()) << " " - << (result.type() == TestPartResult::kSuccess - ? "Success" - : result.type() == TestPartResult::kSkip - ? "Skipped" - : result.type() == TestPartResult::kFatalFailure - ? "Fatal failure" - : "Non-fatal failure") + << (result.type() == TestPartResult::kSuccess ? "Success" + : result.type() == TestPartResult::kSkip ? "Skipped" + : result.type() == TestPartResult::kFatalFailure + ? "Fatal failure" + : "Non-fatal failure") << ":\n" << result.message() << std::endl; } @@ -86,8 +84,8 @@ namespace internal { HasNewFatalFailureHelper::HasNewFatalFailureHelper() : has_new_fatal_failure_(false), - original_reporter_(GetUnitTestImpl()-> - GetTestPartResultReporterForCurrentThread()) { + original_reporter_( + GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) { GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); } @@ -98,8 +96,7 @@ HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { void HasNewFatalFailureHelper::ReportTestPartResult( const TestPartResult& result) { - if (result.fatally_failed()) - has_new_fatal_failure_ = true; + if (result.fatally_failed()) has_new_fatal_failure_ = true; original_reporter_->ReportTestPartResult(result); } diff --git a/libvpx/third_party/googletest/src/src/gtest-typed-test.cc b/libvpx/third_party/googletest/src/src/gtest-typed-test.cc index c02c3df65..a2828b83c 100644 --- a/libvpx/third_party/googletest/src/src/gtest-typed-test.cc +++ 
b/libvpx/third_party/googletest/src/src/gtest-typed-test.cc @@ -27,7 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #include "gtest/gtest-typed-test.h" #include "gtest/gtest.h" @@ -38,8 +37,7 @@ namespace internal { // Skips to the first non-space char in str. Returns an empty string if str // contains only whitespace characters. static const char* SkipSpaces(const char* str) { - while (IsSpace(*str)) - str++; + while (IsSpace(*str)) str++; return str; } @@ -85,8 +83,7 @@ const char* TypedTestSuitePState::VerifyRegisteredTestNames( } for (RegisteredTestIter it = registered_tests_.begin(); - it != registered_tests_.end(); - ++it) { + it != registered_tests_.end(); ++it) { if (tests.count(it->first) == 0) { errors << "You forgot to list test " << it->first << ".\n"; } diff --git a/libvpx/third_party/googletest/src/src/gtest.cc b/libvpx/third_party/googletest/src/src/gtest.cc index 21c611aff..6f31dd226 100644 --- a/libvpx/third_party/googletest/src/src/gtest.cc +++ b/libvpx/third_party/googletest/src/src/gtest.cc @@ -31,8 +31,6 @@ // The Google C++ Testing and Mocking Framework (Google Test) #include "gtest/gtest.h" -#include "gtest/internal/custom/gtest.h" -#include "gtest/gtest-spi.h" #include <ctype.h> #include <stdarg.h> @@ -46,79 +44,87 @@ #include <chrono> // NOLINT #include <cmath> #include <cstdint> +#include <initializer_list> #include <iomanip> +#include <iterator> #include <limits> #include <list> #include <map> #include <ostream> // NOLINT #include <sstream> +#include <unordered_set> #include <vector> +#include "gtest/gtest-assertion-result.h" +#include "gtest/gtest-spi.h" +#include "gtest/internal/custom/gtest.h" + #if GTEST_OS_LINUX -# include <fcntl.h> // NOLINT -# include <limits.h> // NOLINT -# include <sched.h> // NOLINT +#include <fcntl.h> // NOLINT +#include <limits.h> // NOLINT +#include <sched.h> // NOLINT // Declares vsnprintf(). 
This header is not available on Windows. -# include <strings.h> // NOLINT -# include <sys/mman.h> // NOLINT -# include <sys/time.h> // NOLINT -# include <unistd.h> // NOLINT -# include <string> +#include <strings.h> // NOLINT +#include <sys/mman.h> // NOLINT +#include <sys/time.h> // NOLINT +#include <unistd.h> // NOLINT + +#include <string> #elif GTEST_OS_ZOS -# include <sys/time.h> // NOLINT +#include <sys/time.h> // NOLINT // On z/OS we additionally need strings.h for strcasecmp. -# include <strings.h> // NOLINT +#include <strings.h> // NOLINT #elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE. -# include <windows.h> // NOLINT -# undef min +#include <windows.h> // NOLINT +#undef min #elif GTEST_OS_WINDOWS // We are on Windows proper. -# include <windows.h> // NOLINT -# undef min +#include <windows.h> // NOLINT +#undef min #ifdef _MSC_VER -# include <crtdbg.h> // NOLINT +#include <crtdbg.h> // NOLINT #endif -# include <io.h> // NOLINT -# include <sys/timeb.h> // NOLINT -# include <sys/types.h> // NOLINT -# include <sys/stat.h> // NOLINT +#include <io.h> // NOLINT +#include <sys/stat.h> // NOLINT +#include <sys/timeb.h> // NOLINT +#include <sys/types.h> // NOLINT -# if GTEST_OS_WINDOWS_MINGW -# include <sys/time.h> // NOLINT -# endif // GTEST_OS_WINDOWS_MINGW +#if GTEST_OS_WINDOWS_MINGW +#include <sys/time.h> // NOLINT +#endif // GTEST_OS_WINDOWS_MINGW #else // cpplint thinks that the header is already included, so we want to // silence it. 
-# include <sys/time.h> // NOLINT -# include <unistd.h> // NOLINT +#include <sys/time.h> // NOLINT +#include <unistd.h> // NOLINT #endif // GTEST_OS_LINUX #if GTEST_HAS_EXCEPTIONS -# include <stdexcept> +#include <stdexcept> #endif #if GTEST_CAN_STREAM_RESULTS_ -# include <arpa/inet.h> // NOLINT -# include <netdb.h> // NOLINT -# include <sys/socket.h> // NOLINT -# include <sys/types.h> // NOLINT +#include <arpa/inet.h> // NOLINT +#include <netdb.h> // NOLINT +#include <sys/socket.h> // NOLINT +#include <sys/types.h> // NOLINT #endif #include "src/gtest-internal-inl.h" #if GTEST_OS_WINDOWS -# define vsnprintf _vsnprintf +#define vsnprintf _vsnprintf #endif // GTEST_OS_WINDOWS #if GTEST_OS_MAC @@ -131,7 +137,10 @@ #include "absl/debugging/failure_signal_handler.h" #include "absl/debugging/stacktrace.h" #include "absl/debugging/symbolize.h" +#include "absl/flags/parse.h" +#include "absl/flags/usage.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_replace.h" #endif // GTEST_HAS_ABSL namespace testing { @@ -177,7 +186,7 @@ const char kStackTraceMarker[] = "\nStack trace:\n"; // is specified on the command line. 
bool g_help_flag = false; -// Utilty function to Open File for Writing +// Utility function to Open File for Writing static FILE* OpenFileForWriting(const std::string& output_file) { FILE* fileout = nullptr; FilePath output_file_path(output_file); @@ -216,28 +225,33 @@ static bool GetDefaultFailFast() { return false; } +} // namespace testing + GTEST_DEFINE_bool_( - fail_fast, internal::BoolFromGTestEnv("fail_fast", GetDefaultFailFast()), + fail_fast, + testing::internal::BoolFromGTestEnv("fail_fast", + testing::GetDefaultFailFast()), "True if and only if a test failure should stop further test execution."); GTEST_DEFINE_bool_( also_run_disabled_tests, - internal::BoolFromGTestEnv("also_run_disabled_tests", false), + testing::internal::BoolFromGTestEnv("also_run_disabled_tests", false), "Run disabled tests too, in addition to the tests normally being run."); GTEST_DEFINE_bool_( - break_on_failure, internal::BoolFromGTestEnv("break_on_failure", false), + break_on_failure, + testing::internal::BoolFromGTestEnv("break_on_failure", false), "True if and only if a failed assertion should be a debugger " "break-point."); GTEST_DEFINE_bool_(catch_exceptions, - internal::BoolFromGTestEnv("catch_exceptions", true), + testing::internal::BoolFromGTestEnv("catch_exceptions", + true), "True if and only if " GTEST_NAME_ " should catch exceptions and treat them as test failures."); GTEST_DEFINE_string_( - color, - internal::StringFromGTestEnv("color", "auto"), + color, testing::internal::StringFromGTestEnv("color", "auto"), "Whether to use colors in the output. Valid values: yes, no, " "and auto. 
'auto' means to use colors if the output is " "being sent to a terminal and the TERM environment variable " @@ -245,7 +259,8 @@ GTEST_DEFINE_string_( GTEST_DEFINE_string_( filter, - internal::StringFromGTestEnv("filter", GetDefaultFilter()), + testing::internal::StringFromGTestEnv("filter", + testing::GetDefaultFilter()), "A colon-separated list of glob (not regex) patterns " "for filtering the tests to run, optionally followed by a " "'-' and a : separated list of negative patterns (tests to " @@ -254,13 +269,14 @@ GTEST_DEFINE_string_( GTEST_DEFINE_bool_( install_failure_signal_handler, - internal::BoolFromGTestEnv("install_failure_signal_handler", false), - "If true and supported on the current platform, " GTEST_NAME_ " should " + testing::internal::BoolFromGTestEnv("install_failure_signal_handler", + false), + "If true and supported on the current platform, " GTEST_NAME_ + " should " "install a signal handler that dumps debugging information when fatal " "signals are raised."); -GTEST_DEFINE_bool_(list_tests, false, - "List all tests without running them."); +GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them."); // The net priority order after flag processing is thus: // --gtest_output command line flag @@ -269,8 +285,8 @@ GTEST_DEFINE_bool_(list_tests, false, // '' GTEST_DEFINE_string_( output, - internal::StringFromGTestEnv("output", - internal::OutputFlagAlsoCheckEnvVar().c_str()), + testing::internal::StringFromGTestEnv( + "output", testing::internal::OutputFlagAlsoCheckEnvVar().c_str()), "A format (defaults to \"xml\" but can be specified to be \"json\"), " "optionally followed by a colon and an output file name or directory. " "A directory is indicated by a trailing pathname separator. 
" @@ -281,65 +297,79 @@ GTEST_DEFINE_string_( "digits."); GTEST_DEFINE_bool_( - brief, internal::BoolFromGTestEnv("brief", false), + brief, testing::internal::BoolFromGTestEnv("brief", false), "True if only test failures should be displayed in text output."); -GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true), +GTEST_DEFINE_bool_(print_time, + testing::internal::BoolFromGTestEnv("print_time", true), "True if and only if " GTEST_NAME_ " should display elapsed time in text output."); -GTEST_DEFINE_bool_(print_utf8, internal::BoolFromGTestEnv("print_utf8", true), +GTEST_DEFINE_bool_(print_utf8, + testing::internal::BoolFromGTestEnv("print_utf8", true), "True if and only if " GTEST_NAME_ " prints UTF8 characters as text."); GTEST_DEFINE_int32_( - random_seed, - internal::Int32FromGTestEnv("random_seed", 0), + random_seed, testing::internal::Int32FromGTestEnv("random_seed", 0), "Random number seed to use when shuffling test orders. Must be in range " "[1, 99999], or 0 to use a seed based on the current time."); GTEST_DEFINE_int32_( - repeat, - internal::Int32FromGTestEnv("repeat", 1), + repeat, testing::internal::Int32FromGTestEnv("repeat", 1), "How many times to repeat each test. Specify a negative number " "for repeating forever. Useful for shaking out flaky tests."); +GTEST_DEFINE_bool_( + recreate_environments_when_repeating, + testing::internal::BoolFromGTestEnv("recreate_environments_when_repeating", + false), + "Controls whether global test environments are recreated for each repeat " + "of the tests. If set to false the global test environments are only set " + "up once, for the first iteration, and only torn down once, for the last. " + "Useful for shaking out flaky tests with stable, expensive test " + "environments. 
If --gtest_repeat is set to a negative number, meaning " + "there is no last run, the environments will always be recreated to avoid " + "leaks."); + GTEST_DEFINE_bool_(show_internal_stack_frames, false, "True if and only if " GTEST_NAME_ " should include internal stack frames when " "printing test failure stack traces."); -GTEST_DEFINE_bool_(shuffle, internal::BoolFromGTestEnv("shuffle", false), +GTEST_DEFINE_bool_(shuffle, + testing::internal::BoolFromGTestEnv("shuffle", false), "True if and only if " GTEST_NAME_ " should randomize tests' order on every run."); GTEST_DEFINE_int32_( stack_trace_depth, - internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth), + testing::internal::Int32FromGTestEnv("stack_trace_depth", + testing::kMaxStackTraceDepth), "The maximum number of stack frames to print when an " "assertion fails. The valid range is 0 through 100, inclusive."); GTEST_DEFINE_string_( stream_result_to, - internal::StringFromGTestEnv("stream_result_to", ""), + testing::internal::StringFromGTestEnv("stream_result_to", ""), "This flag specifies the host name and the port number on which to stream " "test results. Example: \"localhost:555\". The flag is effective only on " "Linux."); GTEST_DEFINE_bool_( throw_on_failure, - internal::BoolFromGTestEnv("throw_on_failure", false), + testing::internal::BoolFromGTestEnv("throw_on_failure", false), "When this flag is specified, a failed assertion will throw an exception " "if exceptions are enabled or exit the program with a non-zero code " "otherwise. 
For use with an external test framework."); #if GTEST_USE_OWN_FLAGFILE_FLAG_ GTEST_DEFINE_string_( - flagfile, - internal::StringFromGTestEnv("flagfile", ""), + flagfile, testing::internal::StringFromGTestEnv("flagfile", ""), "This flag specifies the flagfile to read command-line flags from."); #endif // GTEST_USE_OWN_FLAGFILE_FLAG_ +namespace testing { namespace internal { // Generates a random number from [0, range), using a Linear @@ -348,10 +378,9 @@ namespace internal { uint32_t Random::Generate(uint32_t range) { // These constants are the same as are used in glibc's rand(3). // Use wider types than necessary to prevent unsigned overflow diagnostics. - state_ = static_cast<uint32_t>(1103515245ULL*state_ + 12345U) % kMaxRange; + state_ = static_cast<uint32_t>(1103515245ULL * state_ + 12345U) % kMaxRange; - GTEST_CHECK_(range > 0) - << "Cannot generate a number in the range [0, 0)."; + GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0)."; GTEST_CHECK_(range <= kMaxRange) << "Generation of a number in [0, " << range << ") was requested, " << "but this can only generate numbers in [0, " << kMaxRange << ")."; @@ -396,32 +425,26 @@ static bool ShouldRunTestSuite(const TestSuite* test_suite) { } // AssertHelper constructor. -AssertHelper::AssertHelper(TestPartResult::Type type, - const char* file, - int line, - const char* message) - : data_(new AssertHelperData(type, file, line, message)) { -} +AssertHelper::AssertHelper(TestPartResult::Type type, const char* file, + int line, const char* message) + : data_(new AssertHelperData(type, file, line, message)) {} -AssertHelper::~AssertHelper() { - delete data_; -} +AssertHelper::~AssertHelper() { delete data_; } // Message assignment, for assertion streaming support. 
void AssertHelper::operator=(const Message& message) const { - UnitTest::GetInstance()-> - AddTestPartResult(data_->type, data_->file, data_->line, - AppendUserMessage(data_->message, message), - UnitTest::GetInstance()->impl() - ->CurrentOsStackTraceExceptTop(1) - // Skips the stack frame for this function itself. - ); // NOLINT + UnitTest::GetInstance()->AddTestPartResult( + data_->type, data_->file, data_->line, + AppendUserMessage(data_->message, message), + UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1) + // Skips the stack frame for this function itself. + ); // NOLINT } namespace { // When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P -// to creates test cases for it, a syntetic test case is +// to creates test cases for it, a synthetic test case is // inserted to report ether an error or a log message. // // This configuration bit will likely be removed at some point. @@ -452,7 +475,6 @@ class FailureTest : public Test { const bool as_error_; }; - } // namespace std::set<std::string>* GetIgnoredParameterizedTestSuites() { @@ -496,7 +518,8 @@ void InsertSyntheticTestCase(const std::string& name, CodeLocation location, "To suppress this error for this test suite, insert the following line " "(in a non-header) in the namespace it is defined in:" "\n\n" - "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + name + ");"; + "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + + name + ");"; std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">"; RegisterTest( // @@ -516,19 +539,18 @@ void RegisterTypeParameterizedTestSuite(const char* test_suite_name, } void RegisterTypeParameterizedTestSuiteInstantiation(const char* case_name) { - GetUnitTestImpl() - ->type_parameterized_test_registry() - .RegisterInstantiation(case_name); + GetUnitTestImpl()->type_parameterized_test_registry().RegisterInstantiation( + case_name); } void TypeParameterizedTestSuiteRegistry::RegisterTestSuite( const char* test_suite_name, 
CodeLocation code_location) { suites_.emplace(std::string(test_suite_name), - TypeParameterizedTestSuiteInfo(code_location)); + TypeParameterizedTestSuiteInfo(code_location)); } void TypeParameterizedTestSuiteRegistry::RegisterInstantiation( - const char* test_suite_name) { + const char* test_suite_name) { auto it = suites_.find(std::string(test_suite_name)); if (it != suites_.end()) { it->second.instantiated = true; @@ -606,7 +628,8 @@ FilePath GetCurrentExecutableName() { // Returns the output format, or "" for normal printed output. std::string UnitTestOptions::GetOutputFormat() { - const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); + std::string s = GTEST_FLAG_GET(output); + const char* const gtest_output_flag = s.c_str(); const char* const colon = strchr(gtest_output_flag, ':'); return (colon == nullptr) ? std::string(gtest_output_flag) @@ -617,19 +640,19 @@ std::string UnitTestOptions::GetOutputFormat() { // Returns the name of the requested output file, or the default if none // was explicitly specified. 
std::string UnitTestOptions::GetAbsolutePathToOutputFile() { - const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); + std::string s = GTEST_FLAG_GET(output); + const char* const gtest_output_flag = s.c_str(); std::string format = GetOutputFormat(); - if (format.empty()) - format = std::string(kDefaultOutputFormat); + if (format.empty()) format = std::string(kDefaultOutputFormat); const char* const colon = strchr(gtest_output_flag, ':'); if (colon == nullptr) return internal::FilePath::MakeFileName( - internal::FilePath( - UnitTest::GetInstance()->original_working_dir()), - internal::FilePath(kDefaultOutputFile), 0, - format.c_str()).string(); + internal::FilePath( + UnitTest::GetInstance()->original_working_dir()), + internal::FilePath(kDefaultOutputFile), 0, format.c_str()) + .string(); internal::FilePath output_name(colon + 1); if (!output_name.IsAbsolutePath()) @@ -637,8 +660,7 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() { internal::FilePath(UnitTest::GetInstance()->original_working_dir()), internal::FilePath(colon + 1)); - if (!output_name.IsDirectory()) - return output_name.string(); + if (!output_name.IsDirectory()) return output_name.string(); internal::FilePath result(internal::FilePath::GenerateUniqueFileName( output_name, internal::GetCurrentExecutableName(), @@ -699,59 +721,119 @@ static bool PatternMatchesString(const std::string& name_str, return true; } -bool UnitTestOptions::MatchesFilter(const std::string& name_str, - const char* filter) { - // The filter is a list of patterns separated by colons (:). - const char* pattern = filter; - while (true) { - // Find the bounds of this pattern. - const char* const next_sep = strchr(pattern, ':'); - const char* const pattern_end = - next_sep != nullptr ? next_sep : pattern + strlen(pattern); - - // Check if this pattern matches name_str. 
- if (PatternMatchesString(name_str, pattern, pattern_end)) { - return true; - } +namespace { + +bool IsGlobPattern(const std::string& pattern) { + return std::any_of(pattern.begin(), pattern.end(), + [](const char c) { return c == '?' || c == '*'; }); +} + +class UnitTestFilter { + public: + UnitTestFilter() = default; + + // Constructs a filter from a string of patterns separated by `:`. + explicit UnitTestFilter(const std::string& filter) { + // By design "" filter matches "" string. + std::vector<std::string> all_patterns; + SplitString(filter, ':', &all_patterns); + const auto exact_match_patterns_begin = std::partition( + all_patterns.begin(), all_patterns.end(), &IsGlobPattern); + + glob_patterns_.reserve(static_cast<size_t>( + std::distance(all_patterns.begin(), exact_match_patterns_begin))); + std::move(all_patterns.begin(), exact_match_patterns_begin, + std::inserter(glob_patterns_, glob_patterns_.begin())); + std::move( + exact_match_patterns_begin, all_patterns.end(), + std::inserter(exact_match_patterns_, exact_match_patterns_.begin())); + } + + // Returns true if and only if name matches at least one of the patterns in + // the filter. + bool MatchesName(const std::string& name) const { + return exact_match_patterns_.count(name) > 0 || + std::any_of(glob_patterns_.begin(), glob_patterns_.end(), + [&name](const std::string& pattern) { + return PatternMatchesString( + name, pattern.c_str(), + pattern.c_str() + pattern.size()); + }); + } + + private: + std::vector<std::string> glob_patterns_; + std::unordered_set<std::string> exact_match_patterns_; +}; - // Give up on this pattern. However, if we found a pattern separator (:), - // advance to the next pattern (skipping over the separator) and restart. - if (next_sep == nullptr) { - return false; +class PositiveAndNegativeUnitTestFilter { + public: + // Constructs a positive and a negative filter from a string. 
The string + // contains a positive filter optionally followed by a '-' character and a + // negative filter. In case only a negative filter is provided the positive + // filter will be assumed "*". + // A filter is a list of patterns separated by ':'. + explicit PositiveAndNegativeUnitTestFilter(const std::string& filter) { + std::vector<std::string> positive_and_negative_filters; + + // NOTE: `SplitString` always returns a non-empty container. + SplitString(filter, '-', &positive_and_negative_filters); + const auto& positive_filter = positive_and_negative_filters.front(); + + if (positive_and_negative_filters.size() > 1) { + positive_filter_ = UnitTestFilter( + positive_filter.empty() ? kUniversalFilter : positive_filter); + + // TODO(b/214626361): Fail on multiple '-' characters + // For the moment to preserve old behavior we concatenate the rest of the + // string parts with `-` as separator to generate the negative filter. + auto negative_filter_string = positive_and_negative_filters[1]; + for (std::size_t i = 2; i < positive_and_negative_filters.size(); i++) + negative_filter_string = + negative_filter_string + '-' + positive_and_negative_filters[i]; + negative_filter_ = UnitTestFilter(negative_filter_string); + } else { + // In case we don't have a negative filter and positive filter is "" + // we do not use kUniversalFilter by design as opposed to when we have a + // negative filter. + positive_filter_ = UnitTestFilter(positive_filter); } - pattern = next_sep + 1; } - return true; + + // Returns true if and only if test name (this is generated by appending test + // suit name and test name via a '.' character) matches the positive filter + // and does not match the negative filter. + bool MatchesTest(const std::string& test_suite_name, + const std::string& test_name) const { + return MatchesName(test_suite_name + "." + test_name); + } + + // Returns true if and only if name matches the positive filter and does not + // match the negative filter. 
+ bool MatchesName(const std::string& name) const { + return positive_filter_.MatchesName(name) && + !negative_filter_.MatchesName(name); + } + + private: + UnitTestFilter positive_filter_; + UnitTestFilter negative_filter_; +}; +} // namespace + +bool UnitTestOptions::MatchesFilter(const std::string& name_str, + const char* filter) { + return UnitTestFilter(filter).MatchesName(name_str); } // Returns true if and only if the user-specified filter matches the test // suite name and the test name. bool UnitTestOptions::FilterMatchesTest(const std::string& test_suite_name, const std::string& test_name) { - const std::string& full_name = test_suite_name + "." + test_name.c_str(); - // Split --gtest_filter at '-', if there is one, to separate into // positive filter and negative filter portions - const char* const p = GTEST_FLAG(filter).c_str(); - const char* const dash = strchr(p, '-'); - std::string positive; - std::string negative; - if (dash == nullptr) { - positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter - negative = ""; - } else { - positive = std::string(p, dash); // Everything up to the dash - negative = std::string(dash + 1); // Everything after the dash - if (positive.empty()) { - // Treat '-test1' as the same as '*-test1' - positive = kUniversalFilter; - } - } - - // A filter is a colon-separated list of patterns. It matches a - // test if any pattern in it matches the test. 
- return (MatchesFilter(full_name, positive.c_str()) && - !MatchesFilter(full_name, negative.c_str())); + return PositiveAndNegativeUnitTestFilter(GTEST_FLAG_GET(filter)) + .MatchesTest(test_suite_name, test_name); } #if GTEST_HAS_SEH @@ -771,7 +853,7 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { bool should_handle = true; - if (!GTEST_FLAG(catch_exceptions)) + if (!GTEST_FLAG_GET(catch_exceptions)) should_handle = false; else if (exception_code == EXCEPTION_BREAKPOINT) should_handle = false; @@ -789,8 +871,7 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { // results. Intercepts only failures from the current thread. ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( TestPartResultArray* result) - : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), - result_(result) { + : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) { Init(); } @@ -799,8 +880,7 @@ ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( // results. ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( InterceptMode intercept_mode, TestPartResultArray* result) - : intercept_mode_(intercept_mode), - result_(result) { + : intercept_mode_(intercept_mode), result_(result) { Init(); } @@ -844,9 +924,7 @@ namespace internal { // from user test code. GetTestTypeId() is guaranteed to always // return the same value, as it always calls GetTypeId<>() from the // gtest.cc, which is within the Google Test framework. -TypeId GetTestTypeId() { - return GetTypeId<Test>(); -} +TypeId GetTestTypeId() { return GetTypeId<Test>(); } // The value of GetTestTypeId() as seen from within the Google Test // library. This is solely for testing GetTestTypeId(). @@ -861,9 +939,9 @@ static AssertionResult HasOneFailure(const char* /* results_expr */, const TestPartResultArray& results, TestPartResult::Type type, const std::string& substr) { - const std::string expected(type == TestPartResult::kFatalFailure ? 
- "1 fatal failure" : - "1 non-fatal failure"); + const std::string expected(type == TestPartResult::kFatalFailure + ? "1 fatal failure" + : "1 non-fatal failure"); Message msg; if (results.size() != 1) { msg << "Expected: " << expected << "\n" @@ -882,10 +960,10 @@ static AssertionResult HasOneFailure(const char* /* results_expr */, } if (strstr(r.message(), substr.c_str()) == nullptr) { - return AssertionFailure() << "Expected: " << expected << " containing \"" - << substr << "\"\n" - << " Actual:\n" - << r; + return AssertionFailure() + << "Expected: " << expected << " containing \"" << substr << "\"\n" + << " Actual:\n" + << r; } return AssertionSuccess(); @@ -908,7 +986,8 @@ SingleFailureChecker::~SingleFailureChecker() { } DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( - UnitTestImpl* unit_test) : unit_test_(unit_test) {} + UnitTestImpl* unit_test) + : unit_test_(unit_test) {} void DefaultGlobalTestPartResultReporter::ReportTestPartResult( const TestPartResult& result) { @@ -917,7 +996,8 @@ void DefaultGlobalTestPartResultReporter::ReportTestPartResult( } DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( - UnitTestImpl* unit_test) : unit_test_(unit_test) {} + UnitTestImpl* unit_test) + : unit_test_(unit_test) {} void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( const TestPartResult& result) { @@ -1024,11 +1104,10 @@ int UnitTestImpl::test_to_run_count() const { // trace but Bar() and CurrentOsStackTraceExceptTop() won't. std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { return os_stack_trace_getter()->CurrentStackTrace( - static_cast<int>(GTEST_FLAG(stack_trace_depth)), - skip_count + 1 + static_cast<int>(GTEST_FLAG_GET(stack_trace_depth)), skip_count + 1 // Skips the user-specified number of frames plus this function // itself. - ); // NOLINT + ); // NOLINT } // A helper class for measuring elapsed times. 
@@ -1072,8 +1151,7 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) { const int unicode_length = MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0); WCHAR* unicode = new WCHAR[unicode_length + 1]; - MultiByteToWideChar(CP_ACP, 0, ansi, length, - unicode, unicode_length); + MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length); unicode[unicode_length] = 0; return unicode; } @@ -1082,7 +1160,7 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) { // memory using new. The caller is responsible for deleting the return // value using delete[]. Returns the ANSI string, or NULL if the // input is NULL. -const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { +const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { if (!utf16_str) return nullptr; const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr, 0, nullptr, nullptr); @@ -1101,7 +1179,7 @@ const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { // Unlike strcmp(), this function can handle NULL argument(s). A NULL // C string is considered different to any non-NULL C string, // including the empty string. -bool String::CStringEquals(const char * lhs, const char * rhs) { +bool String::CStringEquals(const char* lhs, const char* rhs) { if (lhs == nullptr) return rhs == nullptr; if (rhs == nullptr) return false; @@ -1115,11 +1193,10 @@ bool String::CStringEquals(const char * lhs, const char * rhs) { // encoding, and streams the result to the given Message object. 
static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length, Message* msg) { - for (size_t i = 0; i != length; ) { // NOLINT + for (size_t i = 0; i != length;) { // NOLINT if (wstr[i] != L'\0') { *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i)); - while (i != length && wstr[i] != L'\0') - i++; + while (i != length && wstr[i] != L'\0') i++; } else { *msg << '\0'; i++; @@ -1161,17 +1238,17 @@ Message::Message() : ss_(new ::std::stringstream) { // These two overloads allow streaming a wide C string to a Message // using the UTF-8 encoding. -Message& Message::operator <<(const wchar_t* wide_c_str) { +Message& Message::operator<<(const wchar_t* wide_c_str) { return *this << internal::String::ShowWideCString(wide_c_str); } -Message& Message::operator <<(wchar_t* wide_c_str) { +Message& Message::operator<<(wchar_t* wide_c_str) { return *this << internal::String::ShowWideCString(wide_c_str); } #if GTEST_HAS_STD_WSTRING // Converts the given wide string to a narrow string using the UTF-8 // encoding, and streams the result to this Message object. -Message& Message::operator <<(const ::std::wstring& wstr) { +Message& Message::operator<<(const ::std::wstring& wstr) { internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); return *this; } @@ -1183,44 +1260,6 @@ std::string Message::GetString() const { return internal::StringStreamToString(ss_.get()); } -// AssertionResult constructors. -// Used in EXPECT_TRUE/FALSE(assertion_result). -AssertionResult::AssertionResult(const AssertionResult& other) - : success_(other.success_), - message_(other.message_.get() != nullptr - ? new ::std::string(*other.message_) - : static_cast< ::std::string*>(nullptr)) {} - -// Swaps two AssertionResults. -void AssertionResult::swap(AssertionResult& other) { - using std::swap; - swap(success_, other.success_); - swap(message_, other.message_); -} - -// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. 
-AssertionResult AssertionResult::operator!() const { - AssertionResult negation(!success_); - if (message_.get() != nullptr) negation << *message_; - return negation; -} - -// Makes a successful assertion result. -AssertionResult AssertionSuccess() { - return AssertionResult(true); -} - -// Makes a failed assertion result. -AssertionResult AssertionFailure() { - return AssertionResult(false); -} - -// Makes a failed assertion result with the given failure message. -// Deprecated; use AssertionFailure() << message. -AssertionResult AssertionFailure(const Message& message) { - return AssertionFailure() << message; -} - namespace internal { namespace edit_distance { @@ -1512,8 +1551,7 @@ std::vector<std::string> SplitEscapedString(const std::string& str) { AssertionResult EqFailure(const char* lhs_expression, const char* rhs_expression, const std::string& lhs_value, - const std::string& rhs_value, - bool ignoring_case) { + const std::string& rhs_value, bool ignoring_case) { Message msg; msg << "Expected equality of these values:"; msg << "\n " << lhs_expression; @@ -1530,10 +1568,8 @@ AssertionResult EqFailure(const char* lhs_expression, } if (!lhs_value.empty() && !rhs_value.empty()) { - const std::vector<std::string> lhs_lines = - SplitEscapedString(lhs_value); - const std::vector<std::string> rhs_lines = - SplitEscapedString(rhs_value); + const std::vector<std::string> lhs_lines = SplitEscapedString(lhs_value); + const std::vector<std::string> rhs_lines = SplitEscapedString(rhs_value); if (lhs_lines.size() > 1 || rhs_lines.size() > 1) { msg << "\nWith diff:\n" << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines); @@ -1545,27 +1581,21 @@ AssertionResult EqFailure(const char* lhs_expression, // Constructs a failure message for Boolean assertions such as EXPECT_TRUE. 
std::string GetBoolAssertionFailureMessage( - const AssertionResult& assertion_result, - const char* expression_text, - const char* actual_predicate_value, - const char* expected_predicate_value) { + const AssertionResult& assertion_result, const char* expression_text, + const char* actual_predicate_value, const char* expected_predicate_value) { const char* actual_message = assertion_result.message(); Message msg; msg << "Value of: " << expression_text << "\n Actual: " << actual_predicate_value; - if (actual_message[0] != '\0') - msg << " (" << actual_message << ")"; + if (actual_message[0] != '\0') msg << " (" << actual_message << ")"; msg << "\nExpected: " << expected_predicate_value; return msg.GetString(); } // Helper function for implementing ASSERT_NEAR. -AssertionResult DoubleNearPredFormat(const char* expr1, - const char* expr2, - const char* abs_error_expr, - double val1, - double val2, - double abs_error) { +AssertionResult DoubleNearPredFormat(const char* expr1, const char* expr2, + const char* abs_error_expr, double val1, + double val2, double abs_error) { const double diff = fabs(val1 - val2); if (diff <= abs_error) return AssertionSuccess(); @@ -1595,20 +1625,17 @@ AssertionResult DoubleNearPredFormat(const char* expr1, "EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead."; } return AssertionFailure() - << "The difference between " << expr1 << " and " << expr2 - << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n" - << expr1 << " evaluates to " << val1 << ",\n" - << expr2 << " evaluates to " << val2 << ", and\n" - << abs_error_expr << " evaluates to " << abs_error << "."; + << "The difference between " << expr1 << " and " << expr2 << " is " + << diff << ", which exceeds " << abs_error_expr << ", where\n" + << expr1 << " evaluates to " << val1 << ",\n" + << expr2 << " evaluates to " << val2 << ", and\n" + << abs_error_expr << " evaluates to " << abs_error << "."; } - // Helper template for implementing FloatLE() and DoubleLE(). 
template <typename RawType> -AssertionResult FloatingPointLE(const char* expr1, - const char* expr2, - RawType val1, - RawType val2) { +AssertionResult FloatingPointLE(const char* expr1, const char* expr2, + RawType val1, RawType val2) { // Returns success if val1 is less than val2, if (val1 < val2) { return AssertionSuccess(); @@ -1633,24 +1660,24 @@ AssertionResult FloatingPointLE(const char* expr1, << val2; return AssertionFailure() - << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n" - << " Actual: " << StringStreamToString(&val1_ss) << " vs " - << StringStreamToString(&val2_ss); + << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n" + << " Actual: " << StringStreamToString(&val1_ss) << " vs " + << StringStreamToString(&val2_ss); } } // namespace internal // Asserts that val1 is less than, or almost equal to, val2. Fails // otherwise. In particular, it fails if either val1 or val2 is NaN. -AssertionResult FloatLE(const char* expr1, const char* expr2, - float val1, float val2) { +AssertionResult FloatLE(const char* expr1, const char* expr2, float val1, + float val2) { return internal::FloatingPointLE<float>(expr1, expr2, val1, val2); } // Asserts that val1 is less than, or almost equal to, val2. Fails // otherwise. In particular, it fails if either val1 or val2 is NaN. -AssertionResult DoubleLE(const char* expr1, const char* expr2, - double val1, double val2) { +AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1, + double val2) { return internal::FloatingPointLE<double>(expr1, expr2, val1, val2); } @@ -1658,62 +1685,51 @@ namespace internal { // The helper function for {ASSERT|EXPECT}_STREQ. 
AssertionResult CmpHelperSTREQ(const char* lhs_expression, - const char* rhs_expression, - const char* lhs, + const char* rhs_expression, const char* lhs, const char* rhs) { if (String::CStringEquals(lhs, rhs)) { return AssertionSuccess(); } - return EqFailure(lhs_expression, - rhs_expression, - PrintToString(lhs), - PrintToString(rhs), - false); + return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs), + PrintToString(rhs), false); } // The helper function for {ASSERT|EXPECT}_STRCASEEQ. AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression, - const char* rhs_expression, - const char* lhs, + const char* rhs_expression, const char* lhs, const char* rhs) { if (String::CaseInsensitiveCStringEquals(lhs, rhs)) { return AssertionSuccess(); } - return EqFailure(lhs_expression, - rhs_expression, - PrintToString(lhs), - PrintToString(rhs), - true); + return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs), + PrintToString(rhs), true); } // The helper function for {ASSERT|EXPECT}_STRNE. AssertionResult CmpHelperSTRNE(const char* s1_expression, - const char* s2_expression, - const char* s1, + const char* s2_expression, const char* s1, const char* s2) { if (!String::CStringEquals(s1, s2)) { return AssertionSuccess(); } else { - return AssertionFailure() << "Expected: (" << s1_expression << ") != (" - << s2_expression << "), actual: \"" - << s1 << "\" vs \"" << s2 << "\""; + return AssertionFailure() + << "Expected: (" << s1_expression << ") != (" << s2_expression + << "), actual: \"" << s1 << "\" vs \"" << s2 << "\""; } } // The helper function for {ASSERT|EXPECT}_STRCASENE. 
AssertionResult CmpHelperSTRCASENE(const char* s1_expression, - const char* s2_expression, - const char* s1, + const char* s2_expression, const char* s1, const char* s2) { if (!String::CaseInsensitiveCStringEquals(s1, s2)) { return AssertionSuccess(); } else { return AssertionFailure() - << "Expected: (" << s1_expression << ") != (" - << s2_expression << ") (ignoring case), actual: \"" - << s1 << "\" vs \"" << s2 << "\""; + << "Expected: (" << s1_expression << ") != (" << s2_expression + << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2 << "\""; } } @@ -1741,8 +1757,7 @@ bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) { // StringType here can be either ::std::string or ::std::wstring. template <typename StringType> -bool IsSubstringPred(const StringType& needle, - const StringType& haystack) { +bool IsSubstringPred(const StringType& needle, const StringType& haystack) { return haystack.find(needle) != StringType::npos; } @@ -1751,21 +1766,22 @@ bool IsSubstringPred(const StringType& needle, // StringType here can be const char*, const wchar_t*, ::std::string, // or ::std::wstring. template <typename StringType> -AssertionResult IsSubstringImpl( - bool expected_to_be_substring, - const char* needle_expr, const char* haystack_expr, - const StringType& needle, const StringType& haystack) { +AssertionResult IsSubstringImpl(bool expected_to_be_substring, + const char* needle_expr, + const char* haystack_expr, + const StringType& needle, + const StringType& haystack) { if (IsSubstringPred(needle, haystack) == expected_to_be_substring) return AssertionSuccess(); const bool is_wide_string = sizeof(needle[0]) > 1; const char* const begin_string_quote = is_wide_string ? "L\"" : "\""; return AssertionFailure() - << "Value of: " << needle_expr << "\n" - << " Actual: " << begin_string_quote << needle << "\"\n" - << "Expected: " << (expected_to_be_substring ? 
"" : "not ") - << "a substring of " << haystack_expr << "\n" - << "Which is: " << begin_string_quote << haystack << "\""; + << "Value of: " << needle_expr << "\n" + << " Actual: " << begin_string_quote << needle << "\"\n" + << "Expected: " << (expected_to_be_substring ? "" : "not ") + << "a substring of " << haystack_expr << "\n" + << "Which is: " << begin_string_quote << haystack << "\""; } } // namespace @@ -1774,52 +1790,52 @@ AssertionResult IsSubstringImpl( // substring of haystack (NULL is considered a substring of itself // only), and return an appropriate error message when they fail. -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack) { +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack) { return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack) { +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack) { return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack) { +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, const char* needle, + const char* haystack) { return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack) { +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, const wchar_t* needle, + const wchar_t* haystack) { return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); } -AssertionResult 
IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack) { +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack) { return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack) { +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack) { return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); } #if GTEST_HAS_STD_WSTRING -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack) { +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack) { return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); } -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack) { +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack) { return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); } #endif // GTEST_HAS_STD_WSTRING @@ -1831,43 +1847,42 @@ namespace internal { namespace { // Helper function for IsHRESULT{SuccessFailure} predicates -AssertionResult HRESULTFailureHelper(const char* expr, - const char* expected, +AssertionResult HRESULTFailureHelper(const char* expr, const char* expected, long hr) { // NOLINT -# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE // Windows CE doesn't 
support FormatMessage. const char error_text[] = ""; -# else +#else // Looks up the human-readable system message for the HRESULT code // and since we're not passing any params to FormatMessage, we don't // want inserts expanded. - const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS; + const DWORD kFlags = + FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS; const DWORD kBufSize = 4096; // Gets the system's human readable message string for this HRESULT. - char error_text[kBufSize] = { '\0' }; + char error_text[kBufSize] = {'\0'}; DWORD message_length = ::FormatMessageA(kFlags, - 0, // no source, we're asking system + 0, // no source, we're asking system static_cast<DWORD>(hr), // the error - 0, // no line width restrictions + 0, // no line width restrictions error_text, // output buffer kBufSize, // buf size nullptr); // no arguments for inserts // Trims tailing white space (FormatMessage leaves a trailing CR-LF) for (; message_length && IsSpace(error_text[message_length - 1]); - --message_length) { + --message_length) { error_text[message_length - 1] = '\0'; } -# endif // GTEST_OS_WINDOWS_MOBILE +#endif // GTEST_OS_WINDOWS_MOBILE const std::string error_hex("0x" + String::FormatHexInt(hr)); return ::testing::AssertionFailure() - << "Expected: " << expr << " " << expected << ".\n" - << " Actual: " << error_hex << " " << error_text << "\n"; + << "Expected: " << expr << " " << expected << ".\n" + << " Actual: " << error_hex << " " << error_text << "\n"; } } // namespace @@ -1901,16 +1916,18 @@ AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT // 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx // The maximum code-point a one-byte UTF-8 sequence can represent. -constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1; +constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1; // The maximum code-point a two-byte UTF-8 sequence can represent. 
constexpr uint32_t kMaxCodePoint2 = (static_cast<uint32_t>(1) << (5 + 6)) - 1; // The maximum code-point a three-byte UTF-8 sequence can represent. -constexpr uint32_t kMaxCodePoint3 = (static_cast<uint32_t>(1) << (4 + 2*6)) - 1; +constexpr uint32_t kMaxCodePoint3 = + (static_cast<uint32_t>(1) << (4 + 2 * 6)) - 1; // The maximum code-point a four-byte UTF-8 sequence can represent. -constexpr uint32_t kMaxCodePoint4 = (static_cast<uint32_t>(1) << (3 + 3*6)) - 1; +constexpr uint32_t kMaxCodePoint4 = + (static_cast<uint32_t>(1) << (3 + 3 * 6)) - 1; // Chops off the n lowest bits from a bit pattern. Returns the n // lowest bits. As a side effect, the original bit pattern will be @@ -1935,7 +1952,7 @@ std::string CodePointToUtf8(uint32_t code_point) { char str[5]; // Big enough for the largest valid code point. if (code_point <= kMaxCodePoint1) { str[1] = '\0'; - str[0] = static_cast<char>(code_point); // 0xxxxxxx + str[0] = static_cast<char>(code_point); // 0xxxxxxx } else if (code_point <= kMaxCodePoint2) { str[2] = '\0'; str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx @@ -1963,8 +1980,8 @@ std::string CodePointToUtf8(uint32_t code_point) { // and thus should be combined into a single Unicode code point // using CreateCodePointFromUtf16SurrogatePair. inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) { - return sizeof(wchar_t) == 2 && - (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00; + return sizeof(wchar_t) == 2 && (first & 0xFC00) == 0xD800 && + (second & 0xFC00) == 0xDC00; } // Creates a Unicode code point from UTF16 surrogate pair. @@ -1995,8 +2012,7 @@ inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first, // and contains invalid UTF-16 surrogate pairs, values in those pairs // will be encoded as individual Unicode characters from Basic Normal Plane. 
std::string WideStringToUtf8(const wchar_t* str, int num_chars) { - if (num_chars == -1) - num_chars = static_cast<int>(wcslen(str)); + if (num_chars == -1) num_chars = static_cast<int>(wcslen(str)); ::std::stringstream stream; for (int i = 0; i < num_chars; ++i) { @@ -2005,8 +2021,8 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) { if (str[i] == L'\0') { break; } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) { - unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i], - str[i + 1]); + unicode_code_point = + CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]); i++; } else { unicode_code_point = static_cast<uint32_t>(str[i]); @@ -2019,7 +2035,7 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) { // Converts a wide C string to an std::string using the UTF-8 encoding. // NULL will be converted to "(null)". -std::string String::ShowWideCString(const wchar_t * wide_c_str) { +std::string String::ShowWideCString(const wchar_t* wide_c_str) { if (wide_c_str == nullptr) return "(null)"; return internal::WideStringToUtf8(wide_c_str, -1); @@ -2031,7 +2047,7 @@ std::string String::ShowWideCString(const wchar_t * wide_c_str) { // Unlike wcscmp(), this function can handle NULL argument(s). A NULL // C string is considered different to any non-NULL C string, // including the empty string. -bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) { +bool String::WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs) { if (lhs == nullptr) return rhs == nullptr; if (rhs == nullptr) return false; @@ -2041,33 +2057,27 @@ bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) { // Helper function for *_STREQ on wide strings. 
AssertionResult CmpHelperSTREQ(const char* lhs_expression, - const char* rhs_expression, - const wchar_t* lhs, + const char* rhs_expression, const wchar_t* lhs, const wchar_t* rhs) { if (String::WideCStringEquals(lhs, rhs)) { return AssertionSuccess(); } - return EqFailure(lhs_expression, - rhs_expression, - PrintToString(lhs), - PrintToString(rhs), - false); + return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs), + PrintToString(rhs), false); } // Helper function for *_STRNE on wide strings. AssertionResult CmpHelperSTRNE(const char* s1_expression, - const char* s2_expression, - const wchar_t* s1, + const char* s2_expression, const wchar_t* s1, const wchar_t* s2) { if (!String::WideCStringEquals(s1, s2)) { return AssertionSuccess(); } - return AssertionFailure() << "Expected: (" << s1_expression << ") != (" - << s2_expression << "), actual: " - << PrintToString(s1) - << " vs " << PrintToString(s2); + return AssertionFailure() + << "Expected: (" << s1_expression << ") != (" << s2_expression + << "), actual: " << PrintToString(s1) << " vs " << PrintToString(s2); } // Compares two C strings, ignoring case. Returns true if and only if they have @@ -2076,7 +2086,7 @@ AssertionResult CmpHelperSTRNE(const char* s1_expression, // Unlike strcasecmp(), this function can handle NULL argument(s). A // NULL C string is considered different to any non-NULL C string, // including the empty string. -bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) { +bool String::CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) { if (lhs == nullptr) return rhs == nullptr; if (rhs == nullptr) return false; return posix::StrCaseCmp(lhs, rhs) == 0; @@ -2118,8 +2128,8 @@ bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs, // Returns true if and only if str ends with the given suffix, ignoring case. // Any string is considered to end with an empty suffix. 
-bool String::EndsWithCaseInsensitive( - const std::string& str, const std::string& suffix) { +bool String::EndsWithCaseInsensitive(const std::string& str, + const std::string& suffix) { const size_t str_len = str.length(); const size_t suffix_len = suffix.length(); return (str_len >= suffix_len) && @@ -2202,15 +2212,13 @@ TestResult::TestResult() : death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {} // D'tor. -TestResult::~TestResult() { -} +TestResult::~TestResult() {} // Returns the i-th test part result among all the results. i can // range from 0 to total_part_count() - 1. If i is not in that range, // aborts the program. const TestPartResult& TestResult::GetTestPartResult(int i) const { - if (i < 0 || i >= total_part_count()) - internal::posix::Abort(); + if (i < 0 || i >= total_part_count()) internal::posix::Abort(); return test_part_results_.at(static_cast<size_t>(i)); } @@ -2218,15 +2226,12 @@ const TestPartResult& TestResult::GetTestPartResult(int i) const { // test_property_count() - 1. If i is not in that range, aborts the // program. const TestProperty& TestResult::GetTestProperty(int i) const { - if (i < 0 || i >= test_property_count()) - internal::posix::Abort(); + if (i < 0 || i >= test_property_count()) internal::posix::Abort(); return test_properties_.at(static_cast<size_t>(i)); } // Clears the test part results. -void TestResult::ClearTestPartResults() { - test_part_results_.clear(); -} +void TestResult::ClearTestPartResults() { test_part_results_.clear(); } // Adds a test part result to the list. void TestResult::AddTestPartResult(const TestPartResult& test_part_result) { @@ -2255,15 +2260,8 @@ void TestResult::RecordProperty(const std::string& xml_element, // The list of reserved attributes used in the <testsuites> element of XML // output. 
static const char* const kReservedTestSuitesAttributes[] = { - "disabled", - "errors", - "failures", - "name", - "random_seed", - "tests", - "time", - "timestamp" -}; + "disabled", "errors", "failures", "name", + "random_seed", "tests", "time", "timestamp"}; // The list of reserved attributes used in the <testsuite> element of XML // output. @@ -2273,8 +2271,8 @@ static const char* const kReservedTestSuiteAttributes[] = { // The list of reserved attributes used in the <testcase> element of XML output. static const char* const kReservedTestCaseAttributes[] = { - "classname", "name", "status", "time", "type_param", - "value_param", "file", "line"}; + "classname", "name", "status", "time", + "type_param", "value_param", "file", "line"}; // Use a slightly different set for allowed output to ensure existing tests can // still RecordProperty("result") or "RecordProperty(timestamp") @@ -2336,7 +2334,7 @@ static bool ValidateTestPropertyName( const std::string& property_name, const std::vector<std::string>& reserved_names) { if (std::find(reserved_names.begin(), reserved_names.end(), property_name) != - reserved_names.end()) { + reserved_names.end()) { ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name << " (" << FormatWordList(reserved_names) << " are reserved by " << GTEST_NAME_ << ")"; @@ -2374,8 +2372,7 @@ bool TestResult::Skipped() const { // Returns true if and only if the test failed. bool TestResult::Failed() const { for (int i = 0; i < total_part_count(); ++i) { - if (GetTestPartResult(i).failed()) - return true; + if (GetTestPartResult(i).failed()) return true; } return false; } @@ -2416,27 +2413,22 @@ int TestResult::test_property_count() const { // Creates a Test object. // The c'tor saves the states of all flags. -Test::Test() - : gtest_flag_saver_(new GTEST_FLAG_SAVER_) { -} +Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {} // The d'tor restores the states of all flags. 
The actual work is // done by the d'tor of the gtest_flag_saver_ field, and thus not // visible here. -Test::~Test() { -} +Test::~Test() {} // Sets up the test fixture. // // A sub-class may override this. -void Test::SetUp() { -} +void Test::SetUp() {} // Tears down the test fixture. // // A sub-class may override this. -void Test::TearDown() { -} +void Test::TearDown() {} // Allows user supplied key value pairs to be recorded for later output. void Test::RecordProperty(const std::string& key, const std::string& value) { @@ -2541,8 +2533,8 @@ bool Test::HasSameFixtureClass() { static std::string* FormatSehExceptionMessage(DWORD exception_code, const char* location) { Message message; - message << "SEH exception with code 0x" << std::setbase(16) << - exception_code << std::setbase(10) << " thrown in " << location << "."; + message << "SEH exception with code 0x" << std::setbase(16) << exception_code + << std::setbase(10) << " thrown in " << location << "."; return new std::string(message.GetString()); } @@ -2585,8 +2577,8 @@ GoogleTestFailureException::GoogleTestFailureException( // exceptions in the same function. Therefore, we provide a separate // wrapper function for handling SEH exceptions.) template <class T, typename Result> -Result HandleSehExceptionsInMethodIfSupported( - T* object, Result (T::*method)(), const char* location) { +Result HandleSehExceptionsInMethodIfSupported(T* object, Result (T::*method)(), + const char* location) { #if GTEST_HAS_SEH __try { return (object->*method)(); @@ -2595,8 +2587,8 @@ Result HandleSehExceptionsInMethodIfSupported( // We create the exception message on the heap because VC++ prohibits // creation of objects with destructors on stack in functions using __try // (see error C2712). 
- std::string* exception_message = FormatSehExceptionMessage( - GetExceptionCode(), location); + std::string* exception_message = + FormatSehExceptionMessage(GetExceptionCode(), location); internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure, *exception_message); delete exception_message; @@ -2612,8 +2604,8 @@ Result HandleSehExceptionsInMethodIfSupported( // exceptions, if they are supported; returns the 0-value for type // Result in case of an SEH exception. template <class T, typename Result> -Result HandleExceptionsInMethodIfSupported( - T* object, Result (T::*method)(), const char* location) { +Result HandleExceptionsInMethodIfSupported(T* object, Result (T::*method)(), + const char* location) { // NOTE: The user code can affect the way in which Google Test handles // exceptions by setting GTEST_FLAG(catch_exceptions), but only before // RUN_ALL_TESTS() starts. It is technically possible to check the flag @@ -2623,7 +2615,7 @@ Result HandleExceptionsInMethodIfSupported( // try { // // Perform the test method. // } catch (...) { - // if (GTEST_FLAG(catch_exceptions)) + // if (GTEST_FLAG_GET(catch_exceptions)) // // Report the exception as failure. // else // throw; // Re-throws the original exception. @@ -2679,16 +2671,16 @@ void Test::Run() { // GTEST_SKIP(). if (!HasFatalFailure() && !IsSkipped()) { impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &Test::TestBody, "the test body"); + internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody, + "the test body"); } // However, we want to clean up as much as possible. Hence we will // always call TearDown(), even if SetUp() or the test body has // failed. 
impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &Test::TearDown, "TearDown()"); + internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown, + "TearDown()"); } // Returns true if and only if the current test has a fatal failure. @@ -2698,8 +2690,9 @@ bool Test::HasFatalFailure() { // Returns true if and only if the current test has a non-fatal failure. bool Test::HasNonfatalFailure() { - return internal::GetUnitTestImpl()->current_test_result()-> - HasNonfatalFailure(); + return internal::GetUnitTestImpl() + ->current_test_result() + ->HasNonfatalFailure(); } // Returns true if and only if the current test was skipped. @@ -2799,11 +2792,10 @@ class TestNameIs { // Constructor. // // TestNameIs has NO default constructor. - explicit TestNameIs(const char* name) - : name_(name) {} + explicit TestNameIs(const char* name) : name_(name) {} // Returns true if and only if the test name of test_info matches name_. - bool operator()(const TestInfo * test_info) const { + bool operator()(const TestInfo* test_info) const { return test_info && test_info->name() == name_; } @@ -2831,20 +2823,20 @@ void UnitTestImpl::RegisterParameterizedTests() { // Creates the test object, runs it, records its result, and then // deletes it. void TestInfo::Run() { - if (!should_run_) return; + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + if (!should_run_) { + if (is_disabled_ && matches_filter_) repeater->OnTestDisabled(*this); + return; + } // Tells UnitTest where to store test result. internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); impl->set_current_test_info(this); - TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); - // Notifies the unit test event listeners that a test is about to start. 
repeater->OnTestStart(*this); - result_.set_start_timestamp(internal::GetTimeInMillis()); internal::Timer timer; - impl->os_stack_trace_getter()->UponLeavingGTest(); // Creates the test object. @@ -3009,11 +3001,18 @@ void TestSuite::Run() { internal::HandleExceptionsInMethodIfSupported( this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()"); + const bool skip_all = ad_hoc_test_result().Failed(); + start_timestamp_ = internal::GetTimeInMillis(); internal::Timer timer; for (int i = 0; i < total_test_count(); i++) { - GetMutableTestInfo(i)->Run(); - if (GTEST_FLAG(fail_fast) && GetMutableTestInfo(i)->result()->Failed()) { + if (skip_all) { + GetMutableTestInfo(i)->Skip(); + } else { + GetMutableTestInfo(i)->Run(); + } + if (GTEST_FLAG_GET(fail_fast) && + GetMutableTestInfo(i)->result()->Failed()) { for (int j = i + 1; j < total_test_count(); j++) { GetMutableTestInfo(j)->Skip(); } @@ -3089,11 +3088,10 @@ void TestSuite::UnshuffleTests() { // // FormatCountableNoun(1, "formula", "formuli") returns "1 formula". // FormatCountableNoun(5, "book", "books") returns "5 books". -static std::string FormatCountableNoun(int count, - const char * singular_form, - const char * plural_form) { +static std::string FormatCountableNoun(int count, const char* singular_form, + const char* plural_form) { return internal::StreamableToString(count) + " " + - (count == 1 ? singular_form : plural_form); + (count == 1 ? singular_form : plural_form); } // Formats the count of tests. @@ -3110,7 +3108,7 @@ static std::string FormatTestSuiteCount(int test_suite_count) { // representation. Both kNonFatalFailure and kFatalFailure are translated // to "Failure", as the user usually doesn't care about the difference // between the two when viewing the test result. 
-static const char * TestPartResultTypeToString(TestPartResult::Type type) { +static const char* TestPartResultTypeToString(TestPartResult::Type type) { switch (type) { case TestPartResult::kSkip: return "Skipped\n"; @@ -3137,17 +3135,18 @@ enum class GTestColor { kDefault, kRed, kGreen, kYellow }; // Prints a TestPartResult to an std::string. static std::string PrintTestPartResultToString( const TestPartResult& test_part_result) { - return (Message() - << internal::FormatFileLocation(test_part_result.file_name(), - test_part_result.line_number()) - << " " << TestPartResultTypeToString(test_part_result.type()) - << test_part_result.message()).GetString(); + return (Message() << internal::FormatFileLocation( + test_part_result.file_name(), + test_part_result.line_number()) + << " " + << TestPartResultTypeToString(test_part_result.type()) + << test_part_result.message()) + .GetString(); } // Prints a TestPartResult. static void PrintTestPartResult(const TestPartResult& test_part_result) { - const std::string& result = - PrintTestPartResultToString(test_part_result); + const std::string& result = PrintTestPartResultToString(test_part_result); printf("%s\n", result.c_str()); fflush(stdout); // If the test program runs in Visual Studio or a debugger, the @@ -3164,8 +3163,8 @@ static void PrintTestPartResult(const TestPartResult& test_part_result) { } // class PrettyUnitTestResultPrinter -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ - !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \ + !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW // Returns the character attribute for the given color. 
static WORD GetColorAttribute(GTestColor color) { @@ -3176,7 +3175,8 @@ static WORD GetColorAttribute(GTestColor color) { return FOREGROUND_GREEN; case GTestColor::kYellow: return FOREGROUND_RED | FOREGROUND_GREEN; - default: return 0; + default: + return 0; } } @@ -3232,7 +3232,8 @@ static const char* GetAnsiColorCode(GTestColor color) { // Returns true if and only if Google Test should use colors in the output. bool ShouldUseColor(bool stdout_is_tty) { - const char* const gtest_color = GTEST_FLAG(color).c_str(); + std::string c = GTEST_FLAG_GET(color); + const char* const gtest_color = c.c_str(); if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { #if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW @@ -3259,9 +3260,9 @@ bool ShouldUseColor(bool stdout_is_tty) { } return String::CaseInsensitiveCStringEquals(gtest_color, "yes") || - String::CaseInsensitiveCStringEquals(gtest_color, "true") || - String::CaseInsensitiveCStringEquals(gtest_color, "t") || - String::CStringEquals(gtest_color, "1"); + String::CaseInsensitiveCStringEquals(gtest_color, "true") || + String::CaseInsensitiveCStringEquals(gtest_color, "t") || + String::CStringEquals(gtest_color, "1"); // We take "yes", "true", "t", and "1" as meaning "yes". If the // value is neither one of these nor "auto", we treat it as "no" to // be conservative. @@ -3273,18 +3274,13 @@ bool ShouldUseColor(bool stdout_is_tty) { // that would be colored when printed, as can be done on Linux. GTEST_ATTRIBUTE_PRINTF_(2, 3) -static void ColoredPrintf(GTestColor color, const char *fmt, ...) { +static void ColoredPrintf(GTestColor color, const char* fmt, ...) 
{ va_list args; va_start(args, fmt); -#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \ - GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM) - const bool use_color = AlwaysFalse(); -#else static const bool in_color_mode = ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); const bool use_color = in_color_mode && (color != GTestColor::kDefault); -#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS if (!use_color) { vprintf(fmt, args); @@ -3292,8 +3288,8 @@ static void ColoredPrintf(GTestColor color, const char *fmt, ...) { return; } -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ - !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \ + !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); // Gets the current text color. @@ -3364,6 +3360,7 @@ class PrettyUnitTestResultPrinter : public TestEventListener { #endif // OnTestCaseStart void OnTestStart(const TestInfo& test_info) override; + void OnTestDisabled(const TestInfo& test_info) override; void OnTestPartResult(const TestPartResult& result) override; void OnTestEnd(const TestInfo& test_info) override; @@ -3384,13 +3381,14 @@ class PrettyUnitTestResultPrinter : public TestEventListener { static void PrintSkippedTests(const UnitTest& unit_test); }; - // Fired before each iteration of tests starts. +// Fired before each iteration of tests starts. void PrettyUnitTestResultPrinter::OnTestIterationStart( const UnitTest& unit_test, int iteration) { - if (GTEST_FLAG(repeat) != 1) + if (GTEST_FLAG_GET(repeat) != 1) printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1); - const char* const filter = GTEST_FLAG(filter).c_str(); + std::string f = GTEST_FLAG_GET(filter); + const char* const filter = f.c_str(); // Prints the filter if it's not *. This reminds the user that some // tests may be skipped. 
@@ -3406,7 +3404,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart( internal::posix::GetEnv(kTestTotalShards)); } - if (GTEST_FLAG(shuffle)) { + if (GTEST_FLAG_GET(shuffle)) { ColoredPrintf(GTestColor::kYellow, "Note: Randomizing tests' orders with a seed of %d .\n", unit_test.random_seed()); @@ -3462,6 +3460,13 @@ void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { fflush(stdout); } +void PrettyUnitTestResultPrinter::OnTestDisabled(const TestInfo& test_info) { + ColoredPrintf(GTestColor::kYellow, "[ DISABLED ] "); + PrintTestName(test_info.test_suite_name(), test_info.name()); + printf("\n"); + fflush(stdout); +} + // Called after an assertion failure. void PrettyUnitTestResultPrinter::OnTestPartResult( const TestPartResult& result) { @@ -3486,12 +3491,12 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); } PrintTestName(test_info.test_suite_name(), test_info.name()); - if (test_info.result()->Failed()) - PrintFullTestCommentIfPresent(test_info); + if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info); - if (GTEST_FLAG(print_time)) { - printf(" (%s ms)\n", internal::StreamableToString( - test_info.result()->elapsed_time()).c_str()); + if (GTEST_FLAG_GET(print_time)) { + printf(" (%s ms)\n", + internal::StreamableToString(test_info.result()->elapsed_time()) + .c_str()); } else { printf("\n"); } @@ -3500,7 +3505,7 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { - if (!GTEST_FLAG(print_time)) return; + if (!GTEST_FLAG_GET(print_time)) return; const std::string counts = FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); @@ -3511,7 +3516,7 @@ void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { } #else void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const 
TestSuite& test_suite) { - if (!GTEST_FLAG(print_time)) return; + if (!GTEST_FLAG_GET(print_time)) return; const std::string counts = FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests"); @@ -3607,7 +3612,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, printf("%s from %s ran.", FormatTestCount(unit_test.test_to_run_count()).c_str(), FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); - if (GTEST_FLAG(print_time)) { + if (GTEST_FLAG_GET(print_time)) { printf(" (%s ms total)", internal::StreamableToString(unit_test.elapsed_time()).c_str()); } @@ -3628,7 +3633,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, } int num_disabled = unit_test.reportable_disabled_test_count(); - if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { + if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) { if (unit_test.Passed()) { printf("\n"); // Add a spacer if no FAILURE banner is displayed. 
} @@ -3664,6 +3669,7 @@ class BriefUnitTestResultPrinter : public TestEventListener { #endif // OnTestCaseStart void OnTestStart(const TestInfo& /*test_info*/) override {} + void OnTestDisabled(const TestInfo& /*test_info*/) override {} void OnTestPartResult(const TestPartResult& result) override; void OnTestEnd(const TestInfo& test_info) override; @@ -3700,7 +3706,7 @@ void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { PrintTestName(test_info.test_suite_name(), test_info.name()); PrintFullTestCommentIfPresent(test_info); - if (GTEST_FLAG(print_time)) { + if (GTEST_FLAG_GET(print_time)) { printf(" (%s ms)\n", internal::StreamableToString(test_info.result()->elapsed_time()) .c_str()); @@ -3717,7 +3723,7 @@ void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, printf("%s from %s ran.", FormatTestCount(unit_test.test_to_run_count()).c_str(), FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); - if (GTEST_FLAG(print_time)) { + if (GTEST_FLAG_GET(print_time)) { printf(" (%s ms total)", internal::StreamableToString(unit_test.elapsed_time()).c_str()); } @@ -3732,7 +3738,7 @@ void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, } int num_disabled = unit_test.reportable_disabled_test_count(); - if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { + if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) { if (unit_test.Passed()) { printf("\n"); // Add a spacer if no FAILURE banner is displayed. } @@ -3752,7 +3758,7 @@ class TestEventRepeater : public TestEventListener { public: TestEventRepeater() : forwarding_enabled_(true) {} ~TestEventRepeater() override; - void Append(TestEventListener *listener); + void Append(TestEventListener* listener); TestEventListener* Release(TestEventListener* listener); // Controls whether events will be forwarded to listeners_. 
Set to false @@ -3770,6 +3776,7 @@ class TestEventRepeater : public TestEventListener { #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ void OnTestSuiteStart(const TestSuite& parameter) override; void OnTestStart(const TestInfo& test_info) override; + void OnTestDisabled(const TestInfo& test_info) override; void OnTestPartResult(const TestPartResult& result) override; void OnTestEnd(const TestInfo& test_info) override; // Legacy API is deprecated but still available @@ -3789,18 +3796,19 @@ class TestEventRepeater : public TestEventListener { // The list of listeners that receive events. std::vector<TestEventListener*> listeners_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater); + TestEventRepeater(const TestEventRepeater&) = delete; + TestEventRepeater& operator=(const TestEventRepeater&) = delete; }; TestEventRepeater::~TestEventRepeater() { ForEach(listeners_, Delete<TestEventListener>); } -void TestEventRepeater::Append(TestEventListener *listener) { +void TestEventRepeater::Append(TestEventListener* listener) { listeners_.push_back(listener); } -TestEventListener* TestEventRepeater::Release(TestEventListener *listener) { +TestEventListener* TestEventRepeater::Release(TestEventListener* listener) { for (size_t i = 0; i < listeners_.size(); ++i) { if (listeners_[i] == listener) { listeners_.erase(listeners_.begin() + static_cast<int>(i)); @@ -3813,14 +3821,14 @@ TestEventListener* TestEventRepeater::Release(TestEventListener *listener) { // Since most methods are very similar, use macros to reduce boilerplate. // This defines a member that forwards the call to all listeners. 
-#define GTEST_REPEATER_METHOD_(Name, Type) \ -void TestEventRepeater::Name(const Type& parameter) { \ - if (forwarding_enabled_) { \ - for (size_t i = 0; i < listeners_.size(); i++) { \ - listeners_[i]->Name(parameter); \ - } \ - } \ -} +#define GTEST_REPEATER_METHOD_(Name, Type) \ + void TestEventRepeater::Name(const Type& parameter) { \ + if (forwarding_enabled_) { \ + for (size_t i = 0; i < listeners_.size(); i++) { \ + listeners_[i]->Name(parameter); \ + } \ + } \ + } // This defines a member that forwards the call to all listeners in reverse // order. #define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \ @@ -3840,6 +3848,7 @@ GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite) #endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite) GTEST_REPEATER_METHOD_(OnTestStart, TestInfo) +GTEST_REPEATER_METHOD_(OnTestDisabled, TestInfo) GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult) GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest) GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest) @@ -3890,12 +3899,13 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener { private: // Is c a whitespace character that is normalized to a space character // when it appears in an XML attribute value? - static bool IsNormalizableWhitespace(char c) { - return c == 0x9 || c == 0xA || c == 0xD; + static bool IsNormalizableWhitespace(unsigned char c) { + return c == '\t' || c == '\n' || c == '\r'; } // May c appear in a well-formed XML document? - static bool IsValidXmlCharacter(char c) { + // https://www.w3.org/TR/REC-xml/#charsets + static bool IsValidXmlCharacter(unsigned char c) { return IsNormalizableWhitespace(c) || c >= 0x20; } @@ -3965,7 +3975,8 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener { // The output file. 
const std::string output_file_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter); + XmlUnitTestResultPrinter(const XmlUnitTestResultPrinter&) = delete; + XmlUnitTestResultPrinter& operator=(const XmlUnitTestResultPrinter&) = delete; }; // Creates a new XmlUnitTestResultPrinter. @@ -4005,8 +4016,8 @@ void XmlUnitTestResultPrinter::ListTestsMatchingFilter( // module will consist of ordinary English text. // If this module is ever modified to produce version 1.1 XML output, // most invalid characters can be retained using character references. -std::string XmlUnitTestResultPrinter::EscapeXml( - const std::string& str, bool is_attribute) { +std::string XmlUnitTestResultPrinter::EscapeXml(const std::string& str, + bool is_attribute) { Message m; for (size_t i = 0; i < str.size(); ++i) { @@ -4034,8 +4045,9 @@ std::string XmlUnitTestResultPrinter::EscapeXml( m << '"'; break; default: - if (IsValidXmlCharacter(ch)) { - if (is_attribute && IsNormalizableWhitespace(ch)) + if (IsValidXmlCharacter(static_cast<unsigned char>(ch))) { + if (is_attribute && + IsNormalizableWhitespace(static_cast<unsigned char>(ch))) m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch)) << ";"; else @@ -4056,7 +4068,7 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters( std::string output; output.reserve(str.size()); for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) - if (IsValidXmlCharacter(*it)) + if (IsValidXmlCharacter(static_cast<unsigned char>(*it))) output.push_back(*it); return output; @@ -4064,7 +4076,6 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters( // The following routines generate an XML representation of a UnitTest // object. 
-// GOOGLETEST_CM0009 DO NOT DELETE // // This is how Google Test concepts map to the DTD: // @@ -4113,12 +4124,12 @@ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) { return ""; // YYYY-MM-DDThh:mm:ss.sss return StreamableToString(time_struct.tm_year + 1900) + "-" + - String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + - String::FormatIntWidth2(time_struct.tm_mday) + "T" + - String::FormatIntWidth2(time_struct.tm_hour) + ":" + - String::FormatIntWidth2(time_struct.tm_min) + ":" + - String::FormatIntWidth2(time_struct.tm_sec) + "." + - String::FormatIntWidthN(static_cast<int>(ms % 1000), 3); + String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + + String::FormatIntWidth2(time_struct.tm_mday) + "T" + + String::FormatIntWidth2(time_struct.tm_hour) + ":" + + String::FormatIntWidth2(time_struct.tm_min) + ":" + + String::FormatIntWidth2(time_struct.tm_sec) + "." + + String::FormatIntWidthN(static_cast<int>(ms % 1000), 3); } // Streams an XML CDATA section, escaping invalid CDATA sequences as needed. 
@@ -4129,8 +4140,8 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream, for (;;) { const char* const next_segment = strstr(segment, "]]>"); if (next_segment != nullptr) { - stream->write( - segment, static_cast<std::streamsize>(next_segment - segment)); + stream->write(segment, + static_cast<std::streamsize>(next_segment - segment)); *stream << "]]>]]><![CDATA["; segment = next_segment + strlen("]]>"); } else { @@ -4142,15 +4153,13 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream, } void XmlUnitTestResultPrinter::OutputXmlAttribute( - std::ostream* stream, - const std::string& element_name, - const std::string& name, - const std::string& value) { + std::ostream* stream, const std::string& element_name, + const std::string& name, const std::string& value) { const std::vector<std::string>& allowed_names = GetReservedOutputAttributesForElement(element_name); GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != - allowed_names.end()) + allowed_names.end()) << "Attribute " << name << " is not allowed for element <" << element_name << ">."; @@ -4216,10 +4225,11 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, OutputXmlAttribute(stream, kTestsuite, "type_param", test_info.type_param()); } - if (GTEST_FLAG(list_tests)) { - OutputXmlAttribute(stream, kTestsuite, "file", test_info.file()); - OutputXmlAttribute(stream, kTestsuite, "line", - StreamableToString(test_info.line())); + + OutputXmlAttribute(stream, kTestsuite, "file", test_info.file()); + OutputXmlAttribute(stream, kTestsuite, "line", + StreamableToString(test_info.line())); + if (GTEST_FLAG_GET(list_tests)) { *stream << " />\n"; return; } @@ -4254,8 +4264,7 @@ void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream, internal::FormatCompilerIndependentFileLocation(part.file_name(), part.line_number()); const std::string summary = location + "\n" + part.summary(); - *stream << " <failure 
message=\"" - << EscapeXmlAttribute(summary) + *stream << " <failure message=\"" << EscapeXmlAttribute(summary) << "\" type=\"\">"; const std::string detail = location + "\n" + part.message(); OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str()); @@ -4295,7 +4304,7 @@ void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream, OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name()); OutputXmlAttribute(stream, kTestsuite, "tests", StreamableToString(test_suite.reportable_test_count())); - if (!GTEST_FLAG(list_tests)) { + if (!GTEST_FLAG_GET(list_tests)) { OutputXmlAttribute(stream, kTestsuite, "failures", StreamableToString(test_suite.failed_test_count())); OutputXmlAttribute( @@ -4343,7 +4352,7 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, stream, kTestsuites, "timestamp", FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp())); - if (GTEST_FLAG(shuffle)) { + if (GTEST_FLAG_GET(shuffle)) { OutputXmlAttribute(stream, kTestsuites, "random_seed", StreamableToString(unit_test.random_seed())); } @@ -4396,7 +4405,7 @@ std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes( for (int i = 0; i < result.test_property_count(); ++i) { const TestProperty& property = result.GetTestProperty(i); attributes << " " << property.key() << "=" - << "\"" << EscapeXmlAttribute(property.value()) << "\""; + << "\"" << EscapeXmlAttribute(property.value()) << "\""; } return attributes.GetString(); } @@ -4410,15 +4419,15 @@ void XmlUnitTestResultPrinter::OutputXmlTestProperties( return; } - *stream << "<" << kProperties << ">\n"; + *stream << " <" << kProperties << ">\n"; for (int i = 0; i < result.test_property_count(); ++i) { const TestProperty& property = result.GetTestProperty(i); - *stream << "<" << kProperty; + *stream << " <" << kProperty; *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\""; *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\""; *stream << "/>\n"; } 
- *stream << "</" << kProperties << ">\n"; + *stream << " </" << kProperties << ">\n"; } // End XmlUnitTestResultPrinter @@ -4442,16 +4451,12 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener { //// streams the attribute as JSON. static void OutputJsonKey(std::ostream* stream, const std::string& element_name, - const std::string& name, - const std::string& value, - const std::string& indent, - bool comma = true); + const std::string& name, const std::string& value, + const std::string& indent, bool comma = true); static void OutputJsonKey(std::ostream* stream, const std::string& element_name, - const std::string& name, - int value, - const std::string& indent, - bool comma = true); + const std::string& name, int value, + const std::string& indent, bool comma = true); // Streams a test suite JSON stanza containing the given test result. // @@ -4484,7 +4489,9 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener { // The output file. const std::string output_file_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(JsonUnitTestResultPrinter); + JsonUnitTestResultPrinter(const JsonUnitTestResultPrinter&) = delete; + JsonUnitTestResultPrinter& operator=(const JsonUnitTestResultPrinter&) = + delete; }; // Creates a new JsonUnitTestResultPrinter. 
@@ -4496,7 +4503,7 @@ JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file) } void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, - int /*iteration*/) { + int /*iteration*/) { FILE* jsonout = OpenFileForWriting(output_file_); std::stringstream stream; PrintJsonUnitTest(&stream, unit_test); @@ -4562,55 +4569,48 @@ static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) { return ""; // YYYY-MM-DDThh:mm:ss return StreamableToString(time_struct.tm_year + 1900) + "-" + - String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + - String::FormatIntWidth2(time_struct.tm_mday) + "T" + - String::FormatIntWidth2(time_struct.tm_hour) + ":" + - String::FormatIntWidth2(time_struct.tm_min) + ":" + - String::FormatIntWidth2(time_struct.tm_sec) + "Z"; + String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + + String::FormatIntWidth2(time_struct.tm_mday) + "T" + + String::FormatIntWidth2(time_struct.tm_hour) + ":" + + String::FormatIntWidth2(time_struct.tm_min) + ":" + + String::FormatIntWidth2(time_struct.tm_sec) + "Z"; } static inline std::string Indent(size_t width) { return std::string(width, ' '); } -void JsonUnitTestResultPrinter::OutputJsonKey( - std::ostream* stream, - const std::string& element_name, - const std::string& name, - const std::string& value, - const std::string& indent, - bool comma) { +void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream* stream, + const std::string& element_name, + const std::string& name, + const std::string& value, + const std::string& indent, + bool comma) { const std::vector<std::string>& allowed_names = GetReservedOutputAttributesForElement(element_name); GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != - allowed_names.end()) + allowed_names.end()) << "Key \"" << name << "\" is not allowed for value \"" << element_name << "\"."; *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\""; - if (comma) - *stream << ",\n"; + if 
(comma) *stream << ",\n"; } void JsonUnitTestResultPrinter::OutputJsonKey( - std::ostream* stream, - const std::string& element_name, - const std::string& name, - int value, - const std::string& indent, - bool comma) { + std::ostream* stream, const std::string& element_name, + const std::string& name, int value, const std::string& indent, bool comma) { const std::vector<std::string>& allowed_names = GetReservedOutputAttributesForElement(element_name); GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != - allowed_names.end()) + allowed_names.end()) << "Key \"" << name << "\" is not allowed for value \"" << element_name << "\"."; *stream << indent << "\"" << name << "\": " << StreamableToString(value); - if (comma) - *stream << ",\n"; + if (comma) *stream << ",\n"; } // Streams a test suite JSON stanza containing the given test result. @@ -4620,7 +4620,7 @@ void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult( *stream << Indent(4) << "{\n"; OutputJsonKey(stream, "testsuite", "name", "NonTestSuiteFailure", Indent(6)); OutputJsonKey(stream, "testsuite", "tests", 1, Indent(6)); - if (!GTEST_FLAG(list_tests)) { + if (!GTEST_FLAG_GET(list_tests)) { OutputJsonKey(stream, "testsuite", "failures", 1, Indent(6)); OutputJsonKey(stream, "testsuite", "disabled", 0, Indent(6)); OutputJsonKey(stream, "testsuite", "skipped", 0, Indent(6)); @@ -4674,11 +4674,14 @@ void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream, OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(), kIndent); } - if (GTEST_FLAG(list_tests)) { - OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent); - OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false); + + OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent); + OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false); + if (GTEST_FLAG_GET(list_tests)) { *stream << "\n" << Indent(8) << "}"; return; + } else { + 
*stream << ",\n"; } OutputJsonKey(stream, kTestsuite, "status", @@ -4710,7 +4713,9 @@ void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream, if (part.failed()) { *stream << ",\n"; if (++failures == 1) { - *stream << kIndent << "\"" << "failures" << "\": [\n"; + *stream << kIndent << "\"" + << "failures" + << "\": [\n"; } const std::string location = internal::FormatCompilerIndependentFileLocation(part.file_name(), @@ -4723,8 +4728,7 @@ void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream, } } - if (failures > 0) - *stream << "\n" << kIndent << "]"; + if (failures > 0) *stream << "\n" << kIndent << "]"; *stream << "\n" << Indent(8) << "}"; } @@ -4738,7 +4742,7 @@ void JsonUnitTestResultPrinter::PrintJsonTestSuite( OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent); OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(), kIndent); - if (!GTEST_FLAG(list_tests)) { + if (!GTEST_FLAG_GET(list_tests)) { OutputJsonKey(stream, kTestsuite, "failures", test_suite.failed_test_count(), kIndent); OutputJsonKey(stream, kTestsuite, "disabled", @@ -4785,7 +4789,7 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream, OutputJsonKey(stream, kTestsuites, "disabled", unit_test.reportable_disabled_test_count(), kIndent); OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent); - if (GTEST_FLAG(shuffle)) { + if (GTEST_FLAG_GET(shuffle)) { OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(), kIndent); } @@ -4820,7 +4824,9 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream, OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result()); } - *stream << "\n" << kIndent << "]\n" << "}\n"; + *stream << "\n" + << kIndent << "]\n" + << "}\n"; } void JsonUnitTestResultPrinter::PrintJsonTestList( @@ -4855,7 +4861,8 @@ std::string JsonUnitTestResultPrinter::TestPropertiesAsJson( Message attributes; for (int i = 0; i < 
result.test_property_count(); ++i) { const TestProperty& property = result.GetTestProperty(i); - attributes << ",\n" << indent << "\"" << property.key() << "\": " + attributes << ",\n" + << indent << "\"" << property.key() << "\": " << "\"" << EscapeJson(property.value()) << "\""; } return attributes.GetString(); @@ -4895,14 +4902,14 @@ void StreamingListener::SocketWriter::MakeConnection() { addrinfo hints; memset(&hints, 0, sizeof(hints)); - hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. + hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. hints.ai_socktype = SOCK_STREAM; addrinfo* servinfo = nullptr; // Use the getaddrinfo() to get a linked list of IP addresses for // the given host name. - const int error_num = getaddrinfo( - host_name_.c_str(), port_num_.c_str(), &hints, &servinfo); + const int error_num = + getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo); if (error_num != 0) { GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: " << gai_strerror(error_num); @@ -4911,8 +4918,8 @@ void StreamingListener::SocketWriter::MakeConnection() { // Loop through all the results and connect to the first we can. for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr; cur_addr = cur_addr->ai_next) { - sockfd_ = socket( - cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol); + sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype, + cur_addr->ai_protocol); if (sockfd_ != -1) { // Connect the client socket to the server socket. if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) { @@ -4962,7 +4969,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count) for (int i = 0; i < raw_stack_size; ++i) { if (raw_stack[i] == caller_frame && - !GTEST_FLAG(show_internal_stack_frames)) { + !GTEST_FLAG_GET(show_internal_stack_frames)) { // Add a marker to the trace and stop adding frames. 
absl::StrAppend(&result, kElidedFramesMarker, "\n"); break; @@ -4981,7 +4988,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count) return result; -#else // !GTEST_HAS_ABSL +#else // !GTEST_HAS_ABSL static_cast<void>(max_depth); static_cast<void>(skip_count); return ""; @@ -5005,14 +5012,14 @@ void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) { class ScopedPrematureExitFile { public: explicit ScopedPrematureExitFile(const char* premature_exit_filepath) - : premature_exit_filepath_(premature_exit_filepath ? - premature_exit_filepath : "") { + : premature_exit_filepath_( + premature_exit_filepath ? premature_exit_filepath : "") { // If a path to the premature-exit file is specified... if (!premature_exit_filepath_.empty()) { // create the file with a single "0" character in it. I/O // errors are ignored as there's nothing better we can do and we // don't want to fail the test because of this. - FILE* pfile = posix::FOpen(premature_exit_filepath, "w"); + FILE* pfile = posix::FOpen(premature_exit_filepath_.c_str(), "w"); fwrite("0", 1, 1, pfile); fclose(pfile); } @@ -5034,7 +5041,8 @@ class ScopedPrematureExitFile { private: const std::string premature_exit_filepath_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile); + ScopedPrematureExitFile(const ScopedPrematureExitFile&) = delete; + ScopedPrematureExitFile& operator=(const ScopedPrematureExitFile&) = delete; }; } // namespace internal @@ -5208,7 +5216,7 @@ int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); } // Gets the time of the test program start, in ms from the start of the // UNIX epoch. internal::TimeInMillis UnitTest::start_timestamp() const { - return impl()->start_timestamp(); + return impl()->start_timestamp(); } // Gets the elapsed time, in milliseconds. 
@@ -5251,9 +5259,7 @@ TestSuite* UnitTest::GetMutableTestSuite(int i) { // Returns the list of event listeners that can be used to track events // inside Google Test. -TestEventListeners& UnitTest::listeners() { - return *impl()->listeners(); -} +TestEventListeners& UnitTest::listeners() { return *impl()->listeners(); } // Registers and returns a global test environment. When a test // program is run, all global test environments will be set-up in the @@ -5278,12 +5284,11 @@ Environment* UnitTest::AddEnvironment(Environment* env) { // assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call // this to report their results. The user code should use the // assertion macros instead of calling this directly. -void UnitTest::AddTestPartResult( - TestPartResult::Type result_type, - const char* file_name, - int line_number, - const std::string& message, - const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) { +void UnitTest::AddTestPartResult(TestPartResult::Type result_type, + const char* file_name, int line_number, + const std::string& message, + const std::string& os_stack_trace) + GTEST_LOCK_EXCLUDED_(mutex_) { Message msg; msg << message; @@ -5293,8 +5298,9 @@ void UnitTest::AddTestPartResult( for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) { const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1]; - msg << "\n" << internal::FormatFileLocation(trace.file, trace.line) - << " " << trace.message; + msg << "\n" + << internal::FormatFileLocation(trace.file, trace.line) << " " + << trace.message; } } @@ -5304,8 +5310,8 @@ void UnitTest::AddTestPartResult( const TestPartResult result = TestPartResult( result_type, file_name, line_number, msg.GetString().c_str()); - impl_->GetTestPartResultReporterForCurrentThread()-> - ReportTestPartResult(result); + impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult( + result); if (result_type != TestPartResult::kSuccess && result_type != TestPartResult::kSkip) { @@ 
-5314,7 +5320,7 @@ void UnitTest::AddTestPartResult( // in the code (perhaps in order to use Google Test assertions // with another testing framework) and specify the former on the // command line for debugging. - if (GTEST_FLAG(break_on_failure)) { + if (GTEST_FLAG_GET(break_on_failure)) { #if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT // Using DebugBreak on Windows allows gtest to still break into a debugger // when a failure happens and both the --gtest_break_on_failure and @@ -5331,7 +5337,7 @@ void UnitTest::AddTestPartResult( // portability: some debuggers don't correctly trap abort(). *static_cast<volatile int*>(nullptr) = 1; #endif // GTEST_OS_WINDOWS - } else if (GTEST_FLAG(throw_on_failure)) { + } else if (GTEST_FLAG_GET(throw_on_failure)) { #if GTEST_HAS_EXCEPTIONS throw internal::GoogleTestFailureException(result); #else @@ -5360,7 +5366,7 @@ void UnitTest::RecordProperty(const std::string& key, // from the main thread. int UnitTest::Run() { const bool in_death_test_child_process = - internal::GTEST_FLAG(internal_run_death_test).length() > 0; + GTEST_FLAG_GET(internal_run_death_test).length() > 0; // Google Test implements this protocol for catching that a test // program exits before returning control to Google Test: @@ -5390,7 +5396,7 @@ int UnitTest::Run() { // Captures the value of GTEST_FLAG(catch_exceptions). This value will be // used for the duration of the program. - impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions)); + impl()->set_catch_exceptions(GTEST_FLAG_GET(catch_exceptions)); #if GTEST_OS_WINDOWS // Either the user wants Google Test to catch exceptions thrown by the @@ -5398,26 +5404,26 @@ int UnitTest::Run() { // process. In either case the user does not want to see pop-up dialogs // about crashes - they are expected. 
if (impl()->catch_exceptions() || in_death_test_child_process) { -# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT +#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT // SetErrorMode doesn't exist on CE. SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT | SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX); -# endif // !GTEST_OS_WINDOWS_MOBILE +#endif // !GTEST_OS_WINDOWS_MOBILE -# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE +#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE // Death test children can be terminated with _abort(). On Windows, // _abort() can show a dialog with a warning message. This forces the // abort message to go to stderr instead. _set_error_mode(_OUT_TO_STDERR); -# endif +#endif -# if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE +#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE // In the debug version, Visual Studio pops up a separate dialog // offering a choice to debug the aborted program. We need to suppress // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement // executed. Google Test will notify the user of any unexpected // failure via stderr. - if (!GTEST_FLAG(break_on_failure)) + if (!GTEST_FLAG_GET(break_on_failure)) _set_abort_behavior( 0x0, // Clear the following flags: _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. @@ -5431,14 +5437,15 @@ int UnitTest::Run() { _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG); (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR); } -# endif +#endif } #endif // GTEST_OS_WINDOWS return internal::HandleExceptionsInMethodIfSupported( - impl(), - &internal::UnitTestImpl::RunAllTests, - "auxiliary test code (environments or event listeners)") ? 0 : 1; + impl(), &internal::UnitTestImpl::RunAllTests, + "auxiliary test code (environments or event listeners)") + ? 
0 + : 1; } // Returns the working directory when the first TEST() or TEST_F() was @@ -5483,14 +5490,10 @@ UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) { } // Creates an empty UnitTest. -UnitTest::UnitTest() { - impl_ = new internal::UnitTestImpl(this); -} +UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); } // Destructor of UnitTest. -UnitTest::~UnitTest() { - delete impl_; -} +UnitTest::~UnitTest() { delete impl_; } // Pushes a trace defined by SCOPED_TRACE() on to the per-thread // Google Test trace stack. @@ -5501,8 +5504,7 @@ void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) } // Pops a trace from the per-thread Google Test trace stack. -void UnitTest::PopGTestTrace() - GTEST_LOCK_EXCLUDED_(mutex_) { +void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) { internal::MutexLock lock(&mutex_); impl_->gtest_trace_stack().pop_back(); } @@ -5599,12 +5601,12 @@ void UnitTestImpl::ConfigureXmlOutput() { // Initializes event listeners for streaming test results in string form. // Must not be called before InitGoogleTest. void UnitTestImpl::ConfigureStreamingOutput() { - const std::string& target = GTEST_FLAG(stream_result_to); + const std::string& target = GTEST_FLAG_GET(stream_result_to); if (!target.empty()) { const size_t pos = target.find(':'); if (pos != std::string::npos) { - listeners()->Append(new StreamingListener(target.substr(0, pos), - target.substr(pos+1))); + listeners()->Append( + new StreamingListener(target.substr(0, pos), target.substr(pos + 1))); } else { GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target << "\" ignored."; @@ -5642,7 +5644,7 @@ void UnitTestImpl::PostFlagParsingInit() { // to shut down the default XML output before invoking RUN_ALL_TESTS. 
ConfigureXmlOutput(); - if (GTEST_FLAG(brief)) { + if (GTEST_FLAG_GET(brief)) { listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter); } @@ -5652,7 +5654,7 @@ void UnitTestImpl::PostFlagParsingInit() { #endif // GTEST_CAN_STREAM_RESULTS_ #if GTEST_HAS_ABSL - if (GTEST_FLAG(install_failure_signal_handler)) { + if (GTEST_FLAG_GET(install_failure_signal_handler)) { absl::FailureSignalHandlerOptions options; absl::InstallFailureSignalHandler(options); } @@ -5710,9 +5712,9 @@ TestSuite* UnitTestImpl::GetTestSuite( auto* const new_test_suite = new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc); + const UnitTestFilter death_test_suite_filter(kDeathTestSuiteFilter); // Is this a death test suite? - if (internal::UnitTestOptions::MatchesFilter(test_suite_name, - kDeathTestSuiteFilter)) { + if (death_test_suite_filter.MatchesName(test_suite_name)) { // Yes. Inserts the test suite after the last death test suite // defined so far. This only works when the test suites haven't // been shuffled. Otherwise we may end up running a death test @@ -5749,8 +5751,7 @@ bool UnitTestImpl::RunAllTests() { const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized(); // Do not run any test if the --help flag was specified. - if (g_help_flag) - return true; + if (g_help_flag) return true; // Repeats the call to the post-flag parsing initialization in case the // user didn't call InitGoogleTest. 
@@ -5768,11 +5769,11 @@ bool UnitTestImpl::RunAllTests() { #if GTEST_HAS_DEATH_TEST in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != nullptr); -# if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) +#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) if (in_subprocess_for_death_test) { GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_(); } -# endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) +#endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) #endif // GTEST_HAS_DEATH_TEST const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex, @@ -5780,19 +5781,18 @@ bool UnitTestImpl::RunAllTests() { // Compares the full test names with the filter to decide which // tests to run. - const bool has_tests_to_run = FilterTests(should_shard - ? HONOR_SHARDING_PROTOCOL - : IGNORE_SHARDING_PROTOCOL) > 0; + const bool has_tests_to_run = + FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL + : IGNORE_SHARDING_PROTOCOL) > 0; // Lists the tests and exits if the --gtest_list_tests flag was specified. - if (GTEST_FLAG(list_tests)) { + if (GTEST_FLAG_GET(list_tests)) { // This must be called *after* FilterTests() has been called. ListTestsMatchingFilter(); return true; } - random_seed_ = GTEST_FLAG(shuffle) ? - GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0; + random_seed_ = GetRandomSeedFromFlag(GTEST_FLAG_GET(random_seed)); // True if and only if at least one test has failed. bool failed = false; @@ -5804,9 +5804,21 @@ bool UnitTestImpl::RunAllTests() { // How many times to repeat the tests? We don't want to repeat them // when we are inside the subprocess of a death test. - const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat); + const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG_GET(repeat); + // Repeats forever if the repeat count is negative. 
const bool gtest_repeat_forever = repeat < 0; + + // Should test environments be set up and torn down for each repeat, or only + // set up on the first and torn down on the last iteration? If there is no + // "last" iteration because the tests will repeat forever, always recreate the + // environments to avoid leaks in case one of the environments is using + // resources that are external to this process. Without this check there would + // be no way to clean up those external resources automatically. + const bool recreate_environments_when_repeating = + GTEST_FLAG_GET(recreate_environments_when_repeating) || + gtest_repeat_forever; + for (int i = 0; gtest_repeat_forever || i != repeat; i++) { // We want to preserve failures generated by ad-hoc test // assertions executed before RUN_ALL_TESTS(). @@ -5815,7 +5827,7 @@ bool UnitTestImpl::RunAllTests() { Timer timer; // Shuffles test suites and tests if requested. - if (has_tests_to_run && GTEST_FLAG(shuffle)) { + if (has_tests_to_run && GTEST_FLAG_GET(shuffle)) { random()->Reseed(static_cast<uint32_t>(random_seed_)); // This should be done before calling OnTestIterationStart(), // such that a test event listener can see the actual test order @@ -5828,10 +5840,13 @@ bool UnitTestImpl::RunAllTests() { // Runs each test suite if there is at least one test to run. if (has_tests_to_run) { - // Sets up all environments beforehand. - repeater->OnEnvironmentsSetUpStart(*parent_); - ForEach(environments_, SetUpEnvironment); - repeater->OnEnvironmentsSetUpEnd(*parent_); + // Sets up all environments beforehand. If test environments aren't + // recreated for each iteration, only do so on the first iteration. + if (i == 0 || recreate_environments_when_repeating) { + repeater->OnEnvironmentsSetUpStart(*parent_); + ForEach(environments_, SetUpEnvironment); + repeater->OnEnvironmentsSetUpEnd(*parent_); + } // Runs the tests only if there was no fatal failure or skip triggered // during global set-up. 
@@ -5853,7 +5868,7 @@ bool UnitTestImpl::RunAllTests() { for (int test_index = 0; test_index < total_test_suite_count(); test_index++) { GetMutableSuiteCase(test_index)->Run(); - if (GTEST_FLAG(fail_fast) && + if (GTEST_FLAG_GET(fail_fast) && GetMutableSuiteCase(test_index)->Failed()) { for (int j = test_index + 1; j < total_test_suite_count(); j++) { GetMutableSuiteCase(j)->Skip(); @@ -5871,11 +5886,15 @@ bool UnitTestImpl::RunAllTests() { } } - // Tears down all environments in reverse order afterwards. - repeater->OnEnvironmentsTearDownStart(*parent_); - std::for_each(environments_.rbegin(), environments_.rend(), - TearDownEnvironment); - repeater->OnEnvironmentsTearDownEnd(*parent_); + // Tears down all environments in reverse order afterwards. If test + // environments aren't recreated for each iteration, only do so on the + // last iteration. + if (i == repeat - 1 || recreate_environments_when_repeating) { + repeater->OnEnvironmentsTearDownStart(*parent_); + std::for_each(environments_.rbegin(), environments_.rend(), + TearDownEnvironment); + repeater->OnEnvironmentsTearDownEnd(*parent_); + } } elapsed_time_ = timer.Elapsed(); @@ -5896,7 +5915,7 @@ bool UnitTestImpl::RunAllTests() { // (it's always safe to unshuffle the tests). UnshuffleTests(); - if (GTEST_FLAG(shuffle)) { + if (GTEST_FLAG_GET(shuffle)) { // Picks a new random seed for each iteration. random_seed_ = GetNextRandomSeed(random_seed_); } @@ -5947,8 +5966,7 @@ void WriteToShardStatusFileIfNeeded() { // an error and exits. If in_subprocess_for_death_test, sharding is // disabled because it must only be applied to the original test // process. Otherwise, we could filter out death tests we intended to execute. 
-bool ShouldShard(const char* total_shards_env, - const char* shard_index_env, +bool ShouldShard(const char* total_shards_env, const char* shard_index_env, bool in_subprocess_for_death_test) { if (in_subprocess_for_death_test) { return false; @@ -5960,27 +5978,27 @@ bool ShouldShard(const char* total_shards_env, if (total_shards == -1 && shard_index == -1) { return false; } else if (total_shards == -1 && shard_index != -1) { - const Message msg = Message() - << "Invalid environment variables: you have " - << kTestShardIndex << " = " << shard_index - << ", but have left " << kTestTotalShards << " unset.\n"; + const Message msg = Message() << "Invalid environment variables: you have " + << kTestShardIndex << " = " << shard_index + << ", but have left " << kTestTotalShards + << " unset.\n"; ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } else if (total_shards != -1 && shard_index == -1) { const Message msg = Message() - << "Invalid environment variables: you have " - << kTestTotalShards << " = " << total_shards - << ", but have left " << kTestShardIndex << " unset.\n"; + << "Invalid environment variables: you have " + << kTestTotalShards << " = " << total_shards + << ", but have left " << kTestShardIndex << " unset.\n"; ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); } else if (shard_index < 0 || shard_index >= total_shards) { - const Message msg = Message() - << "Invalid environment variables: we require 0 <= " - << kTestShardIndex << " < " << kTestTotalShards - << ", but you have " << kTestShardIndex << "=" << shard_index - << ", " << kTestTotalShards << "=" << total_shards << ".\n"; + const Message msg = + Message() << "Invalid environment variables: we require 0 <= " + << kTestShardIndex << " < " << kTestTotalShards + << ", but you have " << kTestShardIndex << "=" << shard_index + << ", " << kTestTotalShards << "=" << total_shards << ".\n"; 
ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); fflush(stdout); exit(EXIT_FAILURE); @@ -6022,11 +6040,16 @@ bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { // https://github.com/google/googletest/blob/master/googletest/docs/advanced.md // . Returns the number of tests that should run. int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { - const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? - Int32FromEnvOrDie(kTestTotalShards, -1) : -1; - const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ? - Int32FromEnvOrDie(kTestShardIndex, -1) : -1; - + const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL + ? Int32FromEnvOrDie(kTestTotalShards, -1) + : -1; + const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL + ? Int32FromEnvOrDie(kTestShardIndex, -1) + : -1; + + const PositiveAndNegativeUnitTestFilter gtest_flag_filter( + GTEST_FLAG_GET(filter)); + const UnitTestFilter disable_test_filter(kDisableTestFilter); // num_runnable_tests are the number of tests that will // run across all shards (i.e., match filter and are not disabled). // num_selected_tests are the number of tests to be run on @@ -6042,18 +6065,17 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { const std::string test_name(test_info->name()); // A test is disabled if test suite name or test name matches // kDisableTestFilter. 
- const bool is_disabled = internal::UnitTestOptions::MatchesFilter( - test_suite_name, kDisableTestFilter) || - internal::UnitTestOptions::MatchesFilter( - test_name, kDisableTestFilter); + const bool is_disabled = + disable_test_filter.MatchesName(test_suite_name) || + disable_test_filter.MatchesName(test_name); test_info->is_disabled_ = is_disabled; - const bool matches_filter = internal::UnitTestOptions::FilterMatchesTest( - test_suite_name, test_name); + const bool matches_filter = + gtest_flag_filter.MatchesTest(test_suite_name, test_name); test_info->matches_filter_ = matches_filter; const bool is_runnable = - (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) && + (GTEST_FLAG_GET(also_run_disabled_tests) || !is_disabled) && matches_filter; const bool is_in_another_shard = @@ -6222,8 +6244,8 @@ void UnitTestImpl::UnshuffleTests() { // For example, if Foo() calls Bar(), which in turn calls // GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in // the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. -std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, - int skip_count) { +GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_ std::string +GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, int skip_count) { // We pass skip_count + 1 to skip this wrapper function in addition // to what the user really wants to skip. return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1); @@ -6233,7 +6255,7 @@ std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, // suppress unreachable code warnings. namespace { class ClassUniqueToAlwaysTrue {}; -} +} // namespace bool IsTrue(bool condition) { return condition; } @@ -6241,8 +6263,7 @@ bool AlwaysTrue() { #if GTEST_HAS_EXCEPTIONS // This condition is always false so AlwaysTrue() never actually throws, // but it makes the compiler think that it may throw. 
- if (IsTrue(false)) - throw ClassUniqueToAlwaysTrue(); + if (IsTrue(false)) throw ClassUniqueToAlwaysTrue(); #endif // GTEST_HAS_EXCEPTIONS return true; } @@ -6264,13 +6285,14 @@ bool SkipPrefix(const char* prefix, const char** pstr) { // part can be omitted. // // Returns the value of the flag, or NULL if the parsing failed. -static const char* ParseFlagValue(const char* str, const char* flag, +static const char* ParseFlagValue(const char* str, const char* flag_name, bool def_optional) { // str and flag must not be NULL. - if (str == nullptr || flag == nullptr) return nullptr; + if (str == nullptr || flag_name == nullptr) return nullptr; // The flag must start with "--" followed by GTEST_FLAG_PREFIX_. - const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag; + const std::string flag_str = + std::string("--") + GTEST_FLAG_PREFIX_ + flag_name; const size_t flag_len = flag_str.length(); if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr; @@ -6301,9 +6323,9 @@ static const char* ParseFlagValue(const char* str, const char* flag, // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. -static bool ParseBoolFlag(const char* str, const char* flag, bool* value) { +static bool ParseFlag(const char* str, const char* flag_name, bool* value) { // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, true); + const char* const value_str = ParseFlagValue(str, flag_name, true); // Aborts if the parsing failed. if (value_str == nullptr) return false; @@ -6317,16 +6339,16 @@ static bool ParseBoolFlag(const char* str, const char* flag, bool* value) { // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. 
-bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) { +bool ParseFlag(const char* str, const char* flag_name, int32_t* value) { // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); + const char* const value_str = ParseFlagValue(str, flag_name, false); // Aborts if the parsing failed. if (value_str == nullptr) return false; // Sets *value to the value of the flag. - return ParseInt32(Message() << "The value of flag --" << flag, - value_str, value); + return ParseInt32(Message() << "The value of flag --" << flag_name, value_str, + value); } // Parses a string for a string flag, in the form of "--flag=value". @@ -6334,9 +6356,9 @@ bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) { // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. template <typename String> -static bool ParseStringFlag(const char* str, const char* flag, String* value) { +static bool ParseFlag(const char* str, const char* flag_name, String* value) { // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); + const char* const value_str = ParseFlagValue(str, flag_name, false); // Aborts if the parsing failed. if (value_str == nullptr) return false; @@ -6353,8 +6375,7 @@ static bool ParseStringFlag(const char* str, const char* flag, String* value) { // GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test // internal flags and do not trigger the help message. 
static bool HasGoogleTestFlagPrefix(const char* str) { - return (SkipPrefix("--", &str) || - SkipPrefix("-", &str) || + return (SkipPrefix("--", &str) || SkipPrefix("-", &str) || SkipPrefix("/", &str)) && !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) && (SkipPrefix(GTEST_FLAG_PREFIX_, &str) || @@ -6437,6 +6458,10 @@ static const char kColorEncodedHelpMessage[] = "random_seed=@Y[NUMBER]@D\n" " Random number seed to use for shuffling test orders (between 1 and\n" " 99999, or 0 to use a seed based on the current time).\n" + " @G--" GTEST_FLAG_PREFIX_ + "recreate_environments_when_repeating@D\n" + " Sets up and tears down the global test environment on each repeat\n" + " of the test.\n" "\n" "Test Output:\n" " @G--" GTEST_FLAG_PREFIX_ @@ -6454,18 +6479,18 @@ static const char kColorEncodedHelpMessage[] = " Generate a JSON or XML report in the given directory or with the " "given\n" " file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n" -# if GTEST_CAN_STREAM_RESULTS_ +#if GTEST_CAN_STREAM_RESULTS_ " @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n" " Stream test results to the given server.\n" -# endif // GTEST_CAN_STREAM_RESULTS_ +#endif // GTEST_CAN_STREAM_RESULTS_ "\n" "Assertion Behavior:\n" -# if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS +#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS " @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" " Set the default death test style.\n" -# endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS +#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS " @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n" " Turn assertion failures into debugger break-points.\n" @@ -6497,41 +6522,44 @@ static const char kColorEncodedHelpMessage[] = "@G<" GTEST_DEV_EMAIL_ ">@D.\n"; static bool ParseGoogleTestFlag(const char* const arg) { - return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag, - >EST_FLAG(also_run_disabled_tests)) || - ParseBoolFlag(arg, kBreakOnFailureFlag, - >EST_FLAG(break_on_failure)) || - 
ParseBoolFlag(arg, kCatchExceptionsFlag, - >EST_FLAG(catch_exceptions)) || - ParseStringFlag(arg, kColorFlag, >EST_FLAG(color)) || - ParseStringFlag(arg, kDeathTestStyleFlag, - >EST_FLAG(death_test_style)) || - ParseBoolFlag(arg, kDeathTestUseFork, - >EST_FLAG(death_test_use_fork)) || - ParseBoolFlag(arg, kFailFast, >EST_FLAG(fail_fast)) || - ParseStringFlag(arg, kFilterFlag, >EST_FLAG(filter)) || - ParseStringFlag(arg, kInternalRunDeathTestFlag, - >EST_FLAG(internal_run_death_test)) || - ParseBoolFlag(arg, kListTestsFlag, >EST_FLAG(list_tests)) || - ParseStringFlag(arg, kOutputFlag, >EST_FLAG(output)) || - ParseBoolFlag(arg, kBriefFlag, >EST_FLAG(brief)) || - ParseBoolFlag(arg, kPrintTimeFlag, >EST_FLAG(print_time)) || - ParseBoolFlag(arg, kPrintUTF8Flag, >EST_FLAG(print_utf8)) || - ParseInt32Flag(arg, kRandomSeedFlag, >EST_FLAG(random_seed)) || - ParseInt32Flag(arg, kRepeatFlag, >EST_FLAG(repeat)) || - ParseBoolFlag(arg, kShuffleFlag, >EST_FLAG(shuffle)) || - ParseInt32Flag(arg, kStackTraceDepthFlag, - >EST_FLAG(stack_trace_depth)) || - ParseStringFlag(arg, kStreamResultToFlag, - >EST_FLAG(stream_result_to)) || - ParseBoolFlag(arg, kThrowOnFailureFlag, >EST_FLAG(throw_on_failure)); +#define GTEST_INTERNAL_PARSE_FLAG(flag_name) \ + do { \ + auto value = GTEST_FLAG_GET(flag_name); \ + if (ParseFlag(arg, #flag_name, &value)) { \ + GTEST_FLAG_SET(flag_name, value); \ + return true; \ + } \ + } while (false) + + GTEST_INTERNAL_PARSE_FLAG(also_run_disabled_tests); + GTEST_INTERNAL_PARSE_FLAG(break_on_failure); + GTEST_INTERNAL_PARSE_FLAG(catch_exceptions); + GTEST_INTERNAL_PARSE_FLAG(color); + GTEST_INTERNAL_PARSE_FLAG(death_test_style); + GTEST_INTERNAL_PARSE_FLAG(death_test_use_fork); + GTEST_INTERNAL_PARSE_FLAG(fail_fast); + GTEST_INTERNAL_PARSE_FLAG(filter); + GTEST_INTERNAL_PARSE_FLAG(internal_run_death_test); + GTEST_INTERNAL_PARSE_FLAG(list_tests); + GTEST_INTERNAL_PARSE_FLAG(output); + GTEST_INTERNAL_PARSE_FLAG(brief); + GTEST_INTERNAL_PARSE_FLAG(print_time); + 
GTEST_INTERNAL_PARSE_FLAG(print_utf8); + GTEST_INTERNAL_PARSE_FLAG(random_seed); + GTEST_INTERNAL_PARSE_FLAG(repeat); + GTEST_INTERNAL_PARSE_FLAG(recreate_environments_when_repeating); + GTEST_INTERNAL_PARSE_FLAG(shuffle); + GTEST_INTERNAL_PARSE_FLAG(stack_trace_depth); + GTEST_INTERNAL_PARSE_FLAG(stream_result_to); + GTEST_INTERNAL_PARSE_FLAG(throw_on_failure); + return false; } #if GTEST_USE_OWN_FLAGFILE_FLAG_ static void LoadFlagsFromFile(const std::string& path) { FILE* flagfile = posix::FOpen(path.c_str(), "r"); if (!flagfile) { - GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile) + GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG_GET(flagfile) << "\""; } std::string contents(ReadEntireFile(flagfile)); @@ -6539,10 +6567,8 @@ static void LoadFlagsFromFile(const std::string& path) { std::vector<std::string> lines; SplitString(contents, '\n', &lines); for (size_t i = 0; i < lines.size(); ++i) { - if (lines[i].empty()) - continue; - if (!ParseGoogleTestFlag(lines[i].c_str())) - g_help_flag = true; + if (lines[i].empty()) continue; + if (!ParseGoogleTestFlag(lines[i].c_str())) g_help_flag = true; } } #endif // GTEST_USE_OWN_FLAGFILE_FLAG_ @@ -6552,25 +6578,23 @@ static void LoadFlagsFromFile(const std::string& path) { // instantiated to either char or wchar_t. 
template <typename CharType> void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { + std::string flagfile_value; for (int i = 1; i < *argc; i++) { const std::string arg_string = StreamableToString(argv[i]); const char* const arg = arg_string.c_str(); - using internal::ParseBoolFlag; - using internal::ParseInt32Flag; - using internal::ParseStringFlag; + using internal::ParseFlag; bool remove_flag = false; if (ParseGoogleTestFlag(arg)) { remove_flag = true; #if GTEST_USE_OWN_FLAGFILE_FLAG_ - } else if (ParseStringFlag(arg, kFlagfileFlag, >EST_FLAG(flagfile))) { - LoadFlagsFromFile(GTEST_FLAG(flagfile)); + } else if (ParseFlag(arg, "flagfile", &flagfile_value)) { + GTEST_FLAG_SET(flagfile, flagfile_value); + LoadFlagsFromFile(flagfile_value); remove_flag = true; #endif // GTEST_USE_OWN_FLAGFILE_FLAG_ - } else if (arg_string == "--help" || arg_string == "-h" || - arg_string == "-?" || arg_string == "/?" || - HasGoogleTestFlagPrefix(arg)) { + } else if (arg_string == "--help" || HasGoogleTestFlagPrefix(arg)) { // Both help flag and unrecognized Google Test flags (excluding // internal ones) trigger help display. g_help_flag = true; @@ -6605,7 +6629,27 @@ void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { // Parses the command line for Google Test flags, without initializing // other parts of Google Test. void ParseGoogleTestFlagsOnly(int* argc, char** argv) { +#if GTEST_HAS_ABSL + if (*argc > 0) { + // absl::ParseCommandLine() requires *argc > 0. + auto positional_args = absl::flags_internal::ParseCommandLineImpl( + *argc, argv, absl::flags_internal::ArgvListAction::kRemoveParsedArgs, + absl::flags_internal::UsageFlagsAction::kHandleUsage, + absl::flags_internal::OnUndefinedFlag::kReportUndefined); + // Any command-line positional arguments not part of any command-line flag + // (or arguments to a flag) are copied back out to argv, with the program + // invocation name at position 0, and argc is resized. 
This includes + // positional arguments after the flag-terminating delimiter '--'. + // See https://abseil.io/docs/cpp/guides/flags. + std::copy(positional_args.begin(), positional_args.end(), argv); + if (static_cast<int>(positional_args.size()) < *argc) { + argv[positional_args.size()] = nullptr; + *argc = static_cast<int>(positional_args.size()); + } + } +#else ParseGoogleTestFlagsOnlyImpl(argc, argv); +#endif // Fix the value of *_NSGetArgc() on macOS, but if and only if // *_NSGetArgv() == argv @@ -6640,6 +6684,12 @@ void InitGoogleTestImpl(int* argc, CharType** argv) { #if GTEST_HAS_ABSL absl::InitializeSymbolizer(g_argvs[0].c_str()); + + // When using the Abseil Flags library, set the program usage message to the + // help message, but remove the color-encoding from the message first. + absl::SetProgramUsageMessage(absl::StrReplaceAll( + kColorEncodedHelpMessage, + {{"@D", ""}, {"@R", ""}, {"@G", ""}, {"@Y", ""}, {"@@", "@"}})); #endif // GTEST_HAS_ABSL ParseGoogleTestFlagsOnly(argc, argv); @@ -6660,7 +6710,7 @@ void InitGoogleTestImpl(int* argc, CharType** argv) { void InitGoogleTest(int* argc, char** argv) { #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv); -#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) internal::InitGoogleTestImpl(argc, argv); #endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) } @@ -6670,7 +6720,7 @@ void InitGoogleTest(int* argc, char** argv) { void InitGoogleTest(int* argc, wchar_t** argv) { #if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv); -#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) internal::InitGoogleTestImpl(argc, argv); #endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) } @@ -6686,42 +6736,42 @@ void InitGoogleTest() { #if 
defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv); -#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) internal::InitGoogleTestImpl(&argc, argv); #endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) } +#if !defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_) +// Return value of first environment variable that is set and contains +// a non-empty string. If there are none, return the "fallback" string. +// Since we like the temporary directory to have a directory separator suffix, +// add it if not provided in the environment variable value. +static std::string GetTempDirFromEnv( + std::initializer_list<const char*> environment_variables, + const char* fallback, char separator) { + for (const char* variable_name : environment_variables) { + const char* value = internal::posix::GetEnv(variable_name); + if (value != nullptr && value[0] != '\0') { + if (value[strlen(value) - 1] != separator) { + return std::string(value).append(1, separator); + } + return value; + } + } + return fallback; +} +#endif + std::string TempDir() { #if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_) return GTEST_CUSTOM_TEMPDIR_FUNCTION_(); -#elif GTEST_OS_WINDOWS_MOBILE - return "\\temp\\"; -#elif GTEST_OS_WINDOWS - const char* temp_dir = internal::posix::GetEnv("TEMP"); - if (temp_dir == nullptr || temp_dir[0] == '\0') { - return "\\temp\\"; - } else if (temp_dir[strlen(temp_dir) - 1] == '\\') { - return temp_dir; - } else { - return std::string(temp_dir) + "\\"; - } +#elif GTEST_OS_WINDOWS || GTEST_OS_WINDOWS_MOBILE + return GetTempDirFromEnv({"TEST_TMPDIR", "TEMP"}, "\\temp\\", '\\'); #elif GTEST_OS_LINUX_ANDROID - const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR"); - if (temp_dir == nullptr || temp_dir[0] == '\0') { - return "/data/local/tmp/"; - } else { - return temp_dir; - } -#elif GTEST_OS_LINUX - const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR"); - if 
(temp_dir == nullptr || temp_dir[0] == '\0') { - return "/tmp/"; - } else { - return temp_dir; - } + return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/data/local/tmp/", '/'); #else - return "/tmp/"; -#endif // GTEST_OS_WINDOWS_MOBILE + return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/tmp/", '/'); +#endif } // Class ScopedTrace @@ -6738,8 +6788,7 @@ void ScopedTrace::PushTrace(const char* file, int line, std::string message) { } // Pops the info pushed by the c'tor. -ScopedTrace::~ScopedTrace() - GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { +ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { UnitTest::GetInstance()->PopGTestTrace(); } diff --git a/libvpx/third_party/googletest/src/src/gtest_main.cc b/libvpx/third_party/googletest/src/src/gtest_main.cc index 46b27c3d7..44976375c 100644 --- a/libvpx/third_party/googletest/src/src/gtest_main.cc +++ b/libvpx/third_party/googletest/src/src/gtest_main.cc @@ -28,15 +28,14 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <cstdio> + #include "gtest/gtest.h" #if GTEST_OS_ESP8266 || GTEST_OS_ESP32 #if GTEST_OS_ESP8266 extern "C" { #endif -void setup() { - testing::InitGoogleTest(); -} +void setup() { testing::InitGoogleTest(); } void loop() { RUN_ALL_TESTS(); } diff --git a/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py b/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py index 2d6a4d811..d763de856 100644 --- a/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py +++ b/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py @@ -83,7 +83,7 @@ class ExhaustNeighbor(MotionEST): self.beta = beta self.metric = metric super(ExhaustNeighbor, self).__init__(cur_f, ref_f, blk_size) - self.assign = np.zeros((self.num_row, self.num_col), dtype=np.bool) + self.assign = np.zeros((self.num_row, self.num_col), dtype=bool) """ estimate neighbor loss: diff --git a/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py b/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py index 12bc53ff7..37305898a 100644 --- a/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py +++ b/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py @@ -29,7 +29,7 @@ class GroundTruth(MotionEST): def __init__(self, cur_f, ref_f, blk_sz, gt_path, mf=None, mask=None): self.name = 'ground truth' super(GroundTruth, self).__init__(cur_f, ref_f, blk_sz) - self.mask = np.zeros((self.num_row, self.num_col), dtype=np.bool) + self.mask = np.zeros((self.num_row, self.num_col), dtype=bool) if gt_path: with open(gt_path) as gt_file: lines = gt_file.readlines() @@ -42,7 +42,7 @@ class GroundTruth(MotionEST): self.mask[i, -j - 1] = True continue #the order of original file is flipped on the x axis - self.mf[i, -j - 1] = np.array([float(y), -float(x)], dtype=np.int) + self.mf[i, -j - 1] = np.array([float(y), -float(x)], dtype=int) else: self.mf = mf self.mask = mask diff --git a/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py b/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py index 0959530fa..fc393818d 100644 --- 
a/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py +++ b/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py @@ -28,8 +28,8 @@ class MotionEST(object): self.ref_f = ref_f self.blk_sz = blk_sz #convert RGB to YUV - self.cur_yuv = np.array(self.cur_f.convert('YCbCr'), dtype=np.int) - self.ref_yuv = np.array(self.ref_f.convert('YCbCr'), dtype=np.int) + self.cur_yuv = np.array(self.cur_f.convert('YCbCr'), dtype=int) + self.ref_yuv = np.array(self.ref_f.convert('YCbCr'), dtype=int) #frame size self.width = self.cur_f.size[0] self.height = self.cur_f.size[1] diff --git a/libvpx/tools/3D-Reconstruction/MotionEST/Util.py b/libvpx/tools/3D-Reconstruction/MotionEST/Util.py index 551881cfd..c2416163b 100644 --- a/libvpx/tools/3D-Reconstruction/MotionEST/Util.py +++ b/libvpx/tools/3D-Reconstruction/MotionEST/Util.py @@ -18,7 +18,7 @@ from PIL import Image, ImageDraw def MSE(blk1, blk2): return np.mean( LA.norm( - np.array(blk1, dtype=np.int) - np.array(blk2, dtype=np.int), axis=2)) + np.array(blk1, dtype=int) - np.array(blk2, dtype=int), axis=2)) def drawMF(img, blk_sz, mf): diff --git a/libvpx/vp8/common/findnearmv.c b/libvpx/vp8/common/findnearmv.c index 6889fdedd..3b3192362 100644 --- a/libvpx/vp8/common/findnearmv.c +++ b/libvpx/vp8/common/findnearmv.c @@ -105,9 +105,9 @@ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, tmp = near_mv_ref_cnts[CNT_NEAREST]; near_mv_ref_cnts[CNT_NEAREST] = near_mv_ref_cnts[CNT_NEAR]; near_mv_ref_cnts[CNT_NEAR] = tmp; - tmp = near_mvs[CNT_NEAREST].as_int; + tmp = (int)near_mvs[CNT_NEAREST].as_int; near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; - near_mvs[CNT_NEAR].as_int = tmp; + near_mvs[CNT_NEAR].as_int = (uint32_t)tmp; } /* Use near_mvs[0] to store the "best" MV */ diff --git a/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/libvpx/vp8/common/mips/dspr2/filter_dspr2.c index e46827b0e..b9da52084 100644 --- a/libvpx/vp8/common/mips/dspr2/filter_dspr2.c +++ 
b/libvpx/vp8/common/mips/dspr2/filter_dspr2.c @@ -816,8 +816,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr, : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) - : [src_pixels_per_line] "r"(src_pixels_per_line), - [output_ptr] "r"(output_ptr)); + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); __asm__ __volatile__( "ulw %[Temp1], 0(%[src_ptr]) \n\t" @@ -832,8 +832,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr, : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) - : [src_pixels_per_line] "r"(src_pixels_per_line), - [output_ptr] "r"(output_ptr)); + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); __asm__ __volatile__( "ulw %[Temp1], 0(%[src_ptr]) \n\t" @@ -848,8 +848,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr, : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) - : [src_pixels_per_line] "r"(src_pixels_per_line), - [output_ptr] "r"(output_ptr)); + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); output_ptr += 48; } diff --git a/libvpx/vp8/common/mips/msa/vp8_macros_msa.h b/libvpx/vp8/common/mips/msa/vp8_macros_msa.h index ddc881a7f..7cb3c9869 100644 --- a/libvpx/vp8/common/mips/msa/vp8_macros_msa.h +++ b/libvpx/vp8/common/mips/msa/vp8_macros_msa.h @@ -69,12 +69,12 @@ #else // !(__mips == 64) #define LD(psrc) \ ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + const uint8_t *psrc_ld = (const uint8_t *)(psrc); \ uint32_t val0_m, val1_m; \ uint64_t val_m = 0; \ \ - val0_m = LW(psrc_m); \ - val1_m = LW(psrc_m + 4); \ + val0_m = LW(psrc_ld); \ + val1_m = LW(psrc_ld + 4); \ \ val_m = (uint64_t)(val1_m); \ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ @@ -122,10 +122,11 @@ 
const uint8_t *psrc_m = (const uint8_t *)(psrc); \ uint32_t val_m; \ \ - asm volatile("lwr %[val_m], 0(%[psrc_m]) \n\t" \ - "lwl %[val_m], 3(%[psrc_m]) \n\t" \ - : [val_m] "=&r"(val_m) \ - : [psrc_m] "r"(psrc_m)); \ + asm volatile( \ + "lwr %[val_m], 0(%[psrc_m]) \n\t" \ + "lwl %[val_m], 3(%[psrc_m]) \n\t" \ + : [val_m] "=&r"(val_m) \ + : [psrc_m] "r"(psrc_m)); \ \ val_m; \ }) @@ -136,10 +137,11 @@ const uint8_t *psrc_m = (const uint8_t *)(psrc); \ uint64_t val_m = 0; \ \ - asm volatile("ldr %[val_m], 0(%[psrc_m]) \n\t" \ - "ldl %[val_m], 7(%[psrc_m]) \n\t" \ - : [val_m] "=&r"(val_m) \ - : [psrc_m] "r"(psrc_m)); \ + asm volatile( \ + "ldr %[val_m], 0(%[psrc_m]) \n\t" \ + "ldl %[val_m], 7(%[psrc_m]) \n\t" \ + : [val_m] "=&r"(val_m) \ + : [psrc_m] "r"(psrc_m)); \ \ val_m; \ }) diff --git a/libvpx/vp8/decoder/dboolhuff.h b/libvpx/vp8/decoder/dboolhuff.h index f2a18f0d9..673b2fbd5 100644 --- a/libvpx/vp8/decoder/dboolhuff.h +++ b/libvpx/vp8/decoder/dboolhuff.h @@ -15,6 +15,7 @@ #include <limits.h> #include "./vpx_config.h" +#include "vpx_ports/compiler_attributes.h" #include "vpx_ports/mem.h" #include "vpx/vp8dx.h" #include "vpx/vpx_integer.h" @@ -50,7 +51,8 @@ int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source, void vp8dx_bool_decoder_fill(BOOL_DECODER *br); -static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) { +static VPX_NO_UNSIGNED_SHIFT_CHECK int vp8dx_decode_bool(BOOL_DECODER *br, + int probability) { unsigned int bit = 0; VP8_BD_VALUE value; unsigned int split; diff --git a/libvpx/vp8/decoder/decodemv.c b/libvpx/vp8/decoder/decodemv.c index 51817a2cb..3f459d623 100644 --- a/libvpx/vp8/decoder/decodemv.c +++ b/libvpx/vp8/decoder/decodemv.c @@ -372,9 +372,9 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, tmp = cnt[CNT_NEAREST]; cnt[CNT_NEAREST] = cnt[CNT_NEAR]; cnt[CNT_NEAR] = tmp; - tmp = near_mvs[CNT_NEAREST].as_int; + tmp = (int)near_mvs[CNT_NEAREST].as_int; near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; 
- near_mvs[CNT_NEAR].as_int = tmp; + near_mvs[CNT_NEAR].as_int = (uint32_t)tmp; } if (vp8_read(bc, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) { diff --git a/libvpx/vp8/decoder/onyxd_int.h b/libvpx/vp8/decoder/onyxd_int.h index cf2c066d9..a6bedc4fa 100644 --- a/libvpx/vp8/decoder/onyxd_int.h +++ b/libvpx/vp8/decoder/onyxd_int.h @@ -11,6 +11,8 @@ #ifndef VPX_VP8_DECODER_ONYXD_INT_H_ #define VPX_VP8_DECODER_ONYXD_INT_H_ +#include <assert.h> + #include "vpx_config.h" #include "vp8/common/onyxd.h" #include "treereader.h" @@ -136,6 +138,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); #if CONFIG_DEBUG #define CHECK_MEM_ERROR(lval, expr) \ do { \ + assert(pbi->common.error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ @@ -145,6 +148,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); #else #define CHECK_MEM_ERROR(lval, expr) \ do { \ + assert(pbi->common.error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ diff --git a/libvpx/vp8/encoder/bitstream.c b/libvpx/vp8/encoder/bitstream.c index 0e97af5f2..190b013af 100644 --- a/libvpx/vp8/encoder/bitstream.c +++ b/libvpx/vp8/encoder/bitstream.c @@ -19,6 +19,7 @@ #include <limits.h> #include "vpx/vpx_encoder.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/compiler_attributes.h" #include "vpx_ports/system_state.h" #include "bitstream.h" @@ -117,7 +118,9 @@ static void write_split(vp8_writer *bc, int x) { vp8_mbsplit_encodings + x); } -void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) { +void VPX_NO_UNSIGNED_SHIFT_CHECK vp8_pack_tokens(vp8_writer *w, + const TOKENEXTRA *p, + int xcount) { const TOKENEXTRA *stop = p + xcount; unsigned int split; int shift; diff --git a/libvpx/vp8/encoder/encodemv.c b/libvpx/vp8/encoder/encodemv.c index c88ea1653..384bb2938 100644 --- a/libvpx/vp8/encoder/encodemv.c +++ b/libvpx/vp8/encoder/encodemv.c @@ -31,17 +31,15 @@ static 
void encode_mvcomponent(vp8_writer *const w, const int v, vp8_write(w, 1, p[mvpis_short]); - do + do { vp8_write(w, (x >> i) & 1, p[MVPbits + i]); - - while (++i < 3); + } while (++i < 3); i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */ - do + do { vp8_write(w, (x >> i) & 1, p[MVPbits + i]); - - while (--i > 3); + } while (--i > 3); if (x & 0xFFF0) vp8_write(w, (x >> 3) & 1, p[MVPbits + 3]); } diff --git a/libvpx/vp8/encoder/firstpass.c b/libvpx/vp8/encoder/firstpass.c index ed177e3cb..65d2681c9 100644 --- a/libvpx/vp8/encoder/firstpass.c +++ b/libvpx/vp8/encoder/firstpass.c @@ -903,9 +903,9 @@ static double calc_correction_factor(double err_per_mb, double err_devisor, correction_factor = pow(error_term, power_term); /* Clip range */ - correction_factor = (correction_factor < 0.05) - ? 0.05 - : (correction_factor > 5.0) ? 5.0 : correction_factor; + correction_factor = (correction_factor < 0.05) ? 0.05 + : (correction_factor > 5.0) ? 5.0 + : correction_factor; return correction_factor; } @@ -947,11 +947,10 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, } cpi->twopass.est_max_qcorrection_factor = - (cpi->twopass.est_max_qcorrection_factor < 0.1) - ? 0.1 - : (cpi->twopass.est_max_qcorrection_factor > 10.0) - ? 10.0 - : cpi->twopass.est_max_qcorrection_factor; + (cpi->twopass.est_max_qcorrection_factor < 0.1) ? 0.1 + : (cpi->twopass.est_max_qcorrection_factor > 10.0) + ? 10.0 + : cpi->twopass.est_max_qcorrection_factor; } /* Corrections for higher compression speed settings @@ -1178,10 +1177,9 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, } else { current_spend_ratio = (double)cpi->long_rolling_actual_bits / (double)cpi->long_rolling_target_bits; - current_spend_ratio = - (current_spend_ratio > 10.0) - ? 10.0 - : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio; + current_spend_ratio = (current_spend_ratio > 10.0) ? 10.0 + : (current_spend_ratio < 0.1) ? 
0.1 + : current_spend_ratio; } /* Calculate a correction factor based on the quality of prediction in @@ -1968,11 +1966,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } cpi->twopass.gf_group_bits = - (cpi->twopass.gf_group_bits < 0) - ? 0 - : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) - ? cpi->twopass.kf_group_bits - : cpi->twopass.gf_group_bits; + (cpi->twopass.gf_group_bits < 0) ? 0 + : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) + ? cpi->twopass.kf_group_bits + : cpi->twopass.gf_group_bits; /* Clip cpi->twopass.gf_group_bits based on user supplied data rate * variability limit (cpi->oxcf.two_pass_vbrmax_section) diff --git a/libvpx/vp8/encoder/loongarch/quantize_lsx.c b/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c index 75889192a..75889192a 100644 --- a/libvpx/vp8/encoder/loongarch/quantize_lsx.c +++ b/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c diff --git a/libvpx/vp8/encoder/mcomp.c b/libvpx/vp8/encoder/mcomp.c index ae092c66e..b92e2135e 100644 --- a/libvpx/vp8/encoder/mcomp.c +++ b/libvpx/vp8/encoder/mcomp.c @@ -204,20 +204,21 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) { /* returns distortion + motion vector cost */ #define ERR(r, c) (MVC(r, c) + DIST(r, c)) /* checks if (r,c) has better score than previous best */ -#define CHECK_BETTER(v, r, c) \ - do { \ - IFMVCV(r, c, \ - { \ - thismse = DIST(r, c); \ - if ((v = (MVC(r, c) + thismse)) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - }, \ - v = UINT_MAX;) \ +#define CHECK_BETTER(v, r, c) \ + do { \ + IFMVCV( \ + r, c, \ + { \ + thismse = DIST(r, c); \ + if ((v = (MVC(r, c) + thismse)) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + }, \ + v = UINT_MAX;) \ } while (0) int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, diff --git a/libvpx/vp8/encoder/onyx_if.c 
b/libvpx/vp8/encoder/onyx_if.c index ffb3867dd..4bbeadef0 100644 --- a/libvpx/vp8/encoder/onyx_if.c +++ b/libvpx/vp8/encoder/onyx_if.c @@ -328,8 +328,8 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, // for any "new" layers. For "existing" layers, let them inherit the parameters // from the previous layer state (at the same layer #). In future we may want // to better map the previous layer state(s) to the "new" ones. -static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, - const int prev_num_layers) { +void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, + const int prev_num_layers) { int i; double prev_layer_framerate = 0; const int curr_num_layers = cpi->oxcf.number_of_layers; @@ -1643,7 +1643,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cpi->temporal_layer_id = 0; } cpi->temporal_pattern_counter = 0; - reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers); + vp8_reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers); } if (!cpi->initial_width) { @@ -4202,11 +4202,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } /* Clamp cpi->zbin_over_quant */ - cpi->mb.zbin_over_quant = (cpi->mb.zbin_over_quant < zbin_oq_low) - ? zbin_oq_low - : (cpi->mb.zbin_over_quant > zbin_oq_high) - ? zbin_oq_high - : cpi->mb.zbin_over_quant; + cpi->mb.zbin_over_quant = + (cpi->mb.zbin_over_quant < zbin_oq_low) ? zbin_oq_low + : (cpi->mb.zbin_over_quant > zbin_oq_high) ? 
zbin_oq_high + : cpi->mb.zbin_over_quant; Loop = Q != last_q; } else { diff --git a/libvpx/vp8/encoder/onyx_int.h b/libvpx/vp8/encoder/onyx_int.h index 424f51b18..46a17913a 100644 --- a/libvpx/vp8/encoder/onyx_int.h +++ b/libvpx/vp8/encoder/onyx_int.h @@ -11,7 +11,9 @@ #ifndef VPX_VP8_ENCODER_ONYX_INT_H_ #define VPX_VP8_ENCODER_ONYX_INT_H_ +#include <assert.h> #include <stdio.h> + #include "vpx_config.h" #include "vp8/common/onyx.h" #include "treewriter.h" @@ -483,7 +485,7 @@ typedef struct VP8_COMP { unsigned char *segmentation_map; signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; - int segment_encode_breakout[MAX_MB_SEGMENTS]; + unsigned int segment_encode_breakout[MAX_MB_SEGMENTS]; unsigned char *active_map; unsigned int active_map_enabled; @@ -711,6 +713,8 @@ void vp8_initialize_enc(void); void vp8_alloc_compressor_data(VP8_COMP *cpi); int vp8_reverse_trans(int x); +void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, + const int prev_num_layers); void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, const int layer, double prev_layer_framerate); @@ -730,6 +734,7 @@ void vp8_set_speed_features(VP8_COMP *cpi); #if CONFIG_DEBUG #define CHECK_MEM_ERROR(lval, expr) \ do { \ + assert(cpi->common.error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ @@ -739,6 +744,7 @@ void vp8_set_speed_features(VP8_COMP *cpi); #else #define CHECK_MEM_ERROR(lval, expr) \ do { \ + assert(cpi->common.error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ diff --git a/libvpx/vp8/encoder/rdopt.c b/libvpx/vp8/encoder/rdopt.c index 5821fc734..bbddacf8f 100644 --- a/libvpx/vp8/encoder/rdopt.c +++ b/libvpx/vp8/encoder/rdopt.c @@ -1608,7 +1608,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4], RATE_DISTORTION *rd, unsigned int q2dc = xd->block[24].dequant[0]; /* If theres is no codeable 2nd order dc or a very small uniform 
pixel change change */ - if ((sse - var<q2dc * q2dc>> 4) || (sse / 2 > var && sse - var < 64)) { + if ((sse - var < q2dc * q2dc >> 4) || (sse / 2 > var && sse - var < 64)) { /* Check u and v to make sure skip is ok */ unsigned int sse2 = VP8_UVSSE(x); if (sse2 * 2 < threshold) { diff --git a/libvpx/vp8/encoder/x86/denoising_sse2.c b/libvpx/vp8/encoder/x86/denoising_sse2.c index 89cad5335..f35b93016 100644 --- a/libvpx/vp8/encoder/x86/denoising_sse2.c +++ b/libvpx/vp8/encoder/x86/denoising_sse2.c @@ -30,7 +30,7 @@ static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) { _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8)); const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4)); - unsigned int sum_diff = abs(_mm_cvtsi128_si32(hgfedcba)); + unsigned int sum_diff = (unsigned int)abs(_mm_cvtsi128_si32(hgfedcba)); return sum_diff; } diff --git a/libvpx/vp8/encoder/x86/quantize_sse4.c b/libvpx/vp8/encoder/x86/quantize_sse4.c index 6d03365fc..4c2d24cc2 100644 --- a/libvpx/vp8/encoder/x86/quantize_sse4.c +++ b/libvpx/vp8/encoder/x86/quantize_sse4.c @@ -13,8 +13,11 @@ #include "./vp8_rtcd.h" #include "vp8/encoder/block.h" #include "vpx_ports/bitops.h" /* get_lsb */ +#include "vpx_ports/compiler_attributes.h" -void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { +// Unsigned shift overflow is disabled for the use of ~1U << eob with ymask. 
+VPX_NO_UNSIGNED_SHIFT_CHECK void vp8_regular_quantize_b_sse4_1(BLOCK *b, + BLOCKD *d) { int eob = -1; short *zbin_boost_ptr = b->zrun_zbin_boost; __m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr)); diff --git a/libvpx/vp8/vp8_dx_iface.c b/libvpx/vp8/vp8_dx_iface.c index 6d88e5154..55a77ba7e 100644 --- a/libvpx/vp8/vp8_dx_iface.c +++ b/libvpx/vp8/vp8_dx_iface.c @@ -275,7 +275,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, void *user_priv, long deadline) { volatile vpx_codec_err_t res; volatile unsigned int resolution_change = 0; - unsigned int w, h; + volatile unsigned int w, h; if (!ctx->fragments.enabled && (data == NULL && data_sz == 0)) { return 0; diff --git a/libvpx/vp8/vp8_ratectrl_rtc.cc b/libvpx/vp8/vp8_ratectrl_rtc.cc index 2f23c5b1d..f3f42529d 100644 --- a/libvpx/vp8/vp8_ratectrl_rtc.cc +++ b/libvpx/vp8/vp8_ratectrl_rtc.cc @@ -92,6 +92,7 @@ void VP8RateControlRTC::UpdateRateControl( const VP8RateControlRtcConfig &rc_cfg) { VP8_COMMON *cm = &cpi_->common; VP8_CONFIG *oxcf = &cpi_->oxcf; + const unsigned int prev_number_of_layers = oxcf->number_of_layers; vpx_clear_system_state(); cm->Width = rc_cfg.width; cm->Height = rc_cfg.height; @@ -124,17 +125,33 @@ void VP8RateControlRTC::UpdateRateControl( static_cast<int>(cpi_->output_framerate); } - if (oxcf->number_of_layers > 1) { + if (oxcf->number_of_layers > 1 || prev_number_of_layers > 1) { memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate, sizeof(rc_cfg.layer_target_bitrate)); memcpy(oxcf->rate_decimator, rc_cfg.ts_rate_decimator, sizeof(rc_cfg.ts_rate_decimator)); - oxcf->periodicity = 2; + if (cm->current_video_frame == 0) { + double prev_layer_framerate = 0; + for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) { + vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate); + prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i]; + } + } else if (oxcf->number_of_layers != prev_number_of_layers) { + // The number of temporal 
layers has changed, so reset/initialize the + // temporal layer context for the new layer configuration: this means + // calling vp8_reset_temporal_layer_change() below. + + // Start at the base of the pattern cycle, so set the layer id to 0 and + // reset the temporal pattern counter. + // TODO(marpan/jianj): don't think lines 148-151 are needed (user controls + // the layer_id) so remove. + if (cpi_->temporal_layer_id > 0) { + cpi_->temporal_layer_id = 0; + } + cpi_->temporal_pattern_counter = 0; - double prev_layer_framerate = 0; - for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) { - vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate); - prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i]; + vp8_reset_temporal_layer_change(cpi_, oxcf, + static_cast<int>(prev_number_of_layers)); } } @@ -146,20 +163,24 @@ void VP8RateControlRTC::UpdateRateControl( cm->MBs = cm->mb_rows * cm->mb_cols; cm->mode_info_stride = cm->mb_cols + 1; - oxcf->starting_buffer_level = - rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000); - /* Set or reset optimal and maximum buffer levels. */ - if (oxcf->optimal_buffer_level == 0) { - oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8; - } else { - oxcf->optimal_buffer_level = - rescale((int)oxcf->optimal_buffer_level, oxcf->target_bandwidth, 1000); - } - if (oxcf->maximum_buffer_size == 0) { - oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8; - } else { - oxcf->maximum_buffer_size = - rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000); + // For temporal layers: starting/maximum/optimal_buffer_level is already set + // via vp8_init_temporal_layer_context() or vp8_reset_temporal_layer_change(). + if (oxcf->number_of_layers <= 1 && prev_number_of_layers <= 1) { + oxcf->starting_buffer_level = + rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000); + /* Set or reset optimal and maximum buffer levels. 
*/ + if (oxcf->optimal_buffer_level == 0) { + oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8; + } else { + oxcf->optimal_buffer_level = rescale((int)oxcf->optimal_buffer_level, + oxcf->target_bandwidth, 1000); + } + if (oxcf->maximum_buffer_size == 0) { + oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8; + } else { + oxcf->maximum_buffer_size = + rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000); + } } if (cpi_->bits_off_target > oxcf->maximum_buffer_size) { diff --git a/libvpx/vp8/vp8cx.mk b/libvpx/vp8/vp8cx.mk index 5744cbabc..b4b3fda9e 100644 --- a/libvpx/vp8/vp8cx.mk +++ b/libvpx/vp8/vp8cx.mk @@ -125,8 +125,8 @@ VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c endif # common (loongarch LSX intrinsics) -VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/quantize_lsx.c VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/dct_lsx.c VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/encodeopt_lsx.c +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/vp8_quantize_lsx.c VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes)) diff --git a/libvpx/vp9/common/vp9_common.h b/libvpx/vp9/common/vp9_common.h index 3cec53bfd..8d2bed38e 100644 --- a/libvpx/vp9/common/vp9_common.h +++ b/libvpx/vp9/common/vp9_common.h @@ -49,6 +49,7 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { #if CONFIG_DEBUG #define CHECK_MEM_ERROR(cm, lval, expr) \ do { \ + assert(&(cm)->error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ @@ -58,6 +59,7 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { #else #define CHECK_MEM_ERROR(cm, lval, expr) \ do { \ + assert(&(cm)->error.setjmp); \ (lval) = (expr); \ if (!(lval)) \ vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index 95d6029f3..765cb1172 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ 
b/libvpx/vp9/common/vp9_loopfilter.c @@ -1180,7 +1180,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, } // Disable filtering on the leftmost column - border_mask = ~(mi_col == 0 ? 1 : 0); + border_mask = ~(mi_col == 0 ? 1u : 0u); #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { highbd_filter_selectively_vert( diff --git a/libvpx/vp9/common/vp9_rtcd_defs.pl b/libvpx/vp9/common/vp9_rtcd_defs.pl index 4da0b6675..f4bd9772c 100644 --- a/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -129,10 +129,10 @@ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_ add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp neon sse2 avx2 vsx/, "$ssse3_x86_64"; +specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp_32x32 neon vsx/, "$ssse3_x86_64"; +specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_block_error avx2 sse2/; @@ -175,7 +175,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Motion search # add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv 
*center_mv"; -specialize qw/vp9_diamond_search_sad avx/; +specialize qw/vp9_diamond_search_sad avx neon/; # # Apply temporal filter @@ -196,11 +196,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # ENCODEMB INVOKE add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_highbd_quantize_fp avx2 neon/; add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ; + specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/; # fdct functions add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_highbd_fht4x4 neon/; add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; diff --git a/libvpx/vp9/common/vp9_scan.c b/libvpx/vp9/common/vp9_scan.c index 0fef26351..8bea61dea 100644 --- a/libvpx/vp9/common/vp9_scan.c +++ b/libvpx/vp9/common/vp9_scan.c @@ -511,180 +511,181 @@ DECLARE_ALIGNED(16, static const int16_t, 959, 990, 991, 1022, 0, 0, }; +// Add 1 to iscan values. This represents the EOB position instead of the index. 
DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_4x4[16]) = { - 0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15, + 1, 3, 6, 9, 2, 4, 10, 13, 5, 8, 12, 15, 7, 11, 14, 16, }; DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_4x4[16]) = { - 0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15, + 1, 4, 8, 12, 2, 6, 10, 13, 3, 7, 11, 15, 5, 9, 14, 16, }; DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_4x4[16]) = { - 0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15, + 1, 2, 4, 6, 3, 5, 7, 10, 8, 9, 12, 14, 11, 13, 15, 16, }; DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_8x8[64]) = { - 0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51, - 2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56, - 6, 12, 21, 27, 35, 43, 52, 58, 9, 17, 25, 33, 39, 48, 55, 60, - 14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63, + 1, 4, 9, 16, 23, 33, 41, 48, 2, 6, 12, 19, 27, 35, 45, 52, + 3, 8, 14, 21, 29, 39, 47, 55, 5, 11, 17, 25, 32, 42, 51, 57, + 7, 13, 22, 28, 36, 44, 53, 59, 10, 18, 26, 34, 40, 49, 56, 61, + 15, 24, 31, 38, 46, 54, 60, 63, 20, 30, 37, 43, 50, 58, 62, 64, }; DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_8x8[64]) = { - 0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39, - 6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52, - 18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59, - 32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63, + 1, 2, 3, 6, 9, 13, 20, 25, 4, 5, 8, 11, 16, 21, 31, 40, + 7, 10, 14, 17, 22, 28, 38, 47, 12, 15, 18, 24, 29, 35, 45, 53, + 19, 23, 26, 32, 36, 42, 51, 58, 27, 30, 34, 39, 44, 50, 56, 60, + 33, 37, 43, 48, 52, 55, 61, 62, 41, 46, 49, 54, 57, 59, 63, 64, }; DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_8x8[64]) = { - 0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44, - 3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53, - 12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60, - 25, 32, 39, 45, 
50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63, + 1, 3, 6, 10, 15, 23, 32, 38, 2, 5, 9, 14, 20, 27, 39, 45, + 4, 7, 11, 18, 25, 31, 43, 50, 8, 12, 16, 22, 30, 37, 48, 54, + 13, 17, 21, 28, 35, 44, 53, 58, 19, 24, 29, 36, 42, 49, 57, 61, + 26, 33, 40, 46, 51, 56, 60, 63, 34, 41, 47, 52, 55, 59, 62, 64, }; DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_16x16[256]) = { - 0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198, - 1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212, - 2, 8, 16, 25, 38, 52, 67, 83, 101, 116, 136, 157, 172, 190, 205, 216, - 3, 10, 18, 29, 41, 55, 71, 89, 103, 119, 141, 159, 176, 194, 208, 218, - 5, 12, 21, 32, 45, 58, 74, 93, 104, 123, 144, 164, 179, 196, 210, 223, - 7, 15, 26, 37, 49, 63, 78, 96, 112, 129, 146, 166, 182, 200, 215, 228, - 9, 19, 28, 39, 54, 69, 86, 102, 117, 132, 151, 170, 187, 206, 220, 230, - 13, 24, 35, 46, 60, 73, 91, 108, 122, 137, 154, 174, 189, 207, 224, 235, - 17, 30, 40, 53, 66, 82, 98, 115, 126, 142, 161, 180, 197, 213, 227, 237, - 22, 36, 48, 62, 76, 92, 105, 120, 133, 147, 167, 186, 203, 219, 232, 240, - 27, 44, 56, 70, 84, 99, 113, 127, 140, 156, 175, 193, 209, 226, 236, 244, - 33, 51, 68, 79, 94, 110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247, - 42, 61, 77, 90, 106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251, - 50, 72, 87, 100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253, - 57, 80, 97, 111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254, - 65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255, + 1, 5, 12, 21, 32, 44, 60, 76, 86, 110, 131, 151, 166, 182, 196, 199, + 2, 7, 15, 24, 35, 48, 65, 82, 96, 115, 136, 154, 172, 189, 202, 213, + 3, 9, 17, 26, 39, 53, 68, 84, 102, 117, 137, 158, 173, 191, 206, 217, + 4, 11, 19, 30, 42, 56, 72, 90, 104, 120, 142, 160, 177, 195, 209, 219, + 6, 13, 22, 33, 46, 59, 75, 94, 105, 124, 145, 165, 180, 197, 211, 224, + 8, 16, 27, 38, 50, 64, 79, 97, 113, 130, 147, 167, 183, 
201, 216, 229, + 10, 20, 29, 40, 55, 70, 87, 103, 118, 133, 152, 171, 188, 207, 221, 231, + 14, 25, 36, 47, 61, 74, 92, 109, 123, 138, 155, 175, 190, 208, 225, 236, + 18, 31, 41, 54, 67, 83, 99, 116, 127, 143, 162, 181, 198, 214, 228, 238, + 23, 37, 49, 63, 77, 93, 106, 121, 134, 148, 168, 187, 204, 220, 233, 241, + 28, 45, 57, 71, 85, 100, 114, 128, 141, 157, 176, 194, 210, 227, 237, 245, + 34, 52, 69, 80, 95, 111, 126, 139, 150, 163, 185, 203, 218, 230, 242, 248, + 43, 62, 78, 91, 107, 122, 135, 149, 161, 174, 192, 212, 226, 239, 246, 252, + 51, 73, 88, 101, 119, 129, 146, 159, 169, 184, 205, 223, 234, 243, 250, 254, + 58, 81, 98, 112, 132, 144, 156, 170, 179, 193, 215, 232, 240, 247, 251, 255, + 66, 89, 108, 125, 140, 153, 164, 178, 186, 200, 222, 235, 244, 249, 253, 256, }; DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_16x16[256]) = { - 0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76, - 86, 3, 5, 7, 11, 15, 19, 25, 32, 38, 48, 59, 68, 84, 99, - 115, 130, 8, 10, 13, 18, 23, 27, 33, 42, 51, 60, 72, 88, 103, - 119, 142, 167, 14, 16, 20, 26, 31, 37, 44, 53, 61, 73, 85, 100, - 116, 135, 161, 185, 21, 24, 30, 35, 40, 47, 55, 65, 74, 81, 94, - 112, 133, 154, 179, 205, 28, 34, 39, 45, 50, 58, 67, 77, 87, 96, - 106, 121, 146, 169, 196, 212, 41, 46, 49, 56, 63, 70, 79, 90, 98, - 107, 122, 138, 159, 182, 207, 222, 52, 57, 62, 69, 75, 83, 93, 102, - 110, 120, 134, 150, 176, 195, 215, 226, 66, 71, 78, 82, 91, 97, 108, - 113, 127, 136, 148, 168, 188, 202, 221, 232, 80, 89, 92, 101, 105, 114, - 125, 131, 139, 151, 162, 177, 192, 208, 223, 234, 95, 104, 109, 117, 123, - 128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239, 111, 118, 124, 129, - 140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240, 243, 126, 132, 137, - 145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237, 244, 246, 141, 149, - 156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238, 242, 249, 251, 152, - 163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236, 245, 247, 252, 253, - 158, 173, 187, 
194, 198, 209, 213, 217, 225, 229, 235, 241, 248, 250, 254, - 255, + 1, 2, 3, 5, 7, 10, 13, 18, 23, 30, 37, 44, 55, 65, 77, + 87, 4, 6, 8, 12, 16, 20, 26, 33, 39, 49, 60, 69, 85, 100, + 116, 131, 9, 11, 14, 19, 24, 28, 34, 43, 52, 61, 73, 89, 104, + 120, 143, 168, 15, 17, 21, 27, 32, 38, 45, 54, 62, 74, 86, 101, + 117, 136, 162, 186, 22, 25, 31, 36, 41, 48, 56, 66, 75, 82, 95, + 113, 134, 155, 180, 206, 29, 35, 40, 46, 51, 59, 68, 78, 88, 97, + 107, 122, 147, 170, 197, 213, 42, 47, 50, 57, 64, 71, 80, 91, 99, + 108, 123, 139, 160, 183, 208, 223, 53, 58, 63, 70, 76, 84, 94, 103, + 111, 121, 135, 151, 177, 196, 216, 227, 67, 72, 79, 83, 92, 98, 109, + 114, 128, 137, 149, 169, 189, 203, 222, 233, 81, 90, 93, 102, 106, 115, + 126, 132, 140, 152, 163, 178, 193, 209, 224, 235, 96, 105, 110, 118, 124, + 129, 144, 145, 156, 166, 176, 191, 207, 220, 234, 240, 112, 119, 125, 130, + 141, 148, 158, 165, 171, 182, 192, 204, 225, 231, 241, 244, 127, 133, 138, + 146, 154, 161, 175, 179, 185, 198, 205, 217, 232, 238, 245, 247, 142, 150, + 157, 167, 173, 181, 190, 200, 201, 211, 221, 229, 239, 243, 250, 252, 153, + 164, 172, 184, 187, 194, 202, 212, 215, 219, 228, 237, 246, 248, 253, 254, + 159, 174, 188, 195, 199, 210, 214, 218, 226, 230, 236, 242, 249, 251, 255, + 256, }; DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_16x16[256]) = { - 0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166, - 179, 1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154, - 178, 196, 3, 7, 11, 18, 25, 33, 46, 57, 71, 86, 101, 119, 148, - 164, 186, 201, 6, 12, 16, 23, 31, 39, 53, 64, 78, 92, 110, 127, - 153, 169, 193, 208, 10, 14, 19, 28, 37, 47, 58, 67, 84, 98, 114, - 133, 161, 176, 198, 214, 15, 21, 26, 34, 43, 52, 65, 77, 91, 106, - 120, 140, 165, 185, 205, 221, 22, 27, 32, 41, 48, 60, 73, 85, 99, - 116, 130, 151, 175, 190, 211, 225, 29, 35, 42, 49, 59, 69, 81, 95, - 108, 125, 139, 155, 182, 197, 217, 229, 38, 45, 51, 61, 68, 80, 93, - 105, 118, 134, 150, 168, 191, 207, 223, 234, 
50, 56, 63, 74, 83, 94, - 109, 117, 129, 147, 163, 177, 199, 213, 228, 238, 62, 70, 76, 87, 97, - 107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242, 75, 82, 90, 102, - 112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245, 89, 100, 111, - 123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250, 103, 115, - 126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248, 252, 121, - 135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244, 251, 254, - 137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247, 249, 253, - 255, + 1, 3, 6, 10, 18, 25, 37, 45, 56, 73, 89, 105, 129, 144, 167, + 180, 2, 5, 9, 14, 21, 31, 41, 55, 67, 80, 97, 114, 142, 155, + 179, 197, 4, 8, 12, 19, 26, 34, 47, 58, 72, 87, 102, 120, 149, + 165, 187, 202, 7, 13, 17, 24, 32, 40, 54, 65, 79, 93, 111, 128, + 154, 170, 194, 209, 11, 15, 20, 29, 38, 48, 59, 68, 85, 99, 115, + 134, 162, 177, 199, 215, 16, 22, 27, 35, 44, 53, 66, 78, 92, 107, + 121, 141, 166, 186, 206, 222, 23, 28, 33, 42, 49, 61, 74, 86, 100, + 117, 131, 152, 176, 191, 212, 226, 30, 36, 43, 50, 60, 70, 82, 96, + 109, 126, 140, 156, 183, 198, 218, 230, 39, 46, 52, 62, 69, 81, 94, + 106, 119, 135, 151, 169, 192, 208, 224, 235, 51, 57, 64, 75, 84, 95, + 110, 118, 130, 148, 164, 178, 200, 214, 229, 239, 63, 71, 77, 88, 98, + 108, 123, 132, 146, 160, 173, 189, 211, 223, 236, 243, 76, 83, 91, 103, + 113, 125, 139, 147, 158, 174, 188, 203, 220, 231, 241, 246, 90, 101, 112, + 124, 133, 143, 157, 168, 181, 190, 204, 217, 232, 238, 247, 251, 104, 116, + 127, 137, 150, 163, 172, 184, 195, 205, 216, 225, 237, 242, 249, 253, 122, + 136, 145, 159, 171, 182, 193, 201, 210, 219, 228, 234, 244, 245, 252, 255, + 138, 153, 161, 175, 185, 196, 207, 213, 221, 227, 233, 240, 248, 250, 254, + 256, }; DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = { - 0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, - 170, 193, 204, 210, 219, 229, 233, 245, 257, 275, 299, 342, 356, - 377, 405, 455, 471, 495, 
527, 1, 4, 8, 15, 22, 30, 45, - 58, 74, 92, 112, 133, 158, 184, 203, 215, 222, 228, 234, 237, - 256, 274, 298, 317, 355, 376, 404, 426, 470, 494, 526, 551, 3, - 7, 12, 18, 28, 36, 52, 64, 82, 102, 118, 142, 164, 189, - 208, 217, 224, 231, 235, 238, 273, 297, 316, 329, 375, 403, 425, - 440, 493, 525, 550, 567, 6, 11, 16, 23, 31, 43, 60, 73, - 90, 109, 126, 150, 173, 196, 211, 220, 226, 232, 236, 239, 296, - 315, 328, 335, 402, 424, 439, 447, 524, 549, 566, 575, 9, 14, - 19, 29, 37, 50, 65, 78, 95, 116, 134, 157, 179, 201, 214, - 223, 244, 255, 272, 295, 341, 354, 374, 401, 454, 469, 492, 523, - 582, 596, 617, 645, 13, 20, 26, 35, 44, 54, 72, 85, 105, - 123, 140, 163, 182, 205, 216, 225, 254, 271, 294, 314, 353, 373, - 400, 423, 468, 491, 522, 548, 595, 616, 644, 666, 21, 27, 33, - 42, 53, 63, 80, 94, 113, 132, 151, 172, 190, 209, 218, 227, - 270, 293, 313, 327, 372, 399, 422, 438, 490, 521, 547, 565, 615, - 643, 665, 680, 24, 32, 39, 48, 57, 71, 88, 104, 120, 139, - 159, 178, 197, 212, 221, 230, 292, 312, 326, 334, 398, 421, 437, - 446, 520, 546, 564, 574, 642, 664, 679, 687, 34, 40, 46, 56, - 68, 81, 96, 111, 130, 147, 167, 186, 243, 253, 269, 291, 340, - 352, 371, 397, 453, 467, 489, 519, 581, 594, 614, 641, 693, 705, - 723, 747, 41, 49, 55, 67, 77, 91, 107, 124, 138, 161, 177, - 194, 252, 268, 290, 311, 351, 370, 396, 420, 466, 488, 518, 545, - 593, 613, 640, 663, 704, 722, 746, 765, 51, 59, 66, 76, 89, - 99, 119, 131, 149, 168, 181, 200, 267, 289, 310, 325, 369, 395, - 419, 436, 487, 517, 544, 563, 612, 639, 662, 678, 721, 745, 764, - 777, 61, 69, 75, 87, 100, 114, 129, 144, 162, 180, 191, 207, - 288, 309, 324, 333, 394, 418, 435, 445, 516, 543, 562, 573, 638, - 661, 677, 686, 744, 763, 776, 783, 70, 79, 86, 97, 108, 122, - 137, 155, 242, 251, 266, 287, 339, 350, 368, 393, 452, 465, 486, - 515, 580, 592, 611, 637, 692, 703, 720, 743, 788, 798, 813, 833, - 84, 93, 103, 110, 125, 141, 154, 171, 250, 265, 286, 308, 349, - 367, 392, 417, 464, 485, 514, 542, 591, 
610, 636, 660, 702, 719, - 742, 762, 797, 812, 832, 848, 98, 106, 115, 127, 143, 156, 169, - 185, 264, 285, 307, 323, 366, 391, 416, 434, 484, 513, 541, 561, - 609, 635, 659, 676, 718, 741, 761, 775, 811, 831, 847, 858, 117, - 128, 136, 148, 160, 175, 188, 198, 284, 306, 322, 332, 390, 415, - 433, 444, 512, 540, 560, 572, 634, 658, 675, 685, 740, 760, 774, - 782, 830, 846, 857, 863, 135, 146, 152, 165, 241, 249, 263, 283, - 338, 348, 365, 389, 451, 463, 483, 511, 579, 590, 608, 633, 691, - 701, 717, 739, 787, 796, 810, 829, 867, 875, 887, 903, 153, 166, - 174, 183, 248, 262, 282, 305, 347, 364, 388, 414, 462, 482, 510, - 539, 589, 607, 632, 657, 700, 716, 738, 759, 795, 809, 828, 845, - 874, 886, 902, 915, 176, 187, 195, 202, 261, 281, 304, 321, 363, - 387, 413, 432, 481, 509, 538, 559, 606, 631, 656, 674, 715, 737, - 758, 773, 808, 827, 844, 856, 885, 901, 914, 923, 192, 199, 206, - 213, 280, 303, 320, 331, 386, 412, 431, 443, 508, 537, 558, 571, - 630, 655, 673, 684, 736, 757, 772, 781, 826, 843, 855, 862, 900, - 913, 922, 927, 240, 247, 260, 279, 337, 346, 362, 385, 450, 461, - 480, 507, 578, 588, 605, 629, 690, 699, 714, 735, 786, 794, 807, - 825, 866, 873, 884, 899, 930, 936, 945, 957, 246, 259, 278, 302, - 345, 361, 384, 411, 460, 479, 506, 536, 587, 604, 628, 654, 698, - 713, 734, 756, 793, 806, 824, 842, 872, 883, 898, 912, 935, 944, - 956, 966, 258, 277, 301, 319, 360, 383, 410, 430, 478, 505, 535, - 557, 603, 627, 653, 672, 712, 733, 755, 771, 805, 823, 841, 854, - 882, 897, 911, 921, 943, 955, 965, 972, 276, 300, 318, 330, 382, - 409, 429, 442, 504, 534, 556, 570, 626, 652, 671, 683, 732, 754, - 770, 780, 822, 840, 853, 861, 896, 910, 920, 926, 954, 964, 971, - 975, 336, 344, 359, 381, 449, 459, 477, 503, 577, 586, 602, 625, - 689, 697, 711, 731, 785, 792, 804, 821, 865, 871, 881, 895, 929, - 934, 942, 953, 977, 981, 987, 995, 343, 358, 380, 408, 458, 476, - 502, 533, 585, 601, 624, 651, 696, 710, 730, 753, 791, 803, 820, - 839, 870, 880, 894, 909, 933, 
941, 952, 963, 980, 986, 994, 1001, - 357, 379, 407, 428, 475, 501, 532, 555, 600, 623, 650, 670, 709, - 729, 752, 769, 802, 819, 838, 852, 879, 893, 908, 919, 940, 951, - 962, 970, 985, 993, 1000, 1005, 378, 406, 427, 441, 500, 531, 554, - 569, 622, 649, 669, 682, 728, 751, 768, 779, 818, 837, 851, 860, - 892, 907, 918, 925, 950, 961, 969, 974, 992, 999, 1004, 1007, 448, - 457, 474, 499, 576, 584, 599, 621, 688, 695, 708, 727, 784, 790, - 801, 817, 864, 869, 878, 891, 928, 932, 939, 949, 976, 979, 984, - 991, 1008, 1010, 1013, 1017, 456, 473, 498, 530, 583, 598, 620, 648, - 694, 707, 726, 750, 789, 800, 816, 836, 868, 877, 890, 906, 931, - 938, 948, 960, 978, 983, 990, 998, 1009, 1012, 1016, 1020, 472, 497, - 529, 553, 597, 619, 647, 668, 706, 725, 749, 767, 799, 815, 835, - 850, 876, 889, 905, 917, 937, 947, 959, 968, 982, 989, 997, 1003, - 1011, 1015, 1019, 1022, 496, 528, 552, 568, 618, 646, 667, 681, 724, - 748, 766, 778, 814, 834, 849, 859, 888, 904, 916, 924, 946, 958, - 967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021, 1023, + 1, 3, 6, 11, 18, 26, 39, 48, 63, 84, 102, 122, 146, + 171, 194, 205, 211, 220, 230, 234, 246, 258, 276, 300, 343, 357, + 378, 406, 456, 472, 496, 528, 2, 5, 9, 16, 23, 31, 46, + 59, 75, 93, 113, 134, 159, 185, 204, 216, 223, 229, 235, 238, + 257, 275, 299, 318, 356, 377, 405, 427, 471, 495, 527, 552, 4, + 8, 13, 19, 29, 37, 53, 65, 83, 103, 119, 143, 165, 190, + 209, 218, 225, 232, 236, 239, 274, 298, 317, 330, 376, 404, 426, + 441, 494, 526, 551, 568, 7, 12, 17, 24, 32, 44, 61, 74, + 91, 110, 127, 151, 174, 197, 212, 221, 227, 233, 237, 240, 297, + 316, 329, 336, 403, 425, 440, 448, 525, 550, 567, 576, 10, 15, + 20, 30, 38, 51, 66, 79, 96, 117, 135, 158, 180, 202, 215, + 224, 245, 256, 273, 296, 342, 355, 375, 402, 455, 470, 493, 524, + 583, 597, 618, 646, 14, 21, 27, 36, 45, 55, 73, 86, 106, + 124, 141, 164, 183, 206, 217, 226, 255, 272, 295, 315, 354, 374, + 401, 424, 469, 492, 523, 549, 596, 617, 645, 667, 22, 28, 34, + 43, 54, 
64, 81, 95, 114, 133, 152, 173, 191, 210, 219, 228, + 271, 294, 314, 328, 373, 400, 423, 439, 491, 522, 548, 566, 616, + 644, 666, 681, 25, 33, 40, 49, 58, 72, 89, 105, 121, 140, + 160, 179, 198, 213, 222, 231, 293, 313, 327, 335, 399, 422, 438, + 447, 521, 547, 565, 575, 643, 665, 680, 688, 35, 41, 47, 57, + 69, 82, 97, 112, 131, 148, 168, 187, 244, 254, 270, 292, 341, + 353, 372, 398, 454, 468, 490, 520, 582, 595, 615, 642, 694, 706, + 724, 748, 42, 50, 56, 68, 78, 92, 108, 125, 139, 162, 178, + 195, 253, 269, 291, 312, 352, 371, 397, 421, 467, 489, 519, 546, + 594, 614, 641, 664, 705, 723, 747, 766, 52, 60, 67, 77, 90, + 100, 120, 132, 150, 169, 182, 201, 268, 290, 311, 326, 370, 396, + 420, 437, 488, 518, 545, 564, 613, 640, 663, 679, 722, 746, 765, + 778, 62, 70, 76, 88, 101, 115, 130, 145, 163, 181, 192, 208, + 289, 310, 325, 334, 395, 419, 436, 446, 517, 544, 563, 574, 639, + 662, 678, 687, 745, 764, 777, 784, 71, 80, 87, 98, 109, 123, + 138, 156, 243, 252, 267, 288, 340, 351, 369, 394, 453, 466, 487, + 516, 581, 593, 612, 638, 693, 704, 721, 744, 789, 799, 814, 834, + 85, 94, 104, 111, 126, 142, 155, 172, 251, 266, 287, 309, 350, + 368, 393, 418, 465, 486, 515, 543, 592, 611, 637, 661, 703, 720, + 743, 763, 798, 813, 833, 849, 99, 107, 116, 128, 144, 157, 170, + 186, 265, 286, 308, 324, 367, 392, 417, 435, 485, 514, 542, 562, + 610, 636, 660, 677, 719, 742, 762, 776, 812, 832, 848, 859, 118, + 129, 137, 149, 161, 176, 189, 199, 285, 307, 323, 333, 391, 416, + 434, 445, 513, 541, 561, 573, 635, 659, 676, 686, 741, 761, 775, + 783, 831, 847, 858, 864, 136, 147, 153, 166, 242, 250, 264, 284, + 339, 349, 366, 390, 452, 464, 484, 512, 580, 591, 609, 634, 692, + 702, 718, 740, 788, 797, 811, 830, 868, 876, 888, 904, 154, 167, + 175, 184, 249, 263, 283, 306, 348, 365, 389, 415, 463, 483, 511, + 540, 590, 608, 633, 658, 701, 717, 739, 760, 796, 810, 829, 846, + 875, 887, 903, 916, 177, 188, 196, 203, 262, 282, 305, 322, 364, + 388, 414, 433, 482, 510, 539, 560, 
607, 632, 657, 675, 716, 738, + 759, 774, 809, 828, 845, 857, 886, 902, 915, 924, 193, 200, 207, + 214, 281, 304, 321, 332, 387, 413, 432, 444, 509, 538, 559, 572, + 631, 656, 674, 685, 737, 758, 773, 782, 827, 844, 856, 863, 901, + 914, 923, 928, 241, 248, 261, 280, 338, 347, 363, 386, 451, 462, + 481, 508, 579, 589, 606, 630, 691, 700, 715, 736, 787, 795, 808, + 826, 867, 874, 885, 900, 931, 937, 946, 958, 247, 260, 279, 303, + 346, 362, 385, 412, 461, 480, 507, 537, 588, 605, 629, 655, 699, + 714, 735, 757, 794, 807, 825, 843, 873, 884, 899, 913, 936, 945, + 957, 967, 259, 278, 302, 320, 361, 384, 411, 431, 479, 506, 536, + 558, 604, 628, 654, 673, 713, 734, 756, 772, 806, 824, 842, 855, + 883, 898, 912, 922, 944, 956, 966, 973, 277, 301, 319, 331, 383, + 410, 430, 443, 505, 535, 557, 571, 627, 653, 672, 684, 733, 755, + 771, 781, 823, 841, 854, 862, 897, 911, 921, 927, 955, 965, 972, + 976, 337, 345, 360, 382, 450, 460, 478, 504, 578, 587, 603, 626, + 690, 698, 712, 732, 786, 793, 805, 822, 866, 872, 882, 896, 930, + 935, 943, 954, 978, 982, 988, 996, 344, 359, 381, 409, 459, 477, + 503, 534, 586, 602, 625, 652, 697, 711, 731, 754, 792, 804, 821, + 840, 871, 881, 895, 910, 934, 942, 953, 964, 981, 987, 995, 1002, + 358, 380, 408, 429, 476, 502, 533, 556, 601, 624, 651, 671, 710, + 730, 753, 770, 803, 820, 839, 853, 880, 894, 909, 920, 941, 952, + 963, 971, 986, 994, 1001, 1006, 379, 407, 428, 442, 501, 532, 555, + 570, 623, 650, 670, 683, 729, 752, 769, 780, 819, 838, 852, 861, + 893, 908, 919, 926, 951, 962, 970, 975, 993, 1000, 1005, 1008, 449, + 458, 475, 500, 577, 585, 600, 622, 689, 696, 709, 728, 785, 791, + 802, 818, 865, 870, 879, 892, 929, 933, 940, 950, 977, 980, 985, + 992, 1009, 1011, 1014, 1018, 457, 474, 499, 531, 584, 599, 621, 649, + 695, 708, 727, 751, 790, 801, 817, 837, 869, 878, 891, 907, 932, + 939, 949, 961, 979, 984, 991, 999, 1010, 1013, 1017, 1021, 473, 498, + 530, 554, 598, 620, 648, 669, 707, 726, 750, 768, 800, 816, 836, + 851, 877, 
890, 906, 918, 938, 948, 960, 969, 983, 990, 998, 1004, + 1012, 1016, 1020, 1023, 497, 529, 553, 569, 619, 647, 668, 682, 725, + 749, 767, 779, 815, 835, 850, 860, 889, 905, 917, 925, 947, 959, + 968, 974, 989, 997, 1003, 1007, 1015, 1019, 1022, 1024, }; const scan_order vp9_default_scan_orders[TX_SIZES] = { diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c index 8a8d2ad86..db3e74663 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libvpx/vp9/decoder/vp9_decodemv.c @@ -426,7 +426,9 @@ static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd, zero_mv_pair(mv); break; } - default: { return 0; } + default: { + return 0; + } } return ret; } @@ -755,7 +757,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, if (!assign_mv(cm, xd, b_mode, mi->bmi[j].as_mv, best_ref_mvs, best_sub8x8, is_compound, allow_hp, r)) { xd->corrupted |= 1; - break; + return; } if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j]; diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c index c2e6b3d54..3ed1bd6ff 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.c +++ b/libvpx/vp9/decoder/vp9_detokenize.c @@ -133,17 +133,18 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, int16_t dqv = dq[0]; const uint8_t *const cat6_prob = #if CONFIG_VP9_HIGHBITDEPTH - (xd->bd == VPX_BITS_12) - ? vp9_cat6_prob_high12 - : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 : + (xd->bd == VPX_BITS_12) ? vp9_cat6_prob_high12 + : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 + : #endif // CONFIG_VP9_HIGHBITDEPTH - vp9_cat6_prob; + vp9_cat6_prob; const int cat6_bits = #if CONFIG_VP9_HIGHBITDEPTH - (xd->bd == VPX_BITS_12) ? 18 - : (xd->bd == VPX_BITS_10) ? 16 : + (xd->bd == VPX_BITS_12) ? 18 + : (xd->bd == VPX_BITS_10) ? 16 + : #endif // CONFIG_VP9_HIGHBITDEPTH - 14; + 14; // Keep value, range, and count as locals. The compiler produces better // results with the locals than using r directly. 
BD_VALUE value = r->value; diff --git a/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c index a07a1608d..5961be5f3 100644 --- a/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -18,28 +18,30 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/fdct4x4_neon.h" +#include "vpx_dsp/arm/fdct8x8_neon.h" static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in, int stride) { - // { 0, 1, 1, 1, 1, 1, 1, 1 }; - const int16x8_t nonzero_bias_a = vextq_s16(vdupq_n_s16(0), vdupq_n_s16(1), 7); - // { 1, 0, 0, 0, 0, 0, 0, 0 }; - const int16x8_t nonzero_bias_b = vextq_s16(vdupq_n_s16(1), vdupq_n_s16(0), 7); - int16x8_t mask; + // { 0, 1, 1, 1 }; + const int16x4_t nonzero_bias_a = vext_s16(vdup_n_s16(0), vdup_n_s16(1), 3); + // { 1, 0, 0, 0 }; + const int16x4_t nonzero_bias_b = vext_s16(vdup_n_s16(1), vdup_n_s16(0), 3); + int16x4_t mask; int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4); int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4); int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4); int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4); - in[0] = vcombine_s16(input_0, input_1); - in[1] = vcombine_s16(input_2, input_3); - // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by // one non-zero first elements - mask = vreinterpretq_s16_u16(vceqq_s16(in[0], nonzero_bias_a)); - in[0] = vaddq_s16(in[0], mask); - in[0] = vaddq_s16(in[0], nonzero_bias_b); + mask = vreinterpret_s16_u16(vceq_s16(input_0, nonzero_bias_a)); + input_0 = vadd_s16(input_0, mask); + input_0 = vadd_s16(input_0, nonzero_bias_b); + + in[0] = vcombine_s16(input_0, input_1); + in[1] = vcombine_s16(input_2, input_3); } static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) { @@ -53,72 +55,54 @@ static INLINE void write_buffer_4x4(tran_low_t 
*output, int16x8_t *res) { } static INLINE void fadst4x4_neon(int16x8_t *in) { - int32x4_t u0, u1, u2, u3; - int16x4_t out_0, out_1, out_2, out_3; - const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); + int32x4_t u[4], t[4]; + int16x4_t s[4], out[4]; - const int16x4_t s0 = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 | - const int16x4_t s1 = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 | - const int16x4_t s2 = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 | - const int16x4_t s3 = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 | + s[0] = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 | + s[1] = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 | + s[2] = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 | + s[3] = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 | - // s0 * sinpi_1_9, s0 * sinpi_4_9 // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. - const int32x4_t s0s1_9 = vmull_n_s16(s0, sinpi_1_9); - const int32x4_t s0s4_9 = vmull_n_s16(s0, sinpi_4_9); - // s1 * sinpi_1_9, s1 * sinpi_2_9 - const int32x4_t s1s1_9 = vmull_n_s16(s1, sinpi_1_9); - const int32x4_t s1s2_9 = vmull_n_s16(s1, sinpi_2_9); - // s2 * sinpi_3_9 - const int32x4_t s2s3_9 = vmull_n_s16(s2, sinpi_3_9); - // s3 * sinpi_2_9, s3 * sinpi_4_9 - const int32x4_t s3s2_9 = vmull_n_s16(s3, sinpi_2_9); - const int32x4_t s3s4_9 = vmull_n_s16(s3, sinpi_4_9); - - // (s0 + s1) * sinpi_3_9 - const int32x4_t s0_p_s1 = vaddl_s16(s0, s1); - const int32x4_t s0_p_s1_m_s3 = vsubw_s16(s0_p_s1, s3); - - // s_0 * sinpi_1_9 + s_1 * sinpi_2_9 - // s_0 * sinpi_4_9 - s_1 * sinpi_1_9 - const int32x4_t s0s1_9_p_s1s2_9 = vaddq_s32(s0s1_9, s1s2_9); - const int32x4_t s0s4_9_m_s1s1_9 = vsubq_s32(s0s4_9, s1s1_9); - /* - * t0 = s0s1_9 + s1s2_9 + s3s4_9 - * t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9 - * t2 = s0s4_9 - s1s1_9 + s3s2_9 - * t3 = s2s3_9 - */ - const int32x4_t t0 = vaddq_s32(s0s1_9_p_s1s2_9, s3s4_9); - const int32x4_t t1 = 
vmulq_n_s32(s0_p_s1_m_s3, sinpi_3_9); - const int32x4_t t2 = vaddq_s32(s0s4_9_m_s1s1_9, s3s2_9); - const int32x4_t t3 = s2s3_9; + // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9 + t[0] = vmull_n_s16(s[0], sinpi_1_9); + t[0] = vmlal_n_s16(t[0], s[1], sinpi_2_9); + t[0] = vmlal_n_s16(t[0], s[3], sinpi_4_9); + + // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9 + t[1] = vmull_n_s16(s[0], sinpi_3_9); + t[1] = vmlal_n_s16(t[1], s[1], sinpi_3_9); + t[1] = vmlsl_n_s16(t[1], s[3], sinpi_3_9); + + // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9 + t[2] = vmull_n_s16(s[0], sinpi_4_9); + t[2] = vmlsl_n_s16(t[2], s[1], sinpi_1_9); + t[2] = vmlal_n_s16(t[2], s[3], sinpi_2_9); + + // t3 = s2 * sinpi_3_9 + t[3] = vmull_n_s16(s[2], sinpi_3_9); + /* * u0 = t0 + t3 * u1 = t1 * u2 = t2 - t3 * u3 = t2 - t0 + t3 */ - u0 = vaddq_s32(t0, t3); - u1 = t1; - u2 = vsubq_s32(t2, t3); - u3 = vaddq_s32(vsubq_s32(t2, t0), t3); + u[0] = vaddq_s32(t[0], t[3]); + u[1] = t[1]; + u[2] = vsubq_s32(t[2], t[3]); + u[3] = vaddq_s32(vsubq_s32(t[2], t[0]), t[3]); // fdct_round_shift - u0 = vaddq_s32(u0, k__DCT_CONST_ROUNDING); - u1 = vaddq_s32(u1, k__DCT_CONST_ROUNDING); - u2 = vaddq_s32(u2, k__DCT_CONST_ROUNDING); - u3 = vaddq_s32(u3, k__DCT_CONST_ROUNDING); - - out_0 = vshrn_n_s32(u0, DCT_CONST_BITS); - out_1 = vshrn_n_s32(u1, DCT_CONST_BITS); - out_2 = vshrn_n_s32(u2, DCT_CONST_BITS); - out_3 = vshrn_n_s32(u3, DCT_CONST_BITS); + out[0] = vrshrn_n_s32(u[0], DCT_CONST_BITS); + out[1] = vrshrn_n_s32(u[1], DCT_CONST_BITS); + out[2] = vrshrn_n_s32(u[2], DCT_CONST_BITS); + out[3] = vrshrn_n_s32(u[3], DCT_CONST_BITS); - transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); - in[0] = vcombine_s16(out_0, out_1); - in[1] = vcombine_s16(out_2, out_3); + in[0] = vcombine_s16(out[0], out[1]); + in[1] = vcombine_s16(out[2], out[3]); } void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, @@ -130,12 +114,14 @@ void 
vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, case ADST_DCT: load_buffer_4x4(input, in, stride); fadst4x4_neon(in); - vpx_fdct4x4_pass1_neon((int16x4_t *)in); + // pass1 variant is not accurate enough + vpx_fdct4x4_pass2_neon((int16x4_t *)in); write_buffer_4x4(output, in); break; case DCT_ADST: load_buffer_4x4(input, in, stride); - vpx_fdct4x4_pass1_neon((int16x4_t *)in); + // pass1 variant is not accurate enough + vpx_fdct4x4_pass2_neon((int16x4_t *)in); fadst4x4_neon(in); write_buffer_4x4(output, in); break; @@ -235,245 +221,158 @@ static INLINE void write_buffer_8x8(tran_low_t *output, int16x8_t *res, } static INLINE void fadst8x8_neon(int16x8_t *in) { - int16x4_t x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi, x4_lo, - x4_hi, x5_lo, x5_hi, x6_lo, x6_hi, x7_lo, x7_hi; - int32x4_t s0_lo, s0_hi, s1_lo, s1_hi, s2_lo, s2_hi, s3_lo, s3_hi, s4_lo, - s4_hi, s5_lo, s5_hi, s6_lo, s6_hi, s7_lo, s7_hi; - int32x4_t t0_lo, t0_hi, t1_lo, t1_hi, t2_lo, t2_hi, t3_lo, t3_hi, t4_lo, - t4_hi, t5_lo, t5_hi, t6_lo, t6_hi, t7_lo, t7_hi; - const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); - - x0_lo = vget_low_s16(in[7]); - x0_hi = vget_high_s16(in[7]); - x1_lo = vget_low_s16(in[0]); - x1_hi = vget_high_s16(in[0]); - x2_lo = vget_low_s16(in[5]); - x2_hi = vget_high_s16(in[5]); - x3_lo = vget_low_s16(in[2]); - x3_hi = vget_high_s16(in[2]); - x4_lo = vget_low_s16(in[3]); - x4_hi = vget_high_s16(in[3]); - x5_lo = vget_low_s16(in[4]); - x5_hi = vget_high_s16(in[4]); - x6_lo = vget_low_s16(in[1]); - x6_hi = vget_high_s16(in[1]); - x7_lo = vget_low_s16(in[6]); - x7_hi = vget_high_s16(in[6]); + int16x4_t x_lo[8], x_hi[8]; + int32x4_t s_lo[8], s_hi[8]; + int32x4_t t_lo[8], t_hi[8]; + + x_lo[0] = vget_low_s16(in[7]); + x_hi[0] = vget_high_s16(in[7]); + x_lo[1] = vget_low_s16(in[0]); + x_hi[1] = vget_high_s16(in[0]); + x_lo[2] = vget_low_s16(in[5]); + x_hi[2] = vget_high_s16(in[5]); + x_lo[3] = vget_low_s16(in[2]); + x_hi[3] = 
vget_high_s16(in[2]); + x_lo[4] = vget_low_s16(in[3]); + x_hi[4] = vget_high_s16(in[3]); + x_lo[5] = vget_low_s16(in[4]); + x_hi[5] = vget_high_s16(in[4]); + x_lo[6] = vget_low_s16(in[1]); + x_hi[6] = vget_high_s16(in[1]); + x_lo[7] = vget_low_s16(in[6]); + x_hi[7] = vget_high_s16(in[6]); // stage 1 // s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s0_lo = vaddq_s32(vmull_n_s16(x0_lo, cospi_2_64), - vmull_n_s16(x1_lo, cospi_30_64)); - s0_hi = vaddq_s32(vmull_n_s16(x0_hi, cospi_2_64), - vmull_n_s16(x1_hi, cospi_30_64)); // s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s1_lo = vsubq_s32(vmull_n_s16(x0_lo, cospi_30_64), - vmull_n_s16(x1_lo, cospi_2_64)); - s1_hi = vsubq_s32(vmull_n_s16(x0_hi, cospi_30_64), - vmull_n_s16(x1_hi, cospi_2_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1], + cospi_2_64, cospi_30_64, &s_lo[0], + &s_hi[0], &s_lo[1], &s_hi[1]); + // s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s2_lo = vaddq_s32(vmull_n_s16(x2_lo, cospi_10_64), - vmull_n_s16(x3_lo, cospi_22_64)); - s2_hi = vaddq_s32(vmull_n_s16(x2_hi, cospi_10_64), - vmull_n_s16(x3_hi, cospi_22_64)); // s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s3_lo = vsubq_s32(vmull_n_s16(x2_lo, cospi_22_64), - vmull_n_s16(x3_lo, cospi_10_64)); - s3_hi = vsubq_s32(vmull_n_s16(x2_hi, cospi_22_64), - vmull_n_s16(x3_hi, cospi_10_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3], + cospi_10_64, cospi_22_64, &s_lo[2], + &s_hi[2], &s_lo[3], &s_hi[3]); + // s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s4_lo = vaddq_s32(vmull_n_s16(x4_lo, cospi_18_64), - vmull_n_s16(x5_lo, cospi_14_64)); - s4_hi = vaddq_s32(vmull_n_s16(x4_hi, cospi_18_64), - vmull_n_s16(x5_hi, cospi_14_64)); // s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s5_lo = vsubq_s32(vmull_n_s16(x4_lo, cospi_14_64), - vmull_n_s16(x5_lo, cospi_18_64)); - s5_hi = vsubq_s32(vmull_n_s16(x4_hi, cospi_14_64), - vmull_n_s16(x5_hi, cospi_18_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5], 
+ cospi_18_64, cospi_14_64, &s_lo[4], + &s_hi[4], &s_lo[5], &s_hi[5]); + // s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s6_lo = vaddq_s32(vmull_n_s16(x6_lo, cospi_26_64), - vmull_n_s16(x7_lo, cospi_6_64)); - s6_hi = vaddq_s32(vmull_n_s16(x6_hi, cospi_26_64), - vmull_n_s16(x7_hi, cospi_6_64)); // s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - s7_lo = vsubq_s32(vmull_n_s16(x6_lo, cospi_6_64), - vmull_n_s16(x7_lo, cospi_26_64)); - s7_hi = vsubq_s32(vmull_n_s16(x6_hi, cospi_6_64), - vmull_n_s16(x7_hi, cospi_26_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7], + cospi_26_64, cospi_6_64, &s_lo[6], + &s_hi[6], &s_lo[7], &s_hi[7]); // fdct_round_shift - t0_lo = vaddq_s32(s0_lo, s4_lo); - t0_hi = vaddq_s32(s0_hi, s4_hi); - t1_lo = vaddq_s32(s1_lo, s5_lo); - t1_hi = vaddq_s32(s1_hi, s5_hi); - t2_lo = vaddq_s32(s2_lo, s6_lo); - t2_hi = vaddq_s32(s2_hi, s6_hi); - t3_lo = vaddq_s32(s3_lo, s7_lo); - t3_hi = vaddq_s32(s3_hi, s7_hi); - t4_lo = vsubq_s32(s0_lo, s4_lo); - t4_hi = vsubq_s32(s0_hi, s4_hi); - t5_lo = vsubq_s32(s1_lo, s5_lo); - t5_hi = vsubq_s32(s1_hi, s5_hi); - t6_lo = vsubq_s32(s2_lo, s6_lo); - t6_hi = vsubq_s32(s2_hi, s6_hi); - t7_lo = vsubq_s32(s3_lo, s7_lo); - t7_hi = vsubq_s32(s3_hi, s7_hi); - - t0_lo = vaddq_s32(t0_lo, k__DCT_CONST_ROUNDING); - t0_hi = vaddq_s32(t0_hi, k__DCT_CONST_ROUNDING); - t1_lo = vaddq_s32(t1_lo, k__DCT_CONST_ROUNDING); - t1_hi = vaddq_s32(t1_hi, k__DCT_CONST_ROUNDING); - t2_lo = vaddq_s32(t2_lo, k__DCT_CONST_ROUNDING); - t2_hi = vaddq_s32(t2_hi, k__DCT_CONST_ROUNDING); - t3_lo = vaddq_s32(t3_lo, k__DCT_CONST_ROUNDING); - t3_hi = vaddq_s32(t3_hi, k__DCT_CONST_ROUNDING); - t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING); - t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING); - t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING); - t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING); - t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING); - t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING); - t7_lo = vaddq_s32(t7_lo, 
k__DCT_CONST_ROUNDING); - t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING); - - t0_lo = vshrq_n_s32(t0_lo, DCT_CONST_BITS); - t0_hi = vshrq_n_s32(t0_hi, DCT_CONST_BITS); - t1_lo = vshrq_n_s32(t1_lo, DCT_CONST_BITS); - t1_hi = vshrq_n_s32(t1_hi, DCT_CONST_BITS); - t2_lo = vshrq_n_s32(t2_lo, DCT_CONST_BITS); - t2_hi = vshrq_n_s32(t2_hi, DCT_CONST_BITS); - t3_lo = vshrq_n_s32(t3_lo, DCT_CONST_BITS); - t3_hi = vshrq_n_s32(t3_hi, DCT_CONST_BITS); - t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS); - t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS); - t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS); - t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS); - t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS); - t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS); - t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS); - t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS); + t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS); + t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS); + t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS); + t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS); + t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS); + t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS); + t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS); + t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS); + t_lo[4] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS); // stage 2 - s0_lo = t0_lo; - s0_hi = t0_hi; - 
s1_lo = t1_lo; - s1_hi = t1_hi; - s2_lo = t2_lo; - s2_hi = t2_hi; - s3_lo = t3_lo; - s3_hi = t3_hi; - s4_lo = vaddq_s32(vmulq_n_s32(t4_lo, cospi_8_64), - vmulq_n_s32(t5_lo, cospi_24_64)); - s4_hi = vaddq_s32(vmulq_n_s32(t4_hi, cospi_8_64), - vmulq_n_s32(t5_hi, cospi_24_64)); - s5_lo = vsubq_s32(vmulq_n_s32(t4_lo, cospi_24_64), - vmulq_n_s32(t5_lo, cospi_8_64)); - s5_hi = vsubq_s32(vmulq_n_s32(t4_hi, cospi_24_64), - vmulq_n_s32(t5_hi, cospi_8_64)); - s6_lo = vaddq_s32(vmulq_n_s32(t6_lo, -cospi_24_64), - vmulq_n_s32(t7_lo, cospi_8_64)); - s6_hi = vaddq_s32(vmulq_n_s32(t6_hi, -cospi_24_64), - vmulq_n_s32(t7_hi, cospi_8_64)); - s7_lo = vaddq_s32(vmulq_n_s32(t6_lo, cospi_8_64), - vmulq_n_s32(t7_lo, cospi_24_64)); - s7_hi = vaddq_s32(vmulq_n_s32(t6_hi, cospi_8_64), - vmulq_n_s32(t7_hi, cospi_24_64)); + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + // s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5], + cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4], + &s_lo[5], &s_hi[5]); + + // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + // s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + butterfly_two_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7], + -cospi_24_64, cospi_8_64, &s_lo[6], &s_hi[6], + &s_lo[7], &s_hi[7]); + // fdct_round_shift // s0 + s2 - t0_lo = vaddq_s32(s0_lo, s2_lo); - t0_hi = vaddq_s32(s0_hi, s2_hi); + t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]); + t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]); // s1 + s3 - t1_lo = vaddq_s32(s1_lo, s3_lo); - t1_hi = vaddq_s32(s1_hi, s3_hi); + t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]); + t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]); // s0 - s2 - t2_lo = vsubq_s32(s0_lo, s2_lo); - t2_hi = vsubq_s32(s0_hi, s2_hi); + t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]); + t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]); // s1 - s3 - t3_lo = vsubq_s32(s1_lo, s3_lo); 
- t3_hi = vsubq_s32(s1_hi, s3_hi); + t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]); + t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]); // s4 + s6 - t4_lo = vaddq_s32(s4_lo, s6_lo); - t4_hi = vaddq_s32(s4_hi, s6_hi); + t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS); // s5 + s7 - t5_lo = vaddq_s32(s5_lo, s7_lo); - t5_hi = vaddq_s32(s5_hi, s7_hi); + t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS); // s4 - s6 - t6_lo = vsubq_s32(s4_lo, s6_lo); - t6_hi = vsubq_s32(s4_hi, s6_hi); + t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS); // s5 - s7 - t7_lo = vsubq_s32(s5_lo, s7_lo); - t7_hi = vsubq_s32(s5_hi, s7_hi); - - // fdct_round_shift - t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING); - t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING); - t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING); - t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING); - t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING); - t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING); - t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING); - t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING); - t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS); - t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS); - t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS); - t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS); - t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS); - t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS); - t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS); - t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS); // stage 3 // cospi_16_64 * (x2 + x3) - s2_lo = vmulq_n_s32(vaddq_s32(t2_lo, t3_lo), cospi_16_64); - s2_hi = vmulq_n_s32(vaddq_s32(t2_hi, t3_hi), cospi_16_64); // cospi_16_64 * (x2 - x3) - 
s3_lo = vmulq_n_s32(vsubq_s32(t2_lo, t3_lo), cospi_16_64); - s3_hi = vmulq_n_s32(vsubq_s32(t2_hi, t3_hi), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[2], t_hi[2], t_lo[3], t_hi[3], + cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3], + &s_hi[3]); + // cospi_16_64 * (x6 + x7) - s6_lo = vmulq_n_s32(vaddq_s32(t6_lo, t7_lo), cospi_16_64); - s6_hi = vmulq_n_s32(vaddq_s32(t6_hi, t7_hi), cospi_16_64); // cospi_16_64 * (x2 - x3) - s7_lo = vmulq_n_s32(vsubq_s32(t6_lo, t7_lo), cospi_16_64); - s7_hi = vmulq_n_s32(vsubq_s32(t6_hi, t7_hi), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7], + cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7], + &s_hi[7]); // final fdct_round_shift - t2_lo = vaddq_s32(s2_lo, k__DCT_CONST_ROUNDING); - t2_hi = vaddq_s32(s2_hi, k__DCT_CONST_ROUNDING); - t3_lo = vaddq_s32(s3_lo, k__DCT_CONST_ROUNDING); - t3_hi = vaddq_s32(s3_hi, k__DCT_CONST_ROUNDING); - t6_lo = vaddq_s32(s6_lo, k__DCT_CONST_ROUNDING); - t6_hi = vaddq_s32(s6_hi, k__DCT_CONST_ROUNDING); - t7_lo = vaddq_s32(s7_lo, k__DCT_CONST_ROUNDING); - t7_hi = vaddq_s32(s7_hi, k__DCT_CONST_ROUNDING); - - x2_lo = vshrn_n_s32(t2_lo, DCT_CONST_BITS); - x2_hi = vshrn_n_s32(t2_hi, DCT_CONST_BITS); - x3_lo = vshrn_n_s32(t3_lo, DCT_CONST_BITS); - x3_hi = vshrn_n_s32(t3_hi, DCT_CONST_BITS); - x6_lo = vshrn_n_s32(t6_lo, DCT_CONST_BITS); - x6_hi = vshrn_n_s32(t6_hi, DCT_CONST_BITS); - x7_lo = vshrn_n_s32(t7_lo, DCT_CONST_BITS); - x7_hi = vshrn_n_s32(t7_hi, DCT_CONST_BITS); + x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS); + x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS); + x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS); + x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS); + x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS); + x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS); + x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS); + x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS); // x0, x1, x4, x5 narrow down to 16-bits directly - x0_lo = vmovn_s32(t0_lo); - x0_hi = vmovn_s32(t0_hi); - 
x1_lo = vmovn_s32(t1_lo); - x1_hi = vmovn_s32(t1_hi); - x4_lo = vmovn_s32(t4_lo); - x4_hi = vmovn_s32(t4_hi); - x5_lo = vmovn_s32(t5_lo); - x5_hi = vmovn_s32(t5_hi); - - in[0] = vcombine_s16(x0_lo, x0_hi); - in[1] = vnegq_s16(vcombine_s16(x4_lo, x4_hi)); - in[2] = vcombine_s16(x6_lo, x6_hi); - in[3] = vnegq_s16(vcombine_s16(x2_lo, x2_hi)); - in[4] = vcombine_s16(x3_lo, x3_hi); - in[5] = vnegq_s16(vcombine_s16(x7_lo, x7_hi)); - in[6] = vcombine_s16(x5_lo, x5_hi); - in[7] = vnegq_s16(vcombine_s16(x1_lo, x1_hi)); + x_lo[0] = vmovn_s32(t_lo[0]); + x_hi[0] = vmovn_s32(t_hi[0]); + x_lo[1] = vmovn_s32(t_lo[1]); + x_hi[1] = vmovn_s32(t_hi[1]); + x_lo[4] = vmovn_s32(t_lo[4]); + x_hi[4] = vmovn_s32(t_hi[4]); + x_lo[5] = vmovn_s32(t_lo[5]); + x_hi[5] = vmovn_s32(t_hi[5]); + + in[0] = vcombine_s16(x_lo[0], x_hi[0]); + in[1] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4])); + in[2] = vcombine_s16(x_lo[6], x_hi[6]); + in[3] = vnegq_s16(vcombine_s16(x_lo[2], x_hi[2])); + in[4] = vcombine_s16(x_lo[3], x_hi[3]); + in[5] = vnegq_s16(vcombine_s16(x_lo[7], x_hi[7])); + in[6] = vcombine_s16(x_lo[5], x_hi[5]); + in[7] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1])); transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], &in[7]); @@ -488,13 +387,15 @@ void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, case ADST_DCT: load_buffer_8x8(input, in, stride); fadst8x8_neon(in); - vpx_fdct8x8_pass1_neon(in); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; case DCT_ADST: load_buffer_8x8(input, in, stride); - vpx_fdct8x8_pass1_neon(in); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(in); fadst8x8_neon(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); @@ -547,7 +448,6 @@ static void fdct16_8col(int16x8_t *in) { int16x8_t i[8], s1[8], s2[8], s3[8], t[8]; int16x4_t t_lo[8], t_hi[8]; int32x4_t u_lo[8], u_hi[8]; - const int32x4_t k__DCT_CONST_ROUNDING 
= vdupq_n_s32(DCT_CONST_ROUNDING); // stage 1 i[0] = vaddq_s16(in[0], in[15]); @@ -559,7 +459,8 @@ static void fdct16_8col(int16x8_t *in) { i[6] = vaddq_s16(in[6], in[9]); i[7] = vaddq_s16(in[7], in[8]); - vpx_fdct8x8_pass1_neon(i); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(i); transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]); // step 2 @@ -595,23 +496,14 @@ static void fdct16_8col(int16x8_t *in) { u_lo[5] = vmull_n_s16(t_lo[5], cospi_16_64); u_hi[5] = vmull_n_s16(t_hi[5], cospi_16_64); - u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); - u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); - u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING); - u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING); - u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING); - u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING); - u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); - u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); - - t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); - t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); - t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS); - t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS); - t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS); - t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS); - t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS); + t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS); + t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS); + t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); s2[2] = vcombine_s16(t_lo[2], t_hi[2]); s2[3] = vcombine_s16(t_lo[3], t_hi[3]); @@ -646,40 +538,26 @@ static void fdct16_8col(int16x8_t *in) { t_lo[7] = vget_low_s16(s3[7]); t_hi[7] = vget_high_s16(s3[7]); - 
u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_8_64), - vmull_n_s16(t_lo[6], cospi_24_64)); - u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_8_64), - vmull_n_s16(t_hi[6], cospi_24_64)); - u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_24_64), - vmull_n_s16(t_lo[5], cospi_8_64)); - u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_24_64), - vmull_n_s16(t_hi[5], cospi_8_64)); - u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_8_64), - vmull_n_s16(t_lo[5], -cospi_24_64)); - u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_8_64), - vmull_n_s16(t_hi[5], -cospi_24_64)); - u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_24_64), - vmull_n_s16(t_lo[6], cospi_8_64)); - u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_24_64), - vmull_n_s16(t_hi[6], cospi_8_64)); - - u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING); - u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING); - u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); - u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); - u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); - u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); - u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING); - u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING); - - t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS); - t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS); - t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); - t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); - t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); - t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS); - t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS); + // u[1] = -cospi_8_64 * t[1] + cospi_24_64 * t[6] + // u[6] = cospi_24_64 * t[1] + cospi_8_64 * t[6] + butterfly_two_coeff_s16_s32_noround(t_lo[1], t_hi[1], t_lo[6], t_hi[6], + -cospi_8_64, cospi_24_64, &u_lo[1], + &u_hi[1], &u_lo[6], &u_hi[6]); + + // u[5] = -cospi_24_64 * t[5] + cospi_8_64 * t[2] + // u[2] = cospi_8_64 * t[5] + cospi_24_64 * t[2] + butterfly_two_coeff_s16_s32_noround(t_lo[5], 
t_hi[5], t_lo[2], t_hi[2], + -cospi_24_64, cospi_8_64, &u_lo[5], + &u_hi[5], &u_lo[2], &u_hi[2]); + + t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS); + t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS); + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS); s2[1] = vcombine_s16(t_lo[1], t_hi[1]); s2[2] = vcombine_s16(t_lo[2], t_hi[2]); @@ -714,88 +592,47 @@ static void fdct16_8col(int16x8_t *in) { t_lo[7] = vget_low_s16(s1[7]); t_hi[7] = vget_high_s16(s1[7]); - // step1[0] * cospi_30_64 + step1[7] * cospi_2_64; - u_lo[0] = vaddq_s32(vmull_n_s16(t_lo[0], cospi_30_64), - vmull_n_s16(t_lo[7], cospi_2_64)); - u_hi[0] = vaddq_s32(vmull_n_s16(t_hi[0], cospi_30_64), - vmull_n_s16(t_hi[7], cospi_2_64)); - - // step1[1] * cospi_14_64 + step1[6] * cospi_18_64; - u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_14_64), - vmull_n_s16(t_lo[6], cospi_18_64)); - u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_14_64), - vmull_n_s16(t_hi[6], cospi_18_64)); - - // step1[2] * cospi_22_64 + step1[5] * cospi_10_64; - u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_22_64), - vmull_n_s16(t_lo[5], cospi_10_64)); - u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_22_64), - vmull_n_s16(t_hi[5], cospi_10_64)); - - // step1[3] * cospi_6_64 + step1[4] * cospi_26_64; - u_lo[3] = vaddq_s32(vmull_n_s16(t_lo[3], cospi_6_64), - vmull_n_s16(t_lo[4], cospi_26_64)); - u_hi[3] = vaddq_s32(vmull_n_s16(t_hi[3], cospi_6_64), - vmull_n_s16(t_hi[4], cospi_26_64)); - - // step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; - u_lo[4] = vaddq_s32(vmull_n_s16(t_lo[3], -cospi_26_64), - vmull_n_s16(t_lo[4], cospi_6_64)); - u_hi[4] = vaddq_s32(vmull_n_s16(t_hi[3], -cospi_26_64), - vmull_n_s16(t_hi[4], cospi_6_64)); - - // step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; - 
u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], -cospi_10_64), - vmull_n_s16(t_lo[5], cospi_22_64)); - u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], -cospi_10_64), - vmull_n_s16(t_hi[5], cospi_22_64)); - - // step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; - u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_18_64), - vmull_n_s16(t_lo[6], cospi_14_64)); - u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_18_64), - vmull_n_s16(t_hi[6], cospi_14_64)); - - // step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; - u_lo[7] = vaddq_s32(vmull_n_s16(t_lo[0], -cospi_2_64), - vmull_n_s16(t_lo[7], cospi_30_64)); - u_hi[7] = vaddq_s32(vmull_n_s16(t_hi[0], -cospi_2_64), - vmull_n_s16(t_hi[7], cospi_30_64)); + // u[0] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64 + // u[7] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64 + butterfly_two_coeff_s16_s32_noround(t_lo[7], t_hi[7], t_lo[0], t_hi[0], + cospi_2_64, cospi_30_64, &u_lo[0], + &u_hi[0], &u_lo[7], &u_hi[7]); + + // u[1] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64 + // u[6] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64 + butterfly_two_coeff_s16_s32_noround(t_lo[6], t_hi[6], t_lo[1], t_hi[1], + cospi_18_64, cospi_14_64, &u_lo[1], + &u_hi[1], &u_lo[6], &u_hi[6]); + + // u[2] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64 + // u[5] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64 + butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2], + cospi_10_64, cospi_22_64, &u_lo[2], + &u_hi[2], &u_lo[5], &u_hi[5]); + + // u[3] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64 + // u[4] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64 + butterfly_two_coeff_s16_s32_noround(t_lo[4], t_hi[4], t_lo[3], t_hi[3], + cospi_26_64, cospi_6_64, &u_lo[3], + &u_hi[3], &u_lo[4], &u_hi[4]); // final fdct_round_shift - u_lo[0] = vaddq_s32(u_lo[0], k__DCT_CONST_ROUNDING); - u_hi[0] = vaddq_s32(u_hi[0], k__DCT_CONST_ROUNDING); - u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING); - u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING); - 
u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING); - u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING); - u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING); - u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING); - u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING); - u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING); - u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING); - u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING); - u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING); - u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING); - u_lo[7] = vaddq_s32(u_lo[7], k__DCT_CONST_ROUNDING); - u_hi[7] = vaddq_s32(u_hi[7], k__DCT_CONST_ROUNDING); - - t_lo[0] = vshrn_n_s32(u_lo[0], DCT_CONST_BITS); - t_hi[0] = vshrn_n_s32(u_hi[0], DCT_CONST_BITS); - t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS); - t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS); - t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS); - t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS); - t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS); - t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS); - t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS); - t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS); - t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS); - t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS); - t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS); - t_lo[7] = vshrn_n_s32(u_lo[7], DCT_CONST_BITS); - t_hi[7] = vshrn_n_s32(u_hi[7], DCT_CONST_BITS); + t_lo[0] = vrshrn_n_s32(u_lo[0], DCT_CONST_BITS); + t_hi[0] = vrshrn_n_s32(u_hi[0], DCT_CONST_BITS); + t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS); + t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS); + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS); + t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS); + t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS); + t_lo[5] = 
vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS); + t_lo[7] = vrshrn_n_s32(u_lo[7], DCT_CONST_BITS); + t_hi[7] = vrshrn_n_s32(u_hi[7], DCT_CONST_BITS); in[0] = i[0]; in[2] = i[1]; @@ -820,7 +657,6 @@ static void fadst16_8col(int16x8_t *in) { int16x4_t x_lo[16], x_hi[16]; int32x4_t s_lo[16], s_hi[16]; int32x4_t t_lo[16], t_hi[16]; - const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING); x_lo[0] = vget_low_s16(in[15]); x_hi[0] = vget_high_s16(in[15]); @@ -857,185 +693,79 @@ static void fadst16_8col(int16x8_t *in) { // stage 1 // s0 = cospi_1_64 * x0 + cospi_31_64 * x1; - s_lo[0] = vaddq_s32(vmull_n_s16(x_lo[0], cospi_1_64), - vmull_n_s16(x_lo[1], cospi_31_64)); - s_hi[0] = vaddq_s32(vmull_n_s16(x_hi[0], cospi_1_64), - vmull_n_s16(x_hi[1], cospi_31_64)); // s1 = cospi_31_64 * x0 - cospi_1_64 * x1; - s_lo[1] = vsubq_s32(vmull_n_s16(x_lo[0], cospi_31_64), - vmull_n_s16(x_lo[1], cospi_1_64)); - s_hi[1] = vsubq_s32(vmull_n_s16(x_hi[0], cospi_31_64), - vmull_n_s16(x_hi[1], cospi_1_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1], + cospi_1_64, cospi_31_64, &s_lo[0], + &s_hi[0], &s_lo[1], &s_hi[1]); // s2 = cospi_5_64 * x2 + cospi_27_64 * x3; - s_lo[2] = vaddq_s32(vmull_n_s16(x_lo[2], cospi_5_64), - vmull_n_s16(x_lo[3], cospi_27_64)); - s_hi[2] = vaddq_s32(vmull_n_s16(x_hi[2], cospi_5_64), - vmull_n_s16(x_hi[3], cospi_27_64)); // s3 = cospi_27_64 * x2 - cospi_5_64 * x3; - s_lo[3] = vsubq_s32(vmull_n_s16(x_lo[2], cospi_27_64), - vmull_n_s16(x_lo[3], cospi_5_64)); - s_hi[3] = vsubq_s32(vmull_n_s16(x_hi[2], cospi_27_64), - vmull_n_s16(x_hi[3], cospi_5_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3], + cospi_5_64, cospi_27_64, &s_lo[2], + &s_hi[2], &s_lo[3], &s_hi[3]); // s4 = cospi_9_64 * x4 + cospi_23_64 * x5; - s_lo[4] = vaddq_s32(vmull_n_s16(x_lo[4], 
cospi_9_64), - vmull_n_s16(x_lo[5], cospi_23_64)); - s_hi[4] = vaddq_s32(vmull_n_s16(x_hi[4], cospi_9_64), - vmull_n_s16(x_hi[5], cospi_23_64)); // s5 = cospi_23_64 * x4 - cospi_9_64 * x5; - s_lo[5] = vsubq_s32(vmull_n_s16(x_lo[4], cospi_23_64), - vmull_n_s16(x_lo[5], cospi_9_64)); - s_hi[5] = vsubq_s32(vmull_n_s16(x_hi[4], cospi_23_64), - vmull_n_s16(x_hi[5], cospi_9_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5], + cospi_9_64, cospi_23_64, &s_lo[4], + &s_hi[4], &s_lo[5], &s_hi[5]); // s6 = cospi_13_64 * x6 + cospi_19_64 * x7; - s_lo[6] = vaddq_s32(vmull_n_s16(x_lo[6], cospi_13_64), - vmull_n_s16(x_lo[7], cospi_19_64)); - s_hi[6] = vaddq_s32(vmull_n_s16(x_hi[6], cospi_13_64), - vmull_n_s16(x_hi[7], cospi_19_64)); // s7 = cospi_19_64 * x6 - cospi_13_64 * x7; - s_lo[7] = vsubq_s32(vmull_n_s16(x_lo[6], cospi_19_64), - vmull_n_s16(x_lo[7], cospi_13_64)); - s_hi[7] = vsubq_s32(vmull_n_s16(x_hi[6], cospi_19_64), - vmull_n_s16(x_hi[7], cospi_13_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7], + cospi_13_64, cospi_19_64, &s_lo[6], + &s_hi[6], &s_lo[7], &s_hi[7]); // s8 = cospi_17_64 * x8 + cospi_15_64 * x9; - s_lo[8] = vaddq_s32(vmull_n_s16(x_lo[8], cospi_17_64), - vmull_n_s16(x_lo[9], cospi_15_64)); - s_hi[8] = vaddq_s32(vmull_n_s16(x_hi[8], cospi_17_64), - vmull_n_s16(x_hi[9], cospi_15_64)); // s9 = cospi_15_64 * x8 - cospi_17_64 * x9; - s_lo[9] = vsubq_s32(vmull_n_s16(x_lo[8], cospi_15_64), - vmull_n_s16(x_lo[9], cospi_17_64)); - s_hi[9] = vsubq_s32(vmull_n_s16(x_hi[8], cospi_15_64), - vmull_n_s16(x_hi[9], cospi_17_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[8], x_hi[8], x_lo[9], x_hi[9], + cospi_17_64, cospi_15_64, &s_lo[8], + &s_hi[8], &s_lo[9], &s_hi[9]); // s10 = cospi_21_64 * x10 + cospi_11_64 * x11; - s_lo[10] = vaddq_s32(vmull_n_s16(x_lo[10], cospi_21_64), - vmull_n_s16(x_lo[11], cospi_11_64)); - s_hi[10] = vaddq_s32(vmull_n_s16(x_hi[10], cospi_21_64), - vmull_n_s16(x_hi[11], cospi_11_64)); // 
s11 = cospi_11_64 * x10 - cospi_21_64 * x11; - s_lo[11] = vsubq_s32(vmull_n_s16(x_lo[10], cospi_11_64), - vmull_n_s16(x_lo[11], cospi_21_64)); - s_hi[11] = vsubq_s32(vmull_n_s16(x_hi[10], cospi_11_64), - vmull_n_s16(x_hi[11], cospi_21_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[10], x_hi[10], x_lo[11], x_hi[11], + cospi_21_64, cospi_11_64, &s_lo[10], + &s_hi[10], &s_lo[11], &s_hi[11]); // s12 = cospi_25_64 * x12 + cospi_7_64 * x13; - s_lo[12] = vaddq_s32(vmull_n_s16(x_lo[12], cospi_25_64), - vmull_n_s16(x_lo[13], cospi_7_64)); - s_hi[12] = vaddq_s32(vmull_n_s16(x_hi[12], cospi_25_64), - vmull_n_s16(x_hi[13], cospi_7_64)); // s13 = cospi_7_64 * x12 - cospi_25_64 * x13; - s_lo[13] = vsubq_s32(vmull_n_s16(x_lo[12], cospi_7_64), - vmull_n_s16(x_lo[13], cospi_25_64)); - s_hi[13] = vsubq_s32(vmull_n_s16(x_hi[12], cospi_7_64), - vmull_n_s16(x_hi[13], cospi_25_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[12], x_hi[12], x_lo[13], x_hi[13], + cospi_25_64, cospi_7_64, &s_lo[12], + &s_hi[12], &s_lo[13], &s_hi[13]); // s14 = cospi_29_64 * x14 + cospi_3_64 * x15; - s_lo[14] = vaddq_s32(vmull_n_s16(x_lo[14], cospi_29_64), - vmull_n_s16(x_lo[15], cospi_3_64)); - s_hi[14] = vaddq_s32(vmull_n_s16(x_hi[14], cospi_29_64), - vmull_n_s16(x_hi[15], cospi_3_64)); // s15 = cospi_3_64 * x14 - cospi_29_64 * x15; - s_lo[15] = vsubq_s32(vmull_n_s16(x_lo[14], cospi_3_64), - vmull_n_s16(x_lo[15], cospi_29_64)); - s_hi[15] = vsubq_s32(vmull_n_s16(x_hi[14], cospi_3_64), - vmull_n_s16(x_hi[15], cospi_29_64)); + butterfly_two_coeff_s16_s32_noround(x_lo[14], x_hi[14], x_lo[15], x_hi[15], + cospi_29_64, cospi_3_64, &s_lo[14], + &s_hi[14], &s_lo[15], &s_hi[15]); // fdct_round_shift - t_lo[0] = vaddq_s32(s_lo[0], s_lo[8]); - t_hi[0] = vaddq_s32(s_hi[0], s_hi[8]); - t_lo[1] = vaddq_s32(s_lo[1], s_lo[9]); - t_hi[1] = vaddq_s32(s_hi[1], s_hi[9]); - t_lo[2] = vaddq_s32(s_lo[2], s_lo[10]); - t_hi[2] = vaddq_s32(s_hi[2], s_hi[10]); - t_lo[3] = vaddq_s32(s_lo[3], s_lo[11]); - t_hi[3] = 
vaddq_s32(s_hi[3], s_hi[11]); - t_lo[4] = vaddq_s32(s_lo[4], s_lo[12]); - t_hi[4] = vaddq_s32(s_hi[4], s_hi[12]); - t_lo[5] = vaddq_s32(s_lo[5], s_lo[13]); - t_hi[5] = vaddq_s32(s_hi[5], s_hi[13]); - t_lo[6] = vaddq_s32(s_lo[6], s_lo[14]); - t_hi[6] = vaddq_s32(s_hi[6], s_hi[14]); - t_lo[7] = vaddq_s32(s_lo[7], s_lo[15]); - t_hi[7] = vaddq_s32(s_hi[7], s_hi[15]); - t_lo[8] = vsubq_s32(s_lo[0], s_lo[8]); - t_hi[8] = vsubq_s32(s_hi[0], s_hi[8]); - t_lo[9] = vsubq_s32(s_lo[1], s_lo[9]); - t_hi[9] = vsubq_s32(s_hi[1], s_hi[9]); - t_lo[10] = vsubq_s32(s_lo[2], s_lo[10]); - t_hi[10] = vsubq_s32(s_hi[2], s_hi[10]); - t_lo[11] = vsubq_s32(s_lo[3], s_lo[11]); - t_hi[11] = vsubq_s32(s_hi[3], s_hi[11]); - t_lo[12] = vsubq_s32(s_lo[4], s_lo[12]); - t_hi[12] = vsubq_s32(s_hi[4], s_hi[12]); - t_lo[13] = vsubq_s32(s_lo[5], s_lo[13]); - t_hi[13] = vsubq_s32(s_hi[5], s_hi[13]); - t_lo[14] = vsubq_s32(s_lo[6], s_lo[14]); - t_hi[14] = vsubq_s32(s_hi[6], s_hi[14]); - t_lo[15] = vsubq_s32(s_lo[7], s_lo[15]); - t_hi[15] = vsubq_s32(s_hi[7], s_hi[15]); - - t_lo[0] = vaddq_s32(t_lo[0], k__DCT_CONST_ROUNDING); - t_hi[0] = vaddq_s32(t_hi[0], k__DCT_CONST_ROUNDING); - t_lo[1] = vaddq_s32(t_lo[1], k__DCT_CONST_ROUNDING); - t_hi[1] = vaddq_s32(t_hi[1], k__DCT_CONST_ROUNDING); - t_lo[2] = vaddq_s32(t_lo[2], k__DCT_CONST_ROUNDING); - t_hi[2] = vaddq_s32(t_hi[2], k__DCT_CONST_ROUNDING); - t_lo[3] = vaddq_s32(t_lo[3], k__DCT_CONST_ROUNDING); - t_hi[3] = vaddq_s32(t_hi[3], k__DCT_CONST_ROUNDING); - t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING); - t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING); - t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING); - t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING); - t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING); - t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING); - t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING); - t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING); - t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING); - t_hi[8] = 
vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING); - t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING); - t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING); - t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING); - t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING); - t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING); - t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING); - t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING); - t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING); - t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING); - t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING); - t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING); - t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING); - t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING); - t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING); - - t_lo[0] = vshrq_n_s32(t_lo[0], DCT_CONST_BITS); - t_hi[0] = vshrq_n_s32(t_hi[0], DCT_CONST_BITS); - t_lo[1] = vshrq_n_s32(t_lo[1], DCT_CONST_BITS); - t_hi[1] = vshrq_n_s32(t_hi[1], DCT_CONST_BITS); - t_lo[2] = vshrq_n_s32(t_lo[2], DCT_CONST_BITS); - t_hi[2] = vshrq_n_s32(t_hi[2], DCT_CONST_BITS); - t_lo[3] = vshrq_n_s32(t_lo[3], DCT_CONST_BITS); - t_hi[3] = vshrq_n_s32(t_hi[3], DCT_CONST_BITS); - t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS); - t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS); - t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS); - t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS); - t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS); - t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS); - t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS); - t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS); - t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS); - t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS); - t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS); - t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS); - t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS); - t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS); - 
t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS); - t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS); - t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS); - t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS); - t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS); - t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS); - t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS); - t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS); - t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS); + t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS); + t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS); + t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS); + t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS); + t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS); + t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS); + t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS); + t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS); + t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(vaddq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vaddq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(vaddq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(vaddq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS); + t_lo[8] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS); + t_hi[8] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS); + t_lo[9] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS); + t_hi[9] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS); + t_lo[10] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS); + t_hi[10] = 
vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS); + t_lo[11] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS); + t_hi[11] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(vsubq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(vsubq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(vsubq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(vsubq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS); // stage 2 s_lo[0] = t_lo[0]; @@ -1055,45 +785,25 @@ static void fadst16_8col(int16x8_t *in) { s_lo[7] = t_lo[7]; s_hi[7] = t_hi[7]; // s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s_lo[8] = vaddq_s32(vmulq_n_s32(t_lo[8], cospi_4_64), - vmulq_n_s32(t_lo[9], cospi_28_64)); - s_hi[8] = vaddq_s32(vmulq_n_s32(t_hi[8], cospi_4_64), - vmulq_n_s32(t_hi[9], cospi_28_64)); // s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s_lo[9] = vsubq_s32(vmulq_n_s32(t_lo[8], cospi_28_64), - vmulq_n_s32(t_lo[9], cospi_4_64)); - s_hi[9] = vsubq_s32(vmulq_n_s32(t_hi[8], cospi_28_64), - vmulq_n_s32(t_hi[9], cospi_4_64)); + butterfly_two_coeff_s32_noround(t_lo[8], t_hi[8], t_lo[9], t_hi[9], + cospi_4_64, cospi_28_64, &s_lo[8], &s_hi[8], + &s_lo[9], &s_hi[9]); // s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s_lo[10] = vaddq_s32(vmulq_n_s32(t_lo[10], cospi_20_64), - vmulq_n_s32(t_lo[11], cospi_12_64)); - s_hi[10] = vaddq_s32(vmulq_n_s32(t_hi[10], cospi_20_64), - vmulq_n_s32(t_hi[11], cospi_12_64)); // s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s_lo[11] = vsubq_s32(vmulq_n_s32(t_lo[10], cospi_12_64), - vmulq_n_s32(t_lo[11], cospi_20_64)); - s_hi[11] = vsubq_s32(vmulq_n_s32(t_hi[10], cospi_12_64), - vmulq_n_s32(t_hi[11], 
cospi_20_64)); + butterfly_two_coeff_s32_noround(t_lo[10], t_hi[10], t_lo[11], t_hi[11], + cospi_20_64, cospi_12_64, &s_lo[10], + &s_hi[10], &s_lo[11], &s_hi[11]); // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], -cospi_28_64), - vmulq_n_s32(t_lo[13], cospi_4_64)); - s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], -cospi_28_64), - vmulq_n_s32(t_hi[13], cospi_4_64)); // s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_4_64), - vmulq_n_s32(t_lo[13], cospi_28_64)); - s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_4_64), - vmulq_n_s32(t_hi[13], cospi_28_64)); + butterfly_two_coeff_s32_noround(t_lo[13], t_hi[13], t_lo[12], t_hi[12], + cospi_28_64, cospi_4_64, &s_lo[13], &s_hi[13], + &s_lo[12], &s_hi[12]); // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_12_64), - vmulq_n_s32(t_lo[15], cospi_20_64)); - s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_12_64), - vmulq_n_s32(t_hi[15], cospi_20_64)); // s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_20_64), - vmulq_n_s32(t_lo[15], cospi_12_64)); - s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_20_64), - vmulq_n_s32(t_hi[15], cospi_12_64)); + butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + cospi_12_64, cospi_20_64, &s_lo[15], + &s_hi[15], &s_lo[14], &s_hi[14]); // s0 + s4 t_lo[0] = vaddq_s32(s_lo[0], s_lo[4]); @@ -1144,38 +854,22 @@ static void fadst16_8col(int16x8_t *in) { t_lo[15] = vsubq_s32(s_lo[11], s_lo[15]); t_hi[15] = vsubq_s32(s_hi[11], s_hi[15]); - t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING); - t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING); - t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING); - t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING); - t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING); - t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING); - t_lo[11] = 
vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING); - t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING); - t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING); - t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING); - t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING); - t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING); - t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING); - t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING); - t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING); - t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING); - t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS); - t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS); - t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS); - t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS); - t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS); - t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS); - t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS); - t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS); - t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS); - t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS); - t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS); - t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS); - t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS); - t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS); - t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS); - t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS); + t_lo[8] = vrshrq_n_s32(t_lo[8], DCT_CONST_BITS); + t_hi[8] = vrshrq_n_s32(t_hi[8], DCT_CONST_BITS); + t_lo[9] = vrshrq_n_s32(t_lo[9], DCT_CONST_BITS); + t_hi[9] = vrshrq_n_s32(t_hi[9], DCT_CONST_BITS); + t_lo[10] = vrshrq_n_s32(t_lo[10], DCT_CONST_BITS); + t_hi[10] = vrshrq_n_s32(t_hi[10], DCT_CONST_BITS); + t_lo[11] = vrshrq_n_s32(t_lo[11], DCT_CONST_BITS); + t_hi[11] = vrshrq_n_s32(t_hi[11], DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(t_hi[13], 
DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS); // stage 3 s_lo[0] = t_lo[0]; @@ -1187,25 +881,15 @@ static void fadst16_8col(int16x8_t *in) { s_lo[3] = t_lo[3]; s_hi[3] = t_hi[3]; // s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s_lo[4] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_8_64), - vmulq_n_s32(t_lo[5], cospi_24_64)); - s_hi[4] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_8_64), - vmulq_n_s32(t_hi[5], cospi_24_64)); // s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s_lo[5] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_24_64), - vmulq_n_s32(t_lo[5], -cospi_8_64)); - s_hi[5] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_24_64), - vmulq_n_s32(t_hi[5], -cospi_8_64)); + butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5], + cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4], + &s_lo[5], &s_hi[5]); // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s_lo[6] = vaddq_s32(vmulq_n_s32(t_lo[6], -cospi_24_64), - vmulq_n_s32(t_lo[7], cospi_8_64)); - s_hi[6] = vaddq_s32(vmulq_n_s32(t_hi[6], -cospi_24_64), - vmulq_n_s32(t_hi[7], cospi_8_64)); // s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s_lo[7] = vaddq_s32(vmulq_n_s32(t_lo[6], cospi_8_64), - vmulq_n_s32(t_lo[7], cospi_24_64)); - s_hi[7] = vaddq_s32(vmulq_n_s32(t_hi[6], cospi_8_64), - vmulq_n_s32(t_hi[7], cospi_24_64)); + butterfly_two_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_24_64, cospi_8_64, &s_lo[7], &s_hi[7], + &s_lo[6], &s_hi[6]); s_lo[8] = t_lo[8]; s_hi[8] = t_hi[8]; s_lo[9] = t_lo[9]; @@ -1215,25 +899,15 @@ static void fadst16_8col(int16x8_t *in) { s_lo[11] = t_lo[11]; s_hi[11] = t_hi[11]; // s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_8_64), - vmulq_n_s32(t_lo[13], cospi_24_64)); - s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_8_64), - vmulq_n_s32(t_hi[13], cospi_24_64)); // s13 = x12 * 
cospi_24_64 - x13 * cospi_8_64; - s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_24_64), - vmulq_n_s32(t_lo[13], -cospi_8_64)); - s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_24_64), - vmulq_n_s32(t_hi[13], -cospi_8_64)); + butterfly_two_coeff_s32_noround(t_lo[12], t_hi[12], t_lo[13], t_hi[13], + cospi_8_64, cospi_24_64, &s_lo[12], &s_hi[12], + &s_lo[13], &s_hi[13]); // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_24_64), - vmulq_n_s32(t_lo[15], cospi_8_64)); - s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_24_64), - vmulq_n_s32(t_hi[15], cospi_8_64)); // s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_8_64), - vmulq_n_s32(t_lo[15], cospi_24_64)); - s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_8_64), - vmulq_n_s32(t_hi[15], cospi_24_64)); + butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + cospi_24_64, cospi_8_64, &s_lo[15], &s_hi[15], + &s_lo[14], &s_hi[14]); // s0 + s4 t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]); @@ -1284,99 +958,62 @@ static void fadst16_8col(int16x8_t *in) { t_lo[15] = vsubq_s32(s_lo[13], s_lo[15]); t_hi[15] = vsubq_s32(s_hi[13], s_hi[15]); - t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING); - t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING); - t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING); - t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING); - t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING); - t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING); - t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING); - t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING); - t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING); - t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING); - t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING); - t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING); - t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING); - t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING); - 
t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING); - t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING); - t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS); - t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS); - t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS); - t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS); - t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS); - t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS); - t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS); - t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS); - t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS); - t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS); - t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS); - t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS); - t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS); - t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS); - t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS); - t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS); + t_lo[4] = vrshrq_n_s32(t_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(t_hi[4], DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(t_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(t_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(t_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(t_hi[6], DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(t_lo[7], DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(t_hi[7], DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS); // stage 4 // s2 = (-cospi_16_64) * (x2 + x3); - s_lo[2] = vmulq_n_s32(vaddq_s32(t_lo[2], t_lo[3]), -cospi_16_64); - s_hi[2] = vmulq_n_s32(vaddq_s32(t_hi[2], t_hi[3]), -cospi_16_64); // s3 = cospi_16_64 * (x2 - x3); - s_lo[3] = 
vmulq_n_s32(vsubq_s32(t_lo[2], t_lo[3]), cospi_16_64); - s_hi[3] = vmulq_n_s32(vsubq_s32(t_hi[2], t_hi[3]), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[3], t_hi[3], t_lo[2], t_hi[2], + -cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3], + &s_hi[3]); // s6 = cospi_16_64 * (x6 + x7); - s_lo[6] = vmulq_n_s32(vaddq_s32(t_lo[6], t_lo[7]), cospi_16_64); - s_hi[6] = vmulq_n_s32(vaddq_s32(t_hi[6], t_hi[7]), cospi_16_64); // s7 = cospi_16_64 * (-x6 + x7); - s_lo[7] = vmulq_n_s32(vsubq_s32(t_lo[7], t_lo[6]), cospi_16_64); - s_hi[7] = vmulq_n_s32(vsubq_s32(t_hi[7], t_hi[6]), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7], + &s_hi[7]); // s10 = cospi_16_64 * (x10 + x11); - s_lo[10] = vmulq_n_s32(vaddq_s32(t_lo[10], t_lo[11]), cospi_16_64); - s_hi[10] = vmulq_n_s32(vaddq_s32(t_hi[10], t_hi[11]), cospi_16_64); // s11 = cospi_16_64 * (-x10 + x11); - s_lo[11] = vmulq_n_s32(vsubq_s32(t_lo[11], t_lo[10]), cospi_16_64); - s_hi[11] = vmulq_n_s32(vsubq_s32(t_hi[11], t_hi[10]), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[11], t_hi[11], t_lo[10], t_hi[10], + cospi_16_64, &s_lo[10], &s_hi[10], &s_lo[11], + &s_hi[11]); // s14 = (-cospi_16_64) * (x14 + x15); - s_lo[14] = vmulq_n_s32(vaddq_s32(t_lo[14], t_lo[15]), -cospi_16_64); - s_hi[14] = vmulq_n_s32(vaddq_s32(t_hi[14], t_hi[15]), -cospi_16_64); // s15 = cospi_16_64 * (x14 - x15); - s_lo[15] = vmulq_n_s32(vsubq_s32(t_lo[14], t_lo[15]), cospi_16_64); - s_hi[15] = vmulq_n_s32(vsubq_s32(t_hi[14], t_hi[15]), cospi_16_64); + butterfly_one_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + -cospi_16_64, &s_lo[14], &s_hi[14], &s_lo[15], + &s_hi[15]); // final fdct_round_shift - t_lo[2] = vaddq_s32(s_lo[2], k__DCT_CONST_ROUNDING); - t_hi[2] = vaddq_s32(s_hi[2], k__DCT_CONST_ROUNDING); - t_lo[3] = vaddq_s32(s_lo[3], k__DCT_CONST_ROUNDING); - t_hi[3] = vaddq_s32(s_hi[3], k__DCT_CONST_ROUNDING); - t_lo[6] = vaddq_s32(s_lo[6], k__DCT_CONST_ROUNDING); - 
t_hi[6] = vaddq_s32(s_hi[6], k__DCT_CONST_ROUNDING); - t_lo[7] = vaddq_s32(s_lo[7], k__DCT_CONST_ROUNDING); - t_hi[7] = vaddq_s32(s_hi[7], k__DCT_CONST_ROUNDING); - t_lo[10] = vaddq_s32(s_lo[10], k__DCT_CONST_ROUNDING); - t_hi[10] = vaddq_s32(s_hi[10], k__DCT_CONST_ROUNDING); - t_lo[11] = vaddq_s32(s_lo[11], k__DCT_CONST_ROUNDING); - t_hi[11] = vaddq_s32(s_hi[11], k__DCT_CONST_ROUNDING); - t_lo[14] = vaddq_s32(s_lo[14], k__DCT_CONST_ROUNDING); - t_hi[14] = vaddq_s32(s_hi[14], k__DCT_CONST_ROUNDING); - t_lo[15] = vaddq_s32(s_lo[15], k__DCT_CONST_ROUNDING); - t_hi[15] = vaddq_s32(s_hi[15], k__DCT_CONST_ROUNDING); - - x_lo[2] = vshrn_n_s32(t_lo[2], DCT_CONST_BITS); - x_hi[2] = vshrn_n_s32(t_hi[2], DCT_CONST_BITS); - x_lo[3] = vshrn_n_s32(t_lo[3], DCT_CONST_BITS); - x_hi[3] = vshrn_n_s32(t_hi[3], DCT_CONST_BITS); - x_lo[6] = vshrn_n_s32(t_lo[6], DCT_CONST_BITS); - x_hi[6] = vshrn_n_s32(t_hi[6], DCT_CONST_BITS); - x_lo[7] = vshrn_n_s32(t_lo[7], DCT_CONST_BITS); - x_hi[7] = vshrn_n_s32(t_hi[7], DCT_CONST_BITS); - x_lo[10] = vshrn_n_s32(t_lo[10], DCT_CONST_BITS); - x_hi[10] = vshrn_n_s32(t_hi[10], DCT_CONST_BITS); - x_lo[11] = vshrn_n_s32(t_lo[11], DCT_CONST_BITS); - x_hi[11] = vshrn_n_s32(t_hi[11], DCT_CONST_BITS); - x_lo[14] = vshrn_n_s32(t_lo[14], DCT_CONST_BITS); - x_hi[14] = vshrn_n_s32(t_hi[14], DCT_CONST_BITS); - x_lo[15] = vshrn_n_s32(t_lo[15], DCT_CONST_BITS); - x_hi[15] = vshrn_n_s32(t_hi[15], DCT_CONST_BITS); + x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS); + x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS); + x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS); + x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS); + x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS); + x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS); + x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS); + x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS); + x_lo[10] = vrshrn_n_s32(s_lo[10], DCT_CONST_BITS); + x_hi[10] = vrshrn_n_s32(s_hi[10], DCT_CONST_BITS); + x_lo[11] = vrshrn_n_s32(s_lo[11], 
DCT_CONST_BITS); + x_hi[11] = vrshrn_n_s32(s_hi[11], DCT_CONST_BITS); + x_lo[14] = vrshrn_n_s32(s_lo[14], DCT_CONST_BITS); + x_hi[14] = vrshrn_n_s32(s_hi[14], DCT_CONST_BITS); + x_lo[15] = vrshrn_n_s32(s_lo[15], DCT_CONST_BITS); + x_hi[15] = vrshrn_n_s32(s_hi[15], DCT_CONST_BITS); // x0, x1, x4, x5, x8, x9, x12, x13 narrow down to 16-bits directly x_lo[0] = vmovn_s32(t_lo[0]); @@ -1458,3 +1095,137 @@ void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, break; } } + +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void highbd_load_buffer_4x4(const int16_t *input, + int32x4_t *in /*[4]*/, int stride) { + // { 0, 1, 1, 1 }; + const int32x4_t nonzero_bias_a = vextq_s32(vdupq_n_s32(0), vdupq_n_s32(1), 3); + // { 1, 0, 0, 0 }; + const int32x4_t nonzero_bias_b = vextq_s32(vdupq_n_s32(1), vdupq_n_s32(0), 3); + int32x4_t mask; + + in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4); + + // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by + // one non-zero first elements + mask = vreinterpretq_s32_u32(vceqq_s32(in[0], nonzero_bias_a)); + in[0] = vaddq_s32(in[0], mask); + in[0] = vaddq_s32(in[0], nonzero_bias_b); +} + +static INLINE void highbd_write_buffer_4x4(tran_low_t *output, int32x4_t *res) { + const int32x4_t one = vdupq_n_s32(1); + res[0] = vshrq_n_s32(vaddq_s32(res[0], one), 2); + res[1] = vshrq_n_s32(vaddq_s32(res[1], one), 2); + res[2] = vshrq_n_s32(vaddq_s32(res[2], one), 2); + res[3] = vshrq_n_s32(vaddq_s32(res[3], one), 2); + vst1q_s32(output + 0 * 4, res[0]); + vst1q_s32(output + 1 * 4, res[1]); + vst1q_s32(output + 2 * 4, res[2]); + vst1q_s32(output + 3 * 4, res[3]); +} + +static INLINE void highbd_fadst4x4_neon(int32x4_t *in /*[4]*/) { + int32x2_t s_lo[4], s_hi[4]; + int64x2_t u_lo[4], u_hi[4], t_lo[4], t_hi[4]; + + s_lo[0] = vget_low_s32(in[0]); 
+ s_hi[0] = vget_high_s32(in[0]); + s_lo[1] = vget_low_s32(in[1]); + s_hi[1] = vget_high_s32(in[1]); + s_lo[2] = vget_low_s32(in[2]); + s_hi[2] = vget_high_s32(in[2]); + s_lo[3] = vget_low_s32(in[3]); + s_hi[3] = vget_high_s32(in[3]); + + // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9 + t_lo[0] = vmull_n_s32(s_lo[0], sinpi_1_9); + t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[1], sinpi_2_9); + t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[3], sinpi_4_9); + t_hi[0] = vmull_n_s32(s_hi[0], sinpi_1_9); + t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[1], sinpi_2_9); + t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[3], sinpi_4_9); + + // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9 + t_lo[1] = vmull_n_s32(s_lo[0], sinpi_3_9); + t_lo[1] = vmlal_n_s32(t_lo[1], s_lo[1], sinpi_3_9); + t_lo[1] = vmlsl_n_s32(t_lo[1], s_lo[3], sinpi_3_9); + t_hi[1] = vmull_n_s32(s_hi[0], sinpi_3_9); + t_hi[1] = vmlal_n_s32(t_hi[1], s_hi[1], sinpi_3_9); + t_hi[1] = vmlsl_n_s32(t_hi[1], s_hi[3], sinpi_3_9); + + // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9 + t_lo[2] = vmull_n_s32(s_lo[0], sinpi_4_9); + t_lo[2] = vmlsl_n_s32(t_lo[2], s_lo[1], sinpi_1_9); + t_lo[2] = vmlal_n_s32(t_lo[2], s_lo[3], sinpi_2_9); + t_hi[2] = vmull_n_s32(s_hi[0], sinpi_4_9); + t_hi[2] = vmlsl_n_s32(t_hi[2], s_hi[1], sinpi_1_9); + t_hi[2] = vmlal_n_s32(t_hi[2], s_hi[3], sinpi_2_9); + + // t3 = s2 * sinpi_3_9 + t_lo[3] = vmull_n_s32(s_lo[2], sinpi_3_9); + t_hi[3] = vmull_n_s32(s_hi[2], sinpi_3_9); + + /* + * u0 = t0 + t3 + * u1 = t1 + * u2 = t2 - t3 + * u3 = t2 - t0 + t3 + */ + u_lo[0] = vaddq_s64(t_lo[0], t_lo[3]); + u_hi[0] = vaddq_s64(t_hi[0], t_hi[3]); + u_lo[1] = t_lo[1]; + u_hi[1] = t_hi[1]; + u_lo[2] = vsubq_s64(t_lo[2], t_lo[3]); + u_hi[2] = vsubq_s64(t_hi[2], t_hi[3]); + u_lo[3] = vaddq_s64(vsubq_s64(t_lo[2], t_lo[0]), t_lo[3]); + u_hi[3] = vaddq_s64(vsubq_s64(t_hi[2], t_hi[0]), t_hi[3]); + + // fdct_round_shift + in[0] = vcombine_s32(vrshrn_n_s64(u_lo[0], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[0], DCT_CONST_BITS)); + in[1] = 
vcombine_s32(vrshrn_n_s64(u_lo[1], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[1], DCT_CONST_BITS)); + in[2] = vcombine_s32(vrshrn_n_s64(u_lo[2], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[2], DCT_CONST_BITS)); + in[3] = vcombine_s32(vrshrn_n_s64(u_lo[3], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[3], DCT_CONST_BITS)); + + transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]); +} + +void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t in[4]; + // int i; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct4x4_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_4x4(input, in, stride); + highbd_fadst4x4_neon(in); + vpx_highbd_fdct4x4_pass1_neon(in); + highbd_write_buffer_4x4(output, in); + break; + case DCT_ADST: + highbd_load_buffer_4x4(input, in, stride); + vpx_highbd_fdct4x4_pass1_neon(in); + highbd_fadst4x4_neon(in); + highbd_write_buffer_4x4(output, in); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_4x4(input, in, stride); + highbd_fadst4x4_neon(in); + highbd_fadst4x4_neon(in); + highbd_write_buffer_4x4(output, in); + break; + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c new file mode 100644 index 000000000..33753f77b --- /dev/null +++ b/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <arm_neon.h> + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vpx_ports/mem.h" + +#ifdef __GNUC__ +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) +#else +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) +#endif + +static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { + int_mv result; + result.as_mv.row = row; + result.as_mv.col = col; + return result; +} + +static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { + // This is simplified from the C implementation to utilise that + // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and + // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] + return mv.as_int == 0 ? 0 : 1; +} + +static INLINE int mv_cost(const int_mv mv, const int *joint_cost, + int *const comp_cost[2]) { + assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX); + assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX); + return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + + comp_cost[1][mv.as_mv.col]; +} + +static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, + int sad_per_bit) { + const int_mv diff = + pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, + VP9_PROB_COST_SHIFT); +} + +/***************************************************************************** + * This function utilizes 3 properties of the cost function lookup tables, * + * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * + * vp9_encoder.c. 
* + * For the joint cost: * + * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * + * For the component costs: * + * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * + * (Equal costs for both components) * + * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * + * (Cost function is even) * + * If these do not hold, then this function cannot be used without * + * modification, in which case you can revert to using the C implementation, * + * which does not rely on these properties. * + *****************************************************************************/ +int vp9_diamond_search_sad_neon(const MACROBLOCK *x, + const search_site_config *cfg, MV *ref_mv, + MV *best_mv, int search_param, int sad_per_bit, + int *num00, const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv) { + static const uint32_t data[4] = { 0, 1, 2, 3 }; + const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); + + const int32x4_t zero_s32 = vdupq_n_s32(0); + const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); + const int16x8_t v_max_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(maxmv.as_int)); + const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); + const int16x8_t v_min_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(minmv.as_int)); + + const int32x4_t v_spb_d = vdupq_n_s32(sad_per_bit); + + const int32x4_t v_joint_cost_0_d = vdupq_n_s32(x->nmvjointsadcost[0]); + const int32x4_t v_joint_cost_1_d = vdupq_n_s32(x->nmvjointsadcost[1]); + + // search_param determines the length of the initial step and hence the number + // of iterations. + // 0 = initial step (MAX_FIRST_STEP) pel + // 1 = (MAX_FIRST_STEP/2) pel, + // 2 = (MAX_FIRST_STEP/4) pel... 
+ const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; + const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; + const int tot_steps = cfg->total_steps - search_param; + + const int_mv fcenter_mv = + pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); + const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int)); + + const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); + const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); + + int_mv bmv = pack_int_mv(ref_row, ref_col); + int_mv new_bmv = bmv; + int16x8_t v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); + + const int what_stride = x->plane[0].src.stride; + const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; + const uint8_t *const what = x->plane[0].src.buf; + const uint8_t *const in_what = + x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; + + // Work out the start point for the search + const uint8_t *best_address = in_what; + const uint8_t *new_best_address = best_address; +#if defined(__aarch64__) + int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address); +#else + int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); +#endif + unsigned int best_sad = INT_MAX; + int i, j, step; + + // Check the prerequisite cost function properties that are easy to check + // in an assert. See the function-level documentation for details on all + // prerequisites. 
+ assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); + assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); + + // Check the starting position + best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); + best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); + + *num00 = 0; + + for (i = 0, step = 0; step < tot_steps; step++) { + for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { + int16x8_t v_diff_mv_w; + int8x16_t v_inside_d; + uint32x4_t v_outside_d; + int32x4_t v_cost_d, v_sad_d; +#if defined(__aarch64__) + int64x2_t v_blocka[2]; +#else + int32x4_t v_blocka[1]; + uint32x2_t horiz_max_0, horiz_max_1; +#endif + + uint32_t horiz_max; + // Compute the candidate motion vectors + const int16x8_t v_ss_mv_w = vld1q_s16((const int16_t *)&ss_mv[i]); + const int16x8_t v_these_mv_w = vaddq_s16(v_bmv_w, v_ss_mv_w); + // Clamp them to the search bounds + int16x8_t v_these_mv_clamp_w = v_these_mv_w; + v_these_mv_clamp_w = vminq_s16(v_these_mv_clamp_w, v_max_mv_w); + v_these_mv_clamp_w = vmaxq_s16(v_these_mv_clamp_w, v_min_mv_w); + // The ones that did not change are inside the search area + v_inside_d = vreinterpretq_s8_u32( + vceqq_s32(vreinterpretq_s32_s16(v_these_mv_clamp_w), + vreinterpretq_s32_s16(v_these_mv_w))); + + // If none of them are inside, then move on +#if defined(__aarch64__) + horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d)); +#else + horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)), + vget_high_u32(vreinterpretq_u32_s8(v_inside_d))); + horiz_max_1 = vpmax_u32(horiz_max_0, horiz_max_0); + vst1_lane_u32(&horiz_max, horiz_max_1, 0); +#endif + if (LIKELY(horiz_max == 0)) { + continue; + } + + // The inverse mask indicates which of the MVs are outside + v_outside_d = + vreinterpretq_u32_s8(veorq_s8(v_inside_d, vdupq_n_s8((int8_t)0xff))); + // Shift right to keep the sign bit clear, we will use this later + // to set the cost to the maximum value. 
+ v_outside_d = vshrq_n_u32(v_outside_d, 1); + + // Compute the difference MV + v_diff_mv_w = vsubq_s16(v_these_mv_clamp_w, vfcmv); + // We utilise the fact that the cost function is even, and use the + // absolute difference. This allows us to use unsigned indexes later + // and reduces cache pressure somewhat as only a half of the table + // is ever referenced. + v_diff_mv_w = vabsq_s16(v_diff_mv_w); + + // Compute the SIMD pointer offsets. + { +#if defined(__aarch64__) // sizeof(intptr_t) == 8 + // Load the offsets + int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]); + int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]); + // Set the ones falling outside to zero + v_bo10_q = vandq_s64( + v_bo10_q, + vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(v_inside_d)))); + v_bo32_q = vandq_s64( + v_bo32_q, + vmovl_s32(vget_high_s32(vreinterpretq_s32_s8(v_inside_d)))); + // Compute the candidate addresses + v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q); + v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q); +#else // sizeof(intptr_t) == 4 + int32x4_t v_bo_d = vld1q_s32((const int32_t *)&ss_os[i]); + v_bo_d = vandq_s32(v_bo_d, vreinterpretq_s32_s8(v_inside_d)); + v_blocka[0] = vaddq_s32(v_ba_d, v_bo_d); +#endif + } + + fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); + + // Look up the component cost of the residual motion vector + { + uint32_t cost[4]; + int16_t __attribute__((aligned(16))) rowcol[8]; + vst1q_s16(rowcol, v_diff_mv_w); + + // Note: This is a use case for gather instruction + cost[0] = x->nmvsadcost[0][rowcol[0]] + x->nmvsadcost[0][rowcol[1]]; + cost[1] = x->nmvsadcost[0][rowcol[2]] + x->nmvsadcost[0][rowcol[3]]; + cost[2] = x->nmvsadcost[0][rowcol[4]] + x->nmvsadcost[0][rowcol[5]]; + cost[3] = x->nmvsadcost[0][rowcol[6]] + x->nmvsadcost[0][rowcol[7]]; + + v_cost_d = vld1q_s32((int32_t *)cost); + } + + // Now add in the joint cost + { + const uint32x4_t v_sel_d = + 
vceqq_s32(vreinterpretq_s32_s16(v_diff_mv_w), zero_s32); + const int32x4_t v_joint_cost_d = vreinterpretq_s32_u8( + vbslq_u8(vreinterpretq_u8_u32(v_sel_d), + vreinterpretq_u8_s32(v_joint_cost_0_d), + vreinterpretq_u8_s32(v_joint_cost_1_d))); + v_cost_d = vaddq_s32(v_cost_d, v_joint_cost_d); + } + + // Multiply by sad_per_bit + v_cost_d = vmulq_s32(v_cost_d, v_spb_d); + // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) + v_cost_d = + vaddq_s32(v_cost_d, vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1))); + v_cost_d = vshrq_n_s32(v_cost_d, VP9_PROB_COST_SHIFT); + // Add the cost to the sad + v_sad_d = vaddq_s32(v_sad_d, v_cost_d); + + // Make the motion vectors outside the search area have max cost + // by or'ing in the comparison mask, this way the minimum search won't + // pick them. + v_sad_d = vorrq_s32(v_sad_d, vreinterpretq_s32_u32(v_outside_d)); + + // Find the minimum value and index horizontally in v_sad_d + { + uint32_t local_best_sad; +#if defined(__aarch64__) + local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d)); +#else + uint32x2_t horiz_min_0 = + vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v_sad_d)), + vget_high_u32(vreinterpretq_u32_s32(v_sad_d))); + uint32x2_t horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); + vst1_lane_u32(&local_best_sad, horiz_min_1, 0); +#endif + + // Update the global minimum if the local minimum is smaller + if (LIKELY(local_best_sad < best_sad)) { +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + uint32_t local_best_idx; + const uint32x4_t v_sel_d = + vceqq_s32(v_sad_d, vdupq_n_s32(local_best_sad)); + uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d); + v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff)); + +#if defined(__aarch64__) + local_best_idx = vminvq_u32(v_mask_d); +#else + horiz_min_0 = + vmin_u32(vget_low_u32(v_mask_d), vget_high_u32(v_mask_d)); + horiz_min_1 = vpmin_u32(horiz_min_0, 
horiz_min_0); + vst1_lane_u32(&local_best_idx, horiz_min_1, 0); +#endif + + new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; + + best_sad = local_best_sad; + } + } + } + + bmv = new_bmv; + best_address = new_best_address; + + v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); +#if defined(__aarch64__) + v_ba_q = vdupq_n_s64((intptr_t)best_address); +#else + v_ba_d = vdupq_n_s32((intptr_t)best_address); +#endif + + if (UNLIKELY(best_address == in_what)) { + (*num00)++; + } + } + + *best_mv = bmv.as_mv; + return best_sad; +} diff --git a/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c index e46f789ba..bc8dd4a34 100644 --- a/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c +++ b/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c @@ -14,6 +14,7 @@ #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_dsp/vpx_filter.h" @@ -710,8 +711,8 @@ void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; - const int dst_uv_w = dst_w / 2; - const int dst_uv_h = dst_h / 2; + const int dst_uv_w = dst->uv_crop_width; + const int dst_uv_h = dst->uv_crop_height; int scaled = 0; // phase_scaler is usually 0 or 8. 
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c index 236c3176c..c2b55fcba 100644 --- a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -26,9 +26,8 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/vpx_dsp_common.h" -static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, - const int16x8_t dequant, - tran_low_t *dqcoeff) { +static VPX_FORCE_INLINE void calculate_dqcoeff_and_store( + const int16x8_t qcoeff, const int16x8_t dequant, tran_low_t *dqcoeff) { const int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); const int32x4_t dqcoeff_1 = @@ -42,6 +41,82 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, #endif // CONFIG_VP9_HIGHBITDEPTH } +static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr, + int16x8_t v_eobmax, + uint16x8_t v_nz_mask) { + const int16x8_t v_iscan = vld1q_s16(&iscan_ptr[0]); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan); + return vmaxq_s16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { +#ifdef __aarch64__ + return (uint16_t)vmaxvq_s16(v_eobmax); +#else + const int16x4_t v_eobmax_3210 = + vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + return (uint16_t)vget_lane_s16(v_eobmax_final, 0); +#endif // __aarch64__ +} + +static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *dequant_ptr, + int16x8_t *round, int16x8_t *quant, + 
int16x8_t *dequant) { + *round = vld1q_s16(round_ptr); + *quant = vld1q_s16(quant_ptr); + *dequant = vld1q_s16(dequant_ptr); +} + +static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round, + int16x8_t *v_quant, + int16x8_t *v_dequant) { +#ifdef __aarch64__ + *v_round = vdupq_laneq_s16(*v_round, 1); + *v_quant = vdupq_laneq_s16(*v_quant, 1); + *v_dequant = vdupq_laneq_s16(*v_dequant, 1); +#else + *v_round = vdupq_lane_s16(vget_low_s16(*v_round), 1); + *v_quant = vdupq_lane_s16(vget_low_s16(*v_quant), 1); + *v_dequant = vdupq_lane_s16(vget_low_s16(*v_dequant), 1); +#endif +} + +static VPX_FORCE_INLINE void quantize_fp_8( + const int16x8_t *v_round, const int16x8_t *v_quant, + const int16x8_t *v_dequant, const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + int16x8_t *v_eobmax) { + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, *v_round); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(*v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(*v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + calculate_dqcoeff_and_store(v_qcoeff, *v_dequant, dqcoeff_ptr); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + + *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); +} + void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, @@ -50,136 +125,54 @@ void vp9_quantize_fp_neon(const tran_low_t 
*coeff_ptr, intptr_t count, // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. int i; - const int16x8_t v_zero = vdupq_n_s16(0); - const int16x8_t v_one = vdupq_n_s16(1); - int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); - int16x8_t v_round = vmovq_n_s16(round_ptr[1]); - int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); - int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); - + int16x8_t v_eobmax = vdupq_n_s16(-1); + int16x8_t v_round, v_quant, v_dequant; (void)scan; - // adjust for dc - v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); - v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); - v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + load_fp_values(round_ptr, quant_ptr, dequant_ptr, &v_round, &v_quant, + &v_dequant); // process dc and the first seven ac coeffs - { - const int16x8_t v_iscan = vld1q_s16(&iscan[0]); - const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_abs = vabsq_s16(v_coeff); - const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); - v_round = vmovq_n_s16(round_ptr[1]); - v_quant = vmovq_n_s16(quant_ptr[1]); - v_dequant = 
vmovq_n_s16(dequant_ptr[1]); - } + quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr, + dqcoeff_ptr, &v_eobmax); + // now process the rest of the ac coeffs + update_fp_values(&v_round, &v_quant, &v_dequant); for (i = 8; i < count; i += 8) { - const int16x8_t v_iscan = vld1q_s16(&iscan[i]); - const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_abs = vabsq_s16(v_coeff); - const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr + i); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); + quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i, + qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax); } -#ifdef __aarch64__ - *eob_ptr = vmaxvq_s16(v_eobmax_76543210); -#else - { - const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210), - vget_high_s16(v_eobmax_76543210)); - const int64x1_t v_eobmax_xx32 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); - const int16x4_t v_eobmax_tmp = - vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); - const int64x1_t v_eobmax_xxx3 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); - const int16x4_t v_eobmax_final = - vmax_s16(v_eobmax_tmp, 
vreinterpret_s16_s64(v_eobmax_xxx3)); - - *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); - } -#endif // __aarch64__ + + *eob_ptr = get_max_eob(v_eobmax); } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); } -void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *round_ptr, - const int16_t *quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); - const int16x8_t neg_one = vdupq_n_s16(-1); - - // ROUND_POWER_OF_TWO(round_ptr[], 1) - const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - const int16x8_t quant = vld1q_s16(quant_ptr); - const int16x4_t dequant = vld1_s16(dequant_ptr); - // dequant >> 2 is used similar to zbin as a threshold. - const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); +static VPX_FORCE_INLINE void quantize_fp_32x32_8( + const int16x8_t *v_round, const int16x8_t *v_quant, + const int16x8_t *v_dequant, const int16x8_t *dequant_thresh, + const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int16x8_t *v_eobmax) { + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_coeff_abs = vabsq_s16(v_coeff); + const int16x8_t v_thr_mask = + vreinterpretq_s16_u16(vcgeq_s16(v_coeff_abs, *dequant_thresh)); + const int16x8_t v_tmp_rnd = + vandq_s16(vqaddq_s16(v_coeff_abs, *v_round), v_thr_mask); + const int16x8_t v_abs_qcoeff = vqdmulhq_s16(v_tmp_rnd, *v_quant); + const int16x8_t v_qcoeff = + vsubq_s16(veorq_s16(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + const uint16x8_t v_nz_mask = vceqq_s16(v_abs_qcoeff, vdupq_n_s16(0)); - // Process dc and the first seven ac coeffs. 
- const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - const int16x8_t dequant_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); - - int16x8_t qcoeff = vqaddq_s16(coeff_abs, round); int32x4_t dqcoeff_0, dqcoeff_1; - uint16x8_t eob_max; - (void)scan; - (void)count; - - // coeff * quant_ptr[]) >> 15 - qcoeff = vqdmulhq_s16(qcoeff, quant); - - // Restore sign. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - qcoeff = vandq_s16(qcoeff, dequant_mask); - - // qcoeff * dequant[] / 2 - dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), dequant); - dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); - + dqcoeff_0 = vmull_s16(vget_low_s16(v_qcoeff), vget_low_s16(*v_dequant)); + dqcoeff_1 = vmull_s16(vget_high_s16(v_qcoeff), vget_high_s16(*v_dequant)); // Add 1 if negative to round towards zero because the C uses division. 
dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + #if CONFIG_VP9_HIGHBITDEPTH vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1)); vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1)); @@ -188,76 +181,228 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, vshrn_n_s32(dqcoeff_1, 1))); #endif - eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + + *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); +} + +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int16x8_t eob_max = vdupq_n_s16(-1); + // ROUND_POWER_OF_TWO(round_ptr[], 1) + int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t dequant = vld1q_s16(dequant_ptr); + // dequant >> 2 is used similar to zbin as a threshold. + int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); + int i; + + (void)scan; + (void)count; + + // Process dc and the first seven ac coeffs. + quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, + iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max); - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + update_fp_values(&round, &quant, &dequant); + dequant_thresh = vdupq_lane_s16(vget_low_s16(dequant_thresh), 1); iscan += 8; coeff_ptr += 8; qcoeff_ptr += 8; dqcoeff_ptr += 8; - { - int i; - const int16x8_t round = vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1); - const int16x8_t quant = vmovq_n_s16(quant_ptr[1]); - const int16x8_t dequant_thresh = - vshrq_n_s16(vmovq_n_s16(dequant_ptr[1]), 2); - - // Process the rest of the ac coeffs. 
- for (i = 8; i < 32 * 32; i += 8) { - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - const int16x8_t dequant_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); - - int16x8_t qcoeff = vqaddq_s16(coeff_abs, round); - int32x4_t dqcoeff_0, dqcoeff_1; - - qcoeff = vqdmulhq_s16(qcoeff, quant); - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - qcoeff = vandq_s16(qcoeff, dequant_mask); - - dqcoeff_0 = vmull_n_s16(vget_low_s16(qcoeff), dequant_ptr[1]); - dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); - - dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); - dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + // Process the rest of the ac coeffs. + for (i = 8; i < 32 * 32; i += 8) { + quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, + iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max); + + iscan += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + *eob_ptr = get_max_eob(eob_max); +} #if CONFIG_VP9_HIGHBITDEPTH - vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1)); - vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1)); -#else - store_s16q_to_tran_low( - dqcoeff_ptr, - vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1))); -#endif +static VPX_FORCE_INLINE uint16x4_t +highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, + int32x4_t v_dequant_s32, int32x4_t v_round_s32) { + const int32x4_t v_coeff = vld1q_s32(coeff_ptr); + const int32x4_t v_coeff_sign = + vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0))); + const int32x4_t v_abs_coeff = vabsq_s32(v_coeff); + const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32); + // const int abs_qcoeff = (int)((tmp * 
quant) >> 16); + const int32x4_t v_abs_qcoeff = vqdmulhq_s32(v_tmp, v_quant_s32); + // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_qcoeff = + vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + const int32x4_t v_abs_dqcoeff = vmulq_s32(v_abs_qcoeff, v_dequant_s32); + // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_dqcoeff = + vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); + + vst1q_s32(qcoeff_ptr, v_qcoeff); + vst1q_s32(dqcoeff_ptr, v_dqcoeff); + + // Packed nz_qcoeff_mask. Used to find eob. + return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0))); +} - eob_max = - vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); +void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int16x4_t v_zero = vdup_n_s16(0); + const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_dequant = vld1_s16(dequant_ptr); + const int16x4_t v_round = vld1_s16(round_ptr); + int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); + int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); + int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); + uint16x4_t v_mask_lo, v_mask_hi; + int16x8_t v_eobmax = vdupq_n_s16(-1); - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + (void)scan; - iscan += 8; - coeff_ptr += 8; - qcoeff_ptr += 8; - dqcoeff_ptr += 8; - } + // DC and first 3 AC + v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + + // overwrite the DC constants with AC constants + v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); + v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); + v_dequant_s32 = 
vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); + + // 4 more AC + v_mask_hi = + highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + + // Find the max lane eob for the first 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + + n_coeffs -= 8; + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + v_mask_hi = + highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + // Find the max lane eob for 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + n_coeffs -= 8; + } while (n_coeffs); + + *eob_ptr = get_max_eob(v_eobmax); +} -#ifdef __aarch64__ - *eob_ptr = vmaxvq_u16(eob_max); -#else - { - const uint16x4_t eob_max_0 = - vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); - const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); - const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); - vst1_lane_u16(eob_ptr, eob_max_2, 0); - } -#endif // __aarch64__ - } +static VPX_FORCE_INLINE uint16x4_t +highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, + int32x4_t v_dequant_s32, int32x4_t v_round_s32) { + const int32x4_t v_coeff = vld1q_s32(coeff_ptr); + const int32x4_t v_coeff_sign = + vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0))); + const int32x4_t v_abs_coeff = vabsq_s32(v_coeff); + // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) + const int32x4_t v_abs_coeff_scaled = vshlq_n_s32(v_abs_coeff, 2); + const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32); + // const int64_t tmp = vmask ? 
(int64_t)abs_coeff + log_scaled_round : 0 + const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32), + vreinterpretq_s32_u32(v_mask)); + // const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale)); + const int32x4_t v_abs_qcoeff = + vqdmulhq_s32(vshlq_n_s32(v_tmp, 1), v_quant_s32); + // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_qcoeff = + vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + // vshlq_s32 will shift right if shift value is negative. + const int32x4_t v_abs_dqcoeff = + vshrq_n_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), 1); + // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_dqcoeff = + vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); + + vst1q_s32(qcoeff_ptr, v_qcoeff); + vst1q_s32(dqcoeff_ptr, v_dqcoeff); + + // Packed nz_qcoeff_mask. Used to find eob. + return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0))); } + +void vp9_highbd_quantize_fp_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_dequant = vld1_s16(dequant_ptr); + const int16x4_t v_zero = vdup_n_s16(0); + const int16x4_t v_round = + vqrdmulh_n_s16(vld1_s16(round_ptr), (int16_t)(1 << 14)); + int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); + int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); + int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); + uint16x4_t v_mask_lo, v_mask_hi; + int16x8_t v_eobmax = vdupq_n_s16(-1); + + (void)scan; + + // DC and first 3 AC + v_mask_lo = + highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + + // overwrite the DC constants with AC constants + v_round_s32 = 
vdupq_lane_s32(vget_low_s32(v_round_s32), 1); + v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); + v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); + + // 4 more AC + v_mask_hi = + highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + + // Find the max lane eob for the first 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + + n_coeffs -= 8; + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_mask_lo = + highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + v_mask_hi = highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, + dqcoeff_ptr + 4, v_quant_s32, + v_dequant_s32, v_round_s32); + // Find the max lane eob for 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + n_coeffs -= 8; + } while (n_coeffs); + + *eob_ptr = get_max_eob(v_eobmax); +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c index e336179e9..28ab10a13 100644 --- a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -471,7 +471,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { cr->sb_index = i; cr->reduce_refresh = 0; if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) - if (count_sel<(3 * count_tot)>> 2) cr->reduce_refresh = 1; + if (count_sel < (3 * count_tot) >> 2) cr->reduce_refresh = 1; } // Set cyclic refresh parameters. 
@@ -558,7 +558,7 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->percent_refresh = 10; cr->rate_ratio_qdelta = 1.5; cr->rate_boost_fac = 10; - if (cpi->refresh_golden_frame == 1) { + if (cpi->refresh_golden_frame == 1 && !cpi->use_svc) { cr->percent_refresh = 0; cr->rate_ratio_qdelta = 1.0; } diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c index 75bd097f2..a84c8b524 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libvpx/vp9/encoder/vp9_bitstream.c @@ -134,9 +134,9 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, const TOKENEXTRA *p; const vp9_extra_bit *const extra_bits = #if CONFIG_VP9_HIGHBITDEPTH - (bit_depth == VPX_BITS_12) - ? vp9_extra_bits_high12 - : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 : vp9_extra_bits; + (bit_depth == VPX_BITS_12) ? vp9_extra_bits_high12 + : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 + : vp9_extra_bits; #else vp9_extra_bits; (void)bit_depth; diff --git a/libvpx/vp9/encoder/vp9_denoiser.c b/libvpx/vp9/encoder/vp9_denoiser.c index 2885223b5..77d72396a 100644 --- a/libvpx/vp9/encoder/vp9_denoiser.c +++ b/libvpx/vp9/encoder/vp9_denoiser.c @@ -233,7 +233,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( frame == ALTREF_FRAME || (frame == GOLDEN_FRAME && use_gf_temporal_ref) || (frame != LAST_FRAME && - ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) || + ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) || denoiser->denoising_level >= kDenHigh))) { frame = LAST_FRAME; ctx->newmv_sse = ctx->zeromv_lastref_sse; @@ -764,8 +764,9 @@ int64_t vp9_scale_acskip_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, int temporal_layer_id) { if (noise_level >= kDenLow && abs_sumdiff < 5) - return threshold *= - (noise_level == kDenLow) ? 2 : (temporal_layer_id == 2) ? 10 : 6; + return threshold *= (noise_level == kDenLow) ? 2 + : (temporal_layer_id == 2) ? 
10 + : 6; else return threshold; } diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c index a9f392bf5..1483ac069 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libvpx/vp9/encoder/vp9_encodeframe.c @@ -1299,7 +1299,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // the reference (base layer frame) is key frame (i.e., is_key_frame == 1). int is_key_frame = (frame_is_intra_only(cm) || - (is_one_pass_cbr_svc(cpi) && + (is_one_pass_svc(cpi) && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); // Always use 4x4 partition for key frame. const int use_4x4_partition = frame_is_intra_only(cm); @@ -1406,7 +1406,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, assert(yv12 != NULL); - if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) || + if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) || cpi->svc.use_gf_temporal_ref_current_layer) { // For now, GOLDEN will not be used for non-zero spatial layers, since // it may not be a temporal reference. 
@@ -3413,7 +3413,8 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, const VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; - const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_buffer(cpi, ref); + YV12_BUFFER_CONFIG *yv12; + YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); const int step_param = 1; const MvLimits tmp_mv_limits = x->mv_limits; const SEARCH_METHODS search_method = NSTEP; @@ -3422,6 +3423,11 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, MV best_mv = { 0, 0 }; int cost_list[5]; + if (scaled_ref_frame) + yv12 = scaled_ref_frame; + else + yv12 = get_ref_frame_buffer(cpi, ref); + assert(yv12 != NULL); if (!yv12) return; vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, @@ -5381,7 +5387,7 @@ static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile, assert(yv12 != NULL); - if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) || + if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) || cpi->svc.use_gf_temporal_ref_current_layer) { // For now, GOLDEN will not be used for non-zero spatial layers, since // it may not be a temporal reference. diff --git a/libvpx/vp9/encoder/vp9_encoder.c b/libvpx/vp9/encoder/vp9_encoder.c index d3f4d1ea8..b66fdc0bc 100644 --- a/libvpx/vp9/encoder/vp9_encoder.c +++ b/libvpx/vp9/encoder/vp9_encoder.c @@ -1333,7 +1333,7 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a // target of 1/4x1/4. number_spatial_layers must be greater than 2. 
- if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc && + if (is_one_pass_svc(cpi) && !cpi->svc.scaled_temp_is_alloc && cpi->svc.number_spatial_layers > 2) { cpi->svc.scaled_temp_is_alloc = 1; if (vpx_realloc_frame_buffer( @@ -1511,7 +1511,7 @@ static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { // Temporal scalability. cpi->svc.number_temporal_layers = oxcf->ts_number_layers; - if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || + if ((cpi->svc.number_temporal_layers > 1) || ((cpi->svc.number_temporal_layers > 1 || cpi->svc.number_spatial_layers > 1) && cpi->oxcf.pass != 1)) { @@ -1527,6 +1527,7 @@ static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { init_buffer_indices(cpi); vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height); + cpi->fixed_qp_onepass = 0; } void vp9_check_reset_rc_flag(VP9_COMP *cpi) { @@ -2077,7 +2078,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { rc->rc_2_frame = 0; } - if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || + if ((cpi->svc.number_temporal_layers > 1) || ((cpi->svc.number_temporal_layers > 1 || cpi->svc.number_spatial_layers > 1) && cpi->oxcf.pass != 1)) { @@ -3263,7 +3264,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { vp9_denoiser_update_ref_frame(cpi); #endif - if (is_one_pass_cbr_svc(cpi)) vp9_svc_update_ref_frame(cpi); + if (is_one_pass_svc(cpi)) vp9_svc_update_ref_frame(cpi); } static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { @@ -3857,11 +3858,11 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, int q = 0, bottom_index = 0, top_index = 0; int no_drop_scene_change = 0; const INTERP_FILTER filter_scaler = - (is_one_pass_cbr_svc(cpi)) + (is_one_pass_svc(cpi)) ? svc->downsample_filter_type[svc->spatial_layer_id] : EIGHTTAP; const int phase_scaler = - (is_one_pass_cbr_svc(cpi)) + (is_one_pass_svc(cpi)) ? 
svc->downsample_filter_phase[svc->spatial_layer_id] : 0; @@ -3882,7 +3883,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, set_frame_size(cpi); - if (is_one_pass_cbr_svc(cpi) && + if (is_one_pass_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 2 && cpi->un_scaled_source->y_height == cm->height << 2 && svc->scaled_temp.y_width == cm->width << 1 && @@ -3896,7 +3897,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp, filter_scaler, phase_scaler, filter_scaler2, phase_scaler2); svc->scaled_one_half = 1; - } else if (is_one_pass_cbr_svc(cpi) && + } else if (is_one_pass_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 1 && cpi->un_scaled_source->y_height == cm->height << 1 && svc->scaled_one_half) { @@ -3911,7 +3912,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, } #ifdef OUTPUT_YUV_SVC_SRC // Write out at most 3 spatial layers. - if (is_one_pass_cbr_svc(cpi) && svc->spatial_layer_id < 3) { + if (is_one_pass_svc(cpi) && svc->spatial_layer_id < 3) { vpx_write_yuv_frame(yuv_svc_src[svc->spatial_layer_id], cpi->Source); } #endif @@ -4020,14 +4021,14 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (vp9_rc_drop_frame(cpi)) return 0; } - // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame + // For 1 pass SVC, only ZEROMV is allowed for spatial reference frame // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can // avoid this frame-level upsampling (for non intra_only frames). // For SVC single_layer mode, dynamic resize is allowed and we need to // scale references for this case. 
if (frame_is_intra_only(cm) == 0 && ((svc->single_layer_svc && cpi->oxcf.resize_mode == RESIZE_DYNAMIC) || - !(is_one_pass_cbr_svc(cpi) && svc->force_zero_mode_spatial_ref))) { + !(is_one_pass_svc(cpi) && svc->force_zero_mode_spatial_ref))) { vp9_scale_references(cpi); } @@ -4367,7 +4368,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest int frame_over_shoot_limit; int frame_under_shoot_limit; int q = 0, q_low = 0, q_high = 0; - int last_q_attempt = 0; int enable_acl; #ifdef AGGRESSIVE_VBR int qrange_adj = 1; @@ -4381,8 +4381,18 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest // Maximal frame size allowed by the external rate control. // case: 0, we ignore the max frame size limit, and encode with the qindex // passed in by the external rate control model. - // case: -1, we take VP9's decision for the max frame size. + // If the external qindex is VPX_DEFAULT_Q, libvpx will pick a qindex + // and may recode if undershoot/overshoot is seen. + // If the external qindex is not VPX_DEFAULT_Q, we force no recode. + // case: -1, we take libvpx's decision for the max frame size, as well as + // the recode decision. + // Otherwise: if a specific size is given, libvpx's recode decision + // will respect the given size. int ext_rc_max_frame_size = 0; + // Use VP9's decision of qindex. This flag is in use only in external rate + // control model to help determine whether to recode when + // |ext_rc_max_frame_size| is 0. 
+ int ext_rc_use_default_q = 1; const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth; #if CONFIG_RATE_CTRL @@ -4491,7 +4501,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } } #endif // CONFIG_RATE_CTRL - if (cpi->ext_ratectrl.ready && !ext_rc_recode) { + if (cpi->ext_ratectrl.ready && !ext_rc_recode && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; @@ -4500,16 +4511,27 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); + // index 0 of a gf group is always KEY/OVERLAY/GOLDEN. + // index 1 refers to the first encoding frame in a gf group. + // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref. + // See function define_gf_group_structure(). + const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE; get_ref_frame_bufs(cpi, ref_frame_bufs); codec_status = vp9_extrc_get_encodeframe_decision( &cpi->ext_ratectrl, curr_frame_buf->frame_index, cm->current_frame_coding_index, gf_group->index, update_type, - ref_frame_bufs, ref_frame_flags, &encode_frame_decision); + gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags, + &encode_frame_decision); if (codec_status != VPX_CODEC_OK) { vpx_internal_error(&cm->error, codec_status, "vp9_extrc_get_encodeframe_decision() failed"); } - q = encode_frame_decision.q_index; + // If the external model recommends a reserved value, we use + // libvpx's default q. 
+ if (encode_frame_decision.q_index != VPX_DEFAULT_Q) { + q = encode_frame_decision.q_index; + ext_rc_use_default_q = 0; + } ext_rc_max_frame_size = encode_frame_decision.max_frame_size; } @@ -4551,8 +4573,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; } - if (cpi->ext_ratectrl.ready) { - last_q_attempt = q; + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) { // In general, for the external rate control, we take the qindex provided // as input and encode the frame with this qindex faithfully. However, // in some extreme scenarios, the provided qindex leads to a massive @@ -4560,20 +4582,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest // to pick a new qindex and recode the frame. We return the new qindex // through the API to the external model. if (ext_rc_max_frame_size == 0) { - break; + if (!ext_rc_use_default_q) break; } else if (ext_rc_max_frame_size == -1) { - if (rc->projected_frame_size < rc->max_frame_bandwidth) { - break; - } + // Do nothing, fall back to libvpx's recode decision. } else { - if (rc->projected_frame_size < ext_rc_max_frame_size) { - break; - } + // Change the max frame size, used in libvpx's recode decision. + rc->max_frame_bandwidth = ext_rc_max_frame_size; } - rc->max_frame_bandwidth = ext_rc_max_frame_size; - // If the current frame size exceeds the ext_rc_max_frame_size, - // we adjust the worst qindex to meet the frame size constraint. - q_high = 255; ext_rc_recode = 1; } #if CONFIG_RATE_CTRL @@ -4776,23 +4791,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest rc->projected_frame_size < rc->max_frame_bandwidth) loop = 0; - // Special handling of external max frame size constraint - if (ext_rc_recode) { - // If the largest q is not able to meet the max frame size limit, - // do nothing. 
- if (rc->projected_frame_size > ext_rc_max_frame_size && - last_q_attempt == 255) { - break; - } - // If VP9's q selection leads to a smaller q, we force it to use - // a larger q to better approximate the external max frame size - // constraint. - if (rc->projected_frame_size > ext_rc_max_frame_size && - q <= last_q_attempt) { - q = VPXMIN(255, last_q_attempt + 1); - } - } - if (loop) { ++loop_count; ++loop_at_this_size; @@ -5518,6 +5516,32 @@ static void encode_frame_to_data_rate( save_encode_params(cpi); } #endif + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0) { + vpx_codec_err_t codec_status; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + const int ref_frame_flags = get_ref_frame_flags(cpi); + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; + const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); + // index 0 of a gf group is always KEY/OVERLAY/GOLDEN. + // index 1 refers to the first encoding frame in a gf group. + // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref. + // See function define_gf_group_structure(). 
+ const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE; + int ext_rdmult = VPX_DEFAULT_RDMULT; + get_ref_frame_bufs(cpi, ref_frame_bufs); + codec_status = vp9_extrc_get_frame_rdmult( + &cpi->ext_ratectrl, curr_frame_buf->frame_index, + cm->current_frame_coding_index, gf_group->index, update_type, + gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags, + &ext_rdmult); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_frame_rdmult() failed"); + } + cpi->ext_ratectrl.ext_rdmult = ext_rdmult; + } if (cpi->sf.recode_loop == DISALLOW_RECODE) { if (!encode_without_recode_loop(cpi, size, dest)) return; @@ -5593,7 +5617,7 @@ static void encode_frame_to_data_rate( // build the bitstream vp9_pack_bitstream(cpi, dest, size); - { + if (cpi->ext_ratectrl.ready) { const RefCntBuffer *coded_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result( @@ -5800,16 +5824,6 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags, ENCODE_FRAME_RESULT *encode_frame_result) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; - - if (cpi->common.current_frame_coding_index == 0) { - VP9_COMMON *cm = &cpi->common; - const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( - &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); - if (codec_status != VPX_CODEC_OK) { - vpx_internal_error(&cm->error, codec_status, - "vp9_extrc_send_firstpass_stats() failed"); - } - } #if CONFIG_MISMATCH_DEBUG mismatch_move_frame_idx_w(); #endif @@ -7626,8 +7640,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, const int gf_group_index = cpi->twopass.gf_group.index; int i; - if (is_one_pass_cbr_svc(cpi)) { - vp9_one_pass_cbr_svc_start_layer(cpi); + if (is_one_pass_svc(cpi)) { + vp9_one_pass_svc_start_layer(cpi); } vpx_usec_timer_start(&cmptimer); @@ -7647,7 +7661,7 @@ int 
vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Normal defaults cm->reset_frame_context = 0; cm->refresh_frame_context = 1; - if (!is_one_pass_cbr_svc(cpi)) { + if (!is_one_pass_svc(cpi)) { cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 0; cpi->refresh_alt_ref_frame = 0; @@ -7780,7 +7794,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, adjust_frame_rate(cpi, source); } - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { vp9_update_temporal_layer_framerate(cpi); vp9_restore_layer_context(cpi); } @@ -7914,12 +7928,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } // Save layer specific state. - if (is_one_pass_cbr_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 || - cpi->svc.number_spatial_layers > 1) && - oxcf->pass == 2)) { + if (is_one_pass_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + oxcf->pass == 2)) { vp9_save_layer_context(cpi); } + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cpi->fixed_qp_onepass = 0; + vpx_usec_timer_mark(&cmptimer); cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); @@ -7928,7 +7945,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #if CONFIG_INTERNAL_STATS - if (oxcf->pass != 1) { + if (oxcf->pass != 1 && !cpi->last_frame_dropped) { double samples = 0.0; cpi->bytes += (int)(*size); @@ -8090,7 +8107,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { if (cm->show_frame) { ++cpi->svc.spatial_layer_to_encode; if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) @@ -8159,9 +8176,11 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, unsigned int height) { VP9_COMMON *cm = &cpi->common; #if CONFIG_VP9_HIGHBITDEPTH - update_initial_width(cpi, cm->use_highbitdepth, 1, 1); + update_initial_width(cpi, 
cm->use_highbitdepth, cpi->common.subsampling_x, + cpi->common.subsampling_y); #else - update_initial_width(cpi, 0, 1, 1); + update_initial_width(cpi, 0, cpi->common.subsampling_x, + cpi->common.subsampling_y); #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_TEMPORAL_DENOISING diff --git a/libvpx/vp9/encoder/vp9_encoder.h b/libvpx/vp9/encoder/vp9_encoder.h index 1d5894525..cca8b53f8 100644 --- a/libvpx/vp9/encoder/vp9_encoder.h +++ b/libvpx/vp9/encoder/vp9_encoder.h @@ -971,6 +971,8 @@ typedef struct VP9_COMP { RATE_QSTEP_MODEL rq_model[ENCODE_FRAME_TYPES]; #endif EXT_RATECTRL ext_ratectrl; + + int fixed_qp_onepass; } VP9_COMP; #if CONFIG_RATE_CTRL @@ -1305,7 +1307,7 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required( void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); -static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { +static INLINE int is_one_pass_svc(const struct VP9_COMP *const cpi) { return (cpi->use_svc && cpi->oxcf.pass == 0); } diff --git a/libvpx/vp9/encoder/vp9_ext_ratectrl.c b/libvpx/vp9/encoder/vp9_ext_ratectrl.c index 9f0098ab5..1d440442b 100644 --- a/libvpx/vp9/encoder/vp9_ext_ratectrl.c +++ b/libvpx/vp9/encoder/vp9_ext_ratectrl.c @@ -137,19 +137,21 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, - FRAME_UPDATE_TYPE update_type, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision) { if (ext_ratectrl == NULL) { return VPX_CODEC_INVALID_PARAM; } - if (ext_ratectrl->ready) { + if (ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0) { vpx_rc_status_t rc_status; vpx_rc_encodeframe_info_t encode_frame_info; encode_frame_info.show_index = show_index; encode_frame_info.coding_index = coding_index; 
encode_frame_info.gop_index = gop_index; encode_frame_info.frame_type = extrc_get_frame_type(update_type); + encode_frame_info.gop_size = gop_size; + encode_frame_info.use_alt_ref = use_alt_ref; vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, encode_frame_info.ref_frame_coding_indexes, @@ -198,3 +200,62 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( } return VPX_CODEC_OK; } + +vpx_codec_err_t vp9_extrc_get_gop_decision( + EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, + vpx_rc_gop_decision_t *gop_decision) { + vpx_rc_status_t rc_status; + if (ext_ratectrl == NULL || !ext_ratectrl->ready || + (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) { + return VPX_CODEC_INVALID_PARAM; + } + rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, + gop_info, gop_decision); + if (gop_decision->use_alt_ref) { + const int arf_constraint = + gop_decision->gop_coding_frames >= gop_info->min_gf_interval && + gop_decision->gop_coding_frames < gop_info->lag_in_frames; + if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR; + } + // TODO(chengchen): Take min and max gf interval from the model + // and overwrite libvpx's decision so that we can get rid + // of one of the checks here. 
+ if (gop_decision->gop_coding_frames > gop_info->frames_to_key || + gop_decision->gop_coding_frames - gop_decision->use_alt_ref > + gop_info->max_gf_interval) { + return VPX_CODEC_ERROR; + } + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + return VPX_CODEC_OK; +} + +vpx_codec_err_t vp9_extrc_get_frame_rdmult( + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + int *rdmult) { + vpx_rc_status_t rc_status; + vpx_rc_encodeframe_info_t encode_frame_info; + if (ext_ratectrl == NULL || !ext_ratectrl->ready || + (ext_ratectrl->funcs.rc_type & VPX_RC_RDMULT) == 0) { + return VPX_CODEC_INVALID_PARAM; + } + encode_frame_info.show_index = show_index; + encode_frame_info.coding_index = coding_index; + encode_frame_info.gop_index = gop_index; + encode_frame_info.frame_type = extrc_get_frame_type(update_type); + encode_frame_info.gop_size = gop_size; + encode_frame_info.use_alt_ref = use_alt_ref; + + vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, + encode_frame_info.ref_frame_coding_indexes, + encode_frame_info.ref_frame_valid_list); + rc_status = ext_ratectrl->funcs.get_frame_rdmult(ext_ratectrl->model, + &encode_frame_info, rdmult); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + return VPX_CODEC_OK; +} diff --git a/libvpx/vp9/encoder/vp9_ext_ratectrl.h b/libvpx/vp9/encoder/vp9_ext_ratectrl.h index 74fd68b96..7c3875883 100644 --- a/libvpx/vp9/encoder/vp9_ext_ratectrl.h +++ b/libvpx/vp9/encoder/vp9_ext_ratectrl.h @@ -16,6 +16,7 @@ typedef struct EXT_RATECTRL { int ready; + int ext_rdmult; vpx_rc_model_t model; vpx_rc_funcs_t funcs; vpx_rc_config_t ratectrl_config; @@ -35,7 +36,7 @@ vpx_codec_err_t vp9_extrc_send_firstpass_stats( vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, 
- FRAME_UPDATE_TYPE update_type, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, vpx_rc_encodeframe_decision_t *encode_frame_decision); @@ -45,4 +46,14 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result( const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth, uint32_t input_bit_depth, const int actual_encoding_qindex); +vpx_codec_err_t vp9_extrc_get_gop_decision( + EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info, + vpx_rc_gop_decision_t *gop_decision); + +vpx_codec_err_t vp9_extrc_get_frame_rdmult( + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + int *rdmult); + #endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c index 67302ed03..e9250e25c 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libvpx/vp9/encoder/vp9_firstpass.c @@ -2113,11 +2113,10 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, } // Clamp odd edge cases. - total_group_bits = (total_group_bits < 0) - ? 0 - : (total_group_bits > twopass->kf_group_bits) - ? twopass->kf_group_bits - : total_group_bits; + total_group_bits = (total_group_bits < 0) ? 0 + : (total_group_bits > twopass->kf_group_bits) + ? twopass->kf_group_bits + : total_group_bits; // Clip based on user supplied data rate variability limit. if (total_group_bits > (int64_t)max_bits * gop_frames) @@ -2714,6 +2713,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // frame in which case it will already have been done. 
if (is_key_frame == 0) { vp9_zero(twopass->gf_group); + ++rc->gop_global_index; + } else { + rc->gop_global_index = 0; } vpx_clear_system_state(); @@ -2751,6 +2753,37 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { } } #endif + // If the external rate control model for GOP is used, the gop decisions + // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref| + // will be overwritten. + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0) { + vpx_codec_err_t codec_status; + vpx_rc_gop_decision_t gop_decision; + vpx_rc_gop_info_t gop_info; + gop_info.min_gf_interval = rc->min_gf_interval; + gop_info.max_gf_interval = rc->max_gf_interval; + gop_info.active_min_gf_interval = active_gf_interval.min; + gop_info.active_max_gf_interval = active_gf_interval.max; + gop_info.allow_alt_ref = allow_alt_ref; + gop_info.is_key_frame = is_key_frame; + gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active; + gop_info.frames_since_key = rc->frames_since_key; + gop_info.frames_to_key = rc->frames_to_key; + gop_info.lag_in_frames = cpi->oxcf.lag_in_frames; + gop_info.show_index = cm->current_video_frame; + gop_info.coding_index = cm->current_frame_coding_index; + gop_info.gop_global_index = rc->gop_global_index; + + codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info, + &gop_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_gop_decision() failed"); + } + gop_coding_frames = gop_decision.gop_coding_frames; + use_alt_ref = gop_decision.use_alt_ref; + } // Was the group length constrained by the requirement for a new KF? rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 
1 : 0; @@ -3461,6 +3494,16 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { FIRSTPASS_STATS this_frame; const int show_idx = cm->current_video_frame; + if (cpi->common.current_frame_coding_index == 0) { + VP9_COMMON *cm = &cpi->common; + const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( + &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_send_firstpass_stats() failed"); + } + } + if (!twopass->stats_in) return; // Configure image size specific vizier parameters diff --git a/libvpx/vp9/encoder/vp9_pickmode.c b/libvpx/vp9/encoder/vp9_pickmode.c index 697c589ab..579b466ca 100644 --- a/libvpx/vp9/encoder/vp9_pickmode.c +++ b/libvpx/vp9/encoder/vp9_pickmode.c @@ -268,6 +268,7 @@ static void block_variance(const uint8_t *src, int src_stride, #endif uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) { int i, j, k = 0; + uint32_t k_sqr = 0; *sse = 0; *sum = 0; @@ -305,7 +306,8 @@ static void block_variance(const uint8_t *src, int src_stride, #endif *sse += sse8x8[k]; *sum += sum8x8[k]; - var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6); + k_sqr = (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6); + var8x8[k] = sse8x8[k] > k_sqr ? 
sse8x8[k] - k_sqr : k_sqr - sse8x8[k]; k++; } } @@ -319,6 +321,7 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size, const int nw = 1 << (bw - b_width_log2_lookup[unit_size]); const int nh = 1 << (bh - b_height_log2_lookup[unit_size]); int i, j, k = 0; + uint32_t k_sqr = 0; for (i = 0; i < nh; i += 2) { for (j = 0; j < nw; j += 2) { @@ -326,9 +329,10 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size, sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1]; sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] + sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1]; - var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >> - (b_width_log2_lookup[unit_size] + - b_height_log2_lookup[unit_size] + 6)); + k_sqr = (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >> + (b_width_log2_lookup[unit_size] + + b_height_log2_lookup[unit_size] + 6)); + var_o[k] = sse_o[k] > k_sqr ? sse_o[k] - k_sqr : k_sqr - sse_o[k]; k++; } } @@ -452,6 +456,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, unsigned int var8x8[64] = { 0 }; TX_SIZE tx_size; int i, k; + uint32_t sum_sqr; #if CONFIG_VP9_HIGHBITDEPTH const vpx_bit_depth_t bd = cpi->common.bit_depth; #endif @@ -463,7 +468,8 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, cpi->common.use_highbitdepth, bd, #endif sse8x8, sum8x8, var8x8); - var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4)); + sum_sqr = (uint32_t)((int64_t)sum * sum) >> (bw + bh + 4); + var = sse > sum_sqr ? 
sse - sum_sqr : sum_sqr - sse; *var_y = var; *sse_y = sse; @@ -1112,7 +1118,7 @@ static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh, } static INLINE void update_thresh_freq_fact_row_mt( - VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance, + VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance, int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx, PREDICTION_MODE mode) { THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; @@ -1627,9 +1633,9 @@ static int search_new_mv(VP9_COMP *cpi, MACROBLOCK *x, return -1; // Exit NEWMV search if base_mv_sse is large. - if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale)) + if (sf->base_mv_aggressive && (base_mv_sse >> scale) > best_sse_sofar) return -1; - if (base_mv_sse < (best_sse_sofar << 1)) { + if ((base_mv_sse >> 1) < best_sse_sofar) { // Base layer mv is good. // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since // (0, 0) mode is already tested. 
diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c index 9058997b0..dcc44449f 100644 --- a/libvpx/vp9/encoder/vp9_quantize.c +++ b/libvpx/vp9/encoder/vp9_quantize.c @@ -149,34 +149,6 @@ void vp9_highbd_quantize_fp_32x32_c( } #endif -void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, - const int16_t *scan, const int16_t *iscan) { - MACROBLOCKD *const xd = &x->e_mbd; - struct macroblock_plane *p = &x->plane[plane]; - struct macroblockd_plane *pd = &xd->plane[plane]; - tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block), - *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const int n_coeffs = 4 * 4; - - if (x->skip_block) { - memset(qcoeff, 0, n_coeffs * sizeof(*qcoeff)); - memset(dqcoeff, 0, n_coeffs * sizeof(*dqcoeff)); - return; - } - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, - p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, &p->eobs[block], scan, iscan); - return; - } -#endif - vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, - &p->eobs[block], scan, iscan); -} - static void invert_quant(int16_t *quant, int16_t *shift, int d) { unsigned t; int l, m; diff --git a/libvpx/vp9/encoder/vp9_quantize.h b/libvpx/vp9/encoder/vp9_quantize.h index 2e6d7da2b..f626f0656 100644 --- a/libvpx/vp9/encoder/vp9_quantize.h +++ b/libvpx/vp9/encoder/vp9_quantize.h @@ -37,9 +37,6 @@ typedef struct { DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); } QUANTS; -void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, - const int16_t *scan, const int16_t *iscan); - struct VP9_COMP; struct VP9Common; diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c index 085297391..d9207f7a2 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/libvpx/vp9/encoder/vp9_ratectrl.c @@ -327,7 +327,7 
@@ static void update_buffer_level_postencode(VP9_COMP *cpi, rc->buffer_level = rc->bits_off_target; - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { update_layer_buffer_level_postencode(&cpi->svc, encoded_frame_size); } } @@ -910,7 +910,7 @@ static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) { active_worst_quality = curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] << 1; } else { - if (!rc->is_src_frame_alt_ref && + if (!rc->is_src_frame_alt_ref && !cpi->use_svc && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { active_worst_quality = curr_frame == 1 @@ -1871,7 +1871,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } } } else { - if ((cpi->use_svc && oxcf->rc_mode == VPX_CBR) || + if ((cpi->use_svc) || (!rc->is_src_frame_alt_ref && !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { rc->last_q[INTER_FRAME] = qindex; @@ -2021,6 +2021,11 @@ int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { (rc->baseline_gf_interval + af_ratio - 1) : ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) / (rc->baseline_gf_interval + af_ratio - 1); + // For SVC: refresh flags are used to define the pattern, so we can't + // use that for boosting the target size here. + // TODO(marpan): Consider adding internal boost on TL0 for VBR-SVC. + // For now just use the CBR logic for setting target size. + if (cpi->use_svc) target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); if (target > INT_MAX) target = INT_MAX; return vp9_rc_clamp_pframe_target_size(cpi, (int)target); } @@ -2147,7 +2152,7 @@ int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { } else { target = rc->avg_frame_bandwidth; } - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { // Note that for layers, avg_frame_bandwidth is the cumulative // per-frame-bandwidth. For the target size of this frame, use the // layer average frame size (i.e., non-cumulative per-frame-bw). 
@@ -2282,7 +2287,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) { cm->frame_type = KEY_FRAME; rc->source_alt_ref_active = 0; - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { if (cm->current_video_frame > 0) vp9_svc_reset_temporal_layers(cpi, 1); layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, svc->number_temporal_layers); @@ -2290,11 +2295,14 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); // Assumption here is that LAST_FRAME is being updated for a keyframe. // Thus no change in update flags. - target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); } } else { cm->frame_type = INTER_FRAME; - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { LAYER_CONTEXT *lc = &svc->layer_context[layer]; // Add condition current_video_frame > 0 for the case where first frame // is intra only followed by overlay/copy frame. In this case we don't @@ -2303,7 +2311,23 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { (svc->spatial_layer_id == 0 && cm->current_video_frame > 0) ? 0 : svc->layer_context[svc->temporal_layer_id].is_key_frame; - target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) { + target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); + } else { + double rate_err = 0.0; + rc->fac_active_worst_inter = 140; + rc->fac_active_worst_gf = 100; + if (rc->rolling_target_bits > 0) { + rate_err = + (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits; + if (rate_err < 1.0) + rc->fac_active_worst_inter = 120; + else if (rate_err > 2.0) + // Increase active_worst faster if rate fluctuation is high. 
+ rc->fac_active_worst_inter = 160; + } + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); + } } } @@ -2312,7 +2336,10 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { svc->layer_context[layer].is_key_frame == 1) { cm->frame_type = KEY_FRAME; cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); - target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); } // Set the buffer idx and refresh flags for key frames in simulcast mode. // Note the buffer slot for long-term reference is set below (line 2255), @@ -2397,7 +2424,10 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { } if (svc->set_intra_only_frame) { set_intra_only_frame(cpi); - target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); } // Overlay frame predicts from LAST (intra-only) if (svc->previous_frame_is_intra_only) cpi->ref_frame_flags |= VP9_LAST_FLAG; @@ -2584,13 +2614,12 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, const uint32_t pic_breadth = VPXMAX(cpi->common.width, cpi->common.height); int i; - for (i = LEVEL_1; i < LEVEL_MAX; ++i) { + for (i = 0; i < VP9_LEVELS; ++i) { if (vp9_level_defs[i].max_luma_picture_size >= pic_size && vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { if (rc->min_gf_interval <= (int)vp9_level_defs[i].min_altref_distance) { - rc->min_gf_interval = - (int)vp9_level_defs[i].min_altref_distance + 1; + rc->min_gf_interval = (int)vp9_level_defs[i].min_altref_distance; rc->max_gf_interval = VPXMAX(rc->max_gf_interval, rc->min_gf_interval); } diff --git a/libvpx/vp9/encoder/vp9_ratectrl.h b/libvpx/vp9/encoder/vp9_ratectrl.h index 83a12cde7..96a8fd3f1 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.h +++ 
b/libvpx/vp9/encoder/vp9_ratectrl.h @@ -211,6 +211,10 @@ typedef struct { // Flag to constrain golden frame interval on key frame frequency for 1 pass // VBR. int constrain_gf_key_freq_onepass_vbr; + + // The index of the current GOP. Start from zero. + // When a key frame is inserted, it resets to zero. + int gop_global_index; } RATE_CONTROL; struct VP9_COMP; diff --git a/libvpx/vp9/encoder/vp9_rd.c b/libvpx/vp9/encoder/vp9_rd.c index 9fa3ff186..58dd75b44 100644 --- a/libvpx/vp9/encoder/vp9_rd.c +++ b/libvpx/vp9/encoder/vp9_rd.c @@ -244,6 +244,12 @@ int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { // largest dc_quant is 21387, therefore rdmult should fit in int32_t int rdmult = q * q; + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) { + return cpi->ext_ratectrl.ext_rdmult; + } + // Make sure this function is floating point safe. vpx_clear_system_state(); @@ -287,6 +293,11 @@ static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) { int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { int rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex); + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) { + return cpi->ext_ratectrl.ext_rdmult; + } return modulate_rdmult(cpi, rdmult); } @@ -567,6 +578,12 @@ void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], } } +// Disable gcc 12.2 false positive warning. 
+// warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd, ENTROPY_CONTEXT t_above[16], @@ -604,6 +621,9 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, break; } } +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) { diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c index 0171a0572..a464ce38f 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libvpx/vp9/encoder/vp9_rdopt.c @@ -1108,6 +1108,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, xd->mi[0]->tx_size = TX_4X4; + assert(!x->skip_block); + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { for (mode = DC_PRED; mode <= TM_PRED; ++mode) { @@ -1135,7 +1137,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); - tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; xd->mi[0]->bmi[block].as_mode = mode; vp9_predict_intra_block(xd, 1, TX_4X4, mode, x->skip_encode ? 
src : dst, @@ -1148,7 +1153,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_highbd_fwht4x4(src_diff, coeff, 8); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, so->scan, so->iscan); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0); @@ -1166,7 +1173,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_highbd_fdct4x4(src_diff, coeff, 8); else vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, + eob, so->scan, so->iscan); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); distortion += vp9_highbd_block_error_dispatch( @@ -1236,7 +1245,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); - tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; xd->mi[0]->bmi[block].as_mode = mode; vp9_predict_intra_block(xd, 1, TX_4X4, mode, x->skip_encode ? src : dst, x->skip_encode ? 
src_stride : dst_stride, dst, @@ -1248,7 +1260,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fwht4x4(src_diff, coeff, 8); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + so->scan, so->iscan); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; @@ -1263,7 +1277,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fht4x4(src_diff, coeff, 8, tx_type); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + so->scan, so->iscan); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; @@ -1640,6 +1656,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, const int is_compound = has_second_ref(mi); const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; + assert(!x->skip_block); + for (ref = 0; ref < 1 + is_compound; ++ref) { const int bw = b_width_log2_lookup[BLOCK_8X8]; const int h = 4 * (i >> bw); @@ -1701,18 +1719,27 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd : 8; #endif int64_t ssz, rd, rd1, rd2; - tran_low_t *coeff; + tran_low_t *coeff, *qcoeff, *dqcoeff; + uint16_t *eob; int coeff_ctx; k += (idy * 2 + idx); coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]); coeff = BLOCK_OFFSET(p->coeff, k); + qcoeff = BLOCK_OFFSET(p->qcoeff, k); + dqcoeff = BLOCK_OFFSET(pd->dqcoeff, k); + eob = &p->eobs[k]; + x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), coeff, 8); - vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan); #if CONFIG_VP9_HIGHBITDEPTH + vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, + so->scan, so->iscan); thisdistortion += vp9_highbd_block_error_dispatch( coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd); #else + vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, p->quant_shift, + qcoeff, dqcoeff, pd->dequant, eob, so->scan, so->iscan); thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -3242,6 +3269,7 @@ int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) { // For two pass account for any formatting bars detected. if (cpi->oxcf.pass == 2) { TWO_PASS *twopass = &cpi->twopass; + vpx_clear_system_state(); // The inactive region is specified in MBs not mi units. // The image edge is in the following MB row. @@ -3269,6 +3297,7 @@ int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) { // For two pass account for any formatting bars detected. if (cpi->oxcf.pass == 2) { TWO_PASS *twopass = &cpi->twopass; + vpx_clear_system_state(); // The inactive region is specified in MBs not mi units. // The image edge is in the following MB row. 
@@ -3470,7 +3499,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } mode_skip_mask[INTRA_FRAME] |= - ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); + (uint16_t) ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0; diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c index a163297e6..d75488a8e 100644 --- a/libvpx/vp9/encoder/vp9_segmentation.c +++ b/libvpx/vp9/encoder/vp9_segmentation.c @@ -39,7 +39,7 @@ void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data, } void vp9_disable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { - seg->feature_mask[segment_id] &= ~(1 << feature_id); + seg->feature_mask[segment_id] &= ~(1u << feature_id); } void vp9_clear_segdata(struct segmentation *seg, int segment_id, diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libvpx/vp9/encoder/vp9_svc_layercontext.c index a57a70ab1..7e9435fb5 100644 --- a/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ b/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -290,7 +290,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, } static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) { - if (is_one_pass_cbr_svc(cpi)) + if (is_one_pass_svc(cpi)) return &cpi->svc.layer_context[cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id]; @@ -354,7 +354,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { cpi->alt_ref_source = lc->alt_ref_source; // Check if it is one_pass_cbr_svc mode and lc->speed > 0 (real-time mode // does not use speed = 0). 
- if (is_one_pass_cbr_svc(cpi) && lc->speed > 0) { + if (is_one_pass_svc(cpi) && lc->speed > 0) { cpi->oxcf.speed = lc->speed; } cpi->loopfilter_ctrl = lc->loopfilter_ctrl; @@ -754,7 +754,7 @@ void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG); } -int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { +int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) { int width = 0, height = 0; SVC *const svc = &cpi->svc; LAYER_CONTEXT *lc = NULL; @@ -894,6 +894,10 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { RATE_CONTROL *const lrc = &lc->rc; lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q); lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q); + if (cpi->fixed_qp_onepass) { + lrc->worst_quality = cpi->rc.worst_quality; + lrc->best_quality = cpi->rc.best_quality; + } } if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 && diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libvpx/vp9/encoder/vp9_svc_layercontext.h index b2d1d1b98..c7328cf57 100644 --- a/libvpx/vp9/encoder/vp9_svc_layercontext.h +++ b/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -255,7 +255,7 @@ int vp9_denoise_svc_non_key(struct VP9_COMP *const cpi); void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi); -int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi); +int vp9_one_pass_svc_start_layer(struct VP9_COMP *const cpi); void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi); diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c index 701bb8928..8af30c42a 100644 --- a/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -777,16 +777,16 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, // Assign higher weight to matching MB if it's error // score is lower. If not applying MC default behavior // is to weight all MBs equal. 
- blk_fw[0] = err < (thresh_low << THR_SHIFT) - ? 2 - : err < (thresh_high << THR_SHIFT) ? 1 : 0; + blk_fw[0] = err < (thresh_low << THR_SHIFT) ? 2 + : err < (thresh_high << THR_SHIFT) ? 1 + : 0; blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0]; } else { use_32x32 = 0; for (k = 0; k < 4; k++) - blk_fw[k] = blk_bestsme[k] < thresh_low - ? 2 - : blk_bestsme[k] < thresh_high ? 1 : 0; + blk_fw[k] = blk_bestsme[k] < thresh_low ? 2 + : blk_bestsme[k] < thresh_high ? 1 + : 0; } for (k = 0; k < 4; k++) { diff --git a/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c index 4fa24512c..a7f5117cf 100644 --- a/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c +++ b/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c @@ -191,13 +191,11 @@ static INLINE void highbd_read_chroma_dist_row_8( } static void vp9_highbd_apply_temporal_filter_luma_8( - const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, - int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, - int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, - int uv_pre_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum, - uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist, - const uint32_t *v_dist, const uint32_t *const *neighbors_first, + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_first, const uint32_t *const *neighbors_second, int top_weight, int bottom_weight) { const int rounding = (1 << strength) >> 1; @@ -256,17 +254,12 @@ static void vp9_highbd_apply_temporal_filter_luma_8( highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, y_accum); - y_src += y_src_stride; 
y_pre += y_pre_stride; y_count += y_pre_stride; y_accum += y_pre_stride; y_dist += DIST_STRIDE; - u_src += uv_src_stride; - u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; - v_pre += uv_pre_stride; v_dist += DIST_STRIDE; // Then all the rows except the last one @@ -300,11 +293,7 @@ static void vp9_highbd_apply_temporal_filter_luma_8( highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, &v_second); - u_src += uv_src_stride; - u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; - v_pre += uv_pre_stride; v_dist += DIST_STRIDE; } @@ -320,7 +309,6 @@ static void vp9_highbd_apply_temporal_filter_luma_8( highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, y_accum); - y_src += y_src_stride; y_pre += y_pre_stride; y_count += y_pre_stride; y_accum += y_pre_stride; @@ -364,13 +352,10 @@ static void vp9_highbd_apply_temporal_filter_luma_8( // Perform temporal filter for the luma component. static void vp9_highbd_apply_temporal_filter_luma( - const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, - int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, - int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, - int uv_pre_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, - uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist, - const uint32_t *u_dist, const uint32_t *v_dist) { + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { unsigned int blk_col = 0, uv_blk_col = 0; const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; const unsigned int mid_width = block_width >> 1, @@ -384,9 +369,7 @@ static void 
vp9_highbd_apply_temporal_filter_luma( neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; vp9_highbd_apply_temporal_filter_luma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, bottom_weight); @@ -399,13 +382,10 @@ static void vp9_highbd_apply_temporal_filter_luma( for (; blk_col < mid_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_highbd_apply_temporal_filter_luma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, - block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, - y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, - v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, - bottom_weight); + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); } if (!use_whole_blk) { @@ -417,21 +397,16 @@ static void vp9_highbd_apply_temporal_filter_luma( for (; blk_col < last_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_highbd_apply_temporal_filter_luma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, - u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, - block_height, ss_x, ss_y, strength, 
use_whole_blk, y_accum + blk_col, - y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, - v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, - bottom_weight); + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); } // Right neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; vp9_highbd_apply_temporal_filter_luma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, bottom_weight); @@ -491,13 +466,11 @@ static INLINE void highbd_add_luma_dist_to_8_chroma_mod( // blk_fw as an array of size 4 for the weights for each of the 4 subblocks, // else use top_weight for top half, and bottom weight for bottom half. 
static void vp9_highbd_apply_temporal_filter_chroma_8( - const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, - int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, - int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, - int uv_pre_stride, unsigned int uv_block_width, - unsigned int uv_block_height, int ss_x, int ss_y, int strength, - uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, - const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int uv_block_width, unsigned int uv_block_height, int ss_x, + int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist, const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, int top_weight, int bottom_weight, const int *blk_fw) { const int rounding = (1 << strength) >> 1; @@ -565,10 +538,8 @@ static void vp9_highbd_apply_temporal_filter_chroma_8( highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, v_accum); - u_src += uv_src_stride; u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; v_pre += uv_pre_stride; v_dist += DIST_STRIDE; u_count += uv_pre_stride; @@ -576,8 +547,6 @@ static void vp9_highbd_apply_temporal_filter_chroma_8( v_count += uv_pre_stride; v_accum += uv_pre_stride; - y_src += y_src_stride * (1 + ss_y); - y_pre += y_pre_stride * (1 + ss_y); y_dist += DIST_STRIDE * (1 + ss_y); // Then all the rows except the last one @@ -649,10 +618,8 @@ static void vp9_highbd_apply_temporal_filter_chroma_8( highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, v_accum); - u_src += uv_src_stride; u_pre += uv_pre_stride; u_dist += DIST_STRIDE; - v_src += uv_src_stride; v_pre += uv_pre_stride; v_dist += DIST_STRIDE; u_count += uv_pre_stride; @@ -660,8 +627,6 @@ static void 
vp9_highbd_apply_temporal_filter_chroma_8( v_count += uv_pre_stride; v_accum += uv_pre_stride; - y_src += y_src_stride * (1 + ss_y); - y_pre += y_pre_stride * (1 + ss_y); y_dist += DIST_STRIDE * (1 + ss_y); } @@ -720,12 +685,10 @@ static void vp9_highbd_apply_temporal_filter_chroma_8( // Perform temporal filter for the chroma components. static void vp9_highbd_apply_temporal_filter_chroma( - const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, - int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, - int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, - int uv_pre_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, - uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { const unsigned int uv_width = block_width >> ss_x, uv_height = block_height >> ss_y; @@ -755,8 +718,6 @@ static void vp9_highbd_apply_temporal_filter_chroma( if (use_whole_blk) { vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, @@ -764,8 +725,6 @@ static void vp9_highbd_apply_temporal_filter_chroma( neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); } else { vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, 
uv_src_stride, u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, @@ -789,13 +748,11 @@ static void vp9_highbd_apply_temporal_filter_chroma( } vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, - strength, u_accum + uv_blk_col, u_count + uv_blk_col, - v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, - u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, - top_weight, bottom_weight, NULL); + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); blk_col += blk_col_step; uv_blk_col += uv_blk_col_step; @@ -812,8 +769,6 @@ static void vp9_highbd_apply_temporal_filter_chroma( for (; uv_blk_col < uv_mid_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, @@ -830,8 +785,6 @@ static void vp9_highbd_apply_temporal_filter_chroma( for (; uv_blk_col < uv_last_width; blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + 
uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, @@ -849,13 +802,11 @@ static void vp9_highbd_apply_temporal_filter_chroma( } vp9_highbd_apply_temporal_filter_chroma_8( - y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, - u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, - v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, - strength, u_accum + uv_blk_col, u_count + uv_blk_col, - v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, - u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, - top_weight, bottom_weight, NULL); + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); } void vp9_highbd_apply_temporal_filter_sse4_1( @@ -929,14 +880,12 @@ void vp9_highbd_apply_temporal_filter_sse4_1( u_dist_ptr = u_dist + 1; v_dist_ptr = v_dist + 1; - vp9_highbd_apply_temporal_filter_luma( - y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, - u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, - strength, blk_fw, use_whole_blk, y_accum, y_count, y_dist_ptr, u_dist_ptr, - v_dist_ptr); + vp9_highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, + block_height, ss_x, ss_y, strength, + blk_fw, use_whole_blk, y_accum, y_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); vp9_highbd_apply_temporal_filter_chroma( - y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); diff --git 
a/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c index 2188903b1..e9943447f 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -111,7 +111,7 @@ static void fadst4_sse2(__m128i *in) { const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u[8], v[8]; __m128i in7 = _mm_add_epi16(in[0], in[1]); @@ -424,7 +424,7 @@ static void fadst8_sse2(__m128i *in) { const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); + const __m128i k__const_0 = _mm_setzero_si128(); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; @@ -1056,7 +1056,7 @@ static void fadst16_8col(__m128i *in) { const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); u[0] = _mm_unpacklo_epi16(in[15], in[0]); u[1] = _mm_unpackhi_epi16(in[15], in[0]); diff --git a/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index fcf50eb2a..0e04a2f41 100644 --- a/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -76,9 +76,9 @@ int 
vp9_diamond_search_sad_avx(const MACROBLOCK *x, int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); - const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int); + const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); - const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int); + const __m128i v_min_mv_w = _mm_set1_epi32((int)minmv.as_int); const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); @@ -96,14 +96,14 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); - const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int); + const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int); const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; - __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int); + __m128i v_bmv_w = _mm_set1_epi32((int)bmv.as_int); const int what_stride = x->plane[0].src.stride; const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; @@ -300,7 +300,7 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, bmv = new_bmv; best_address = new_best_address; - v_bmv_w = _mm_set1_epi32(bmv.as_int); + v_bmv_w = _mm_set1_epi32((int)bmv.as_int); #if VPX_ARCH_X86_64 v_ba_q = _mm_set1_epi64x((intptr_t)best_address); #else diff --git a/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c index 7685e7bc3..bf0e8b121 100644 --- a/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -754,8 +754,8 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = 
dst->y_crop_height; - const int dst_uv_w = dst_w / 2; - const int dst_uv_h = dst_h / 2; + const int dst_uv_w = dst->uv_crop_width; + const int dst_uv_h = dst->uv_crop_height; int scaled = 0; // phase_scaler is usually 0 or 8. diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c b/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c index db18b1a7a..da285be8e 100644 --- a/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c @@ -18,7 +18,7 @@ #include "vpx_dsp/x86/quantize_sse2.h" // Zero fill 8 positions in the output buffer. -static INLINE void store_zero_tran_low(tran_low_t *a) { +static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) { const __m256i zero = _mm256_setzero_si256(); #if CONFIG_VP9_HIGHBITDEPTH _mm256_storeu_si256((__m256i *)(a), zero); @@ -28,22 +28,73 @@ static INLINE void store_zero_tran_low(tran_low_t *a) { #endif } -static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr, - __m256i *coeff256) { - const __m256i iscan = _mm256_loadu_si256(iscan_ptr); - const __m256i zero256 = _mm256_setzero_si256(); +static VPX_FORCE_INLINE void load_fp_values_avx2( + const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { + *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, + __m256i v_eobmax, + __m256i v_mask) { #if CONFIG_VP9_HIGHBITDEPTH - // The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as - // B1 A1 B0 A0. Shuffle to B1 B0 A1 A0 in order to scan eob correctly. 
- const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8); - const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256); + const __m256i v_iscan = _mm256_permute4x64_epi64( + _mm256_loadu_si256((const __m256i *)iscan), 0xD8); #else - const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256); + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #endif - const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256); - // Add one to convert from indices to counts - const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0); - return _mm256_and_si256(iscan_plus_one, nzero_coeff0); + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) { + const __m256i eob_lo = eob256; + // Copy upper 128 to lower 128 + const __m256i eob_hi = _mm256_permute2x128_si256(eob256, eob256, 0X81); + __m256i eob = _mm256_max_epi16(eob_lo, eob_hi); + __m256i eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); +#if defined(_MSC_VER) && (_MSC_VER < 1910) + return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff; +#else + return (uint16_t)_mm256_extract_epi16(eob, 0); +#endif +} + +static VPX_FORCE_INLINE void quantize_fp_16( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const int32_t nzflag = + _mm256_movemask_epi8(_mm256_cmpgt_epi16(abs_coeff, *thr)); + + if (nzflag) { + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); + const 
__m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + } else { + store_zero_tran_low(qcoeff_ptr); + store_zero_tran_low(dqcoeff_ptr); + } } void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -51,10 +102,114 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - __m128i eob; - __m256i round256, quant256, dequant256; - __m256i eob256, thr256; + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); + (void)scan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + thr = _mm256_setzero_si256(); + + quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += 8 * 2; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_srai_epi16(dequant, 1); + + // AC only loop + while (n_coeffs < 0) { + quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += 8 * 2; + } + + *eob_ptr = get_max_eob(eob_max); +} + +// Enable this flag when matching the optimized code to +// 
vp9_quantize_fp_32x32_c(). Disabled, the optimized code will match the +// existing ssse3 code and quantize_fp_32x32_nz_c(). +// +// #define MATCH_VP9_QUANTIZE_FP_32X32_C + +#ifndef MATCH_VP9_QUANTIZE_FP_32X32_C +static VPX_FORCE_INLINE void quantize_fp_32x32_16_no_nzflag( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i abs_dqcoeff = + _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1); + const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + (void)thr; +} +#endif + +static VPX_FORCE_INLINE void quantize_fp_32x32_16( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i thr_mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + const int32_t nzflag = _mm256_movemask_epi8(thr_mask); + + if (nzflag) { +#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *round), thr_mask); +#else + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); +#endif + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const 
__m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i abs_dqcoeff = + _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1); + const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + } else { + store_zero_tran_low(qcoeff_ptr); + store_zero_tran_low(dqcoeff_ptr); + } +} + +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); (void)scan; coeff_ptr += n_coeffs; @@ -63,74 +218,223 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; + // Setup global values + load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + thr = _mm256_srli_epi16(dequant, 2); + quant = _mm256_slli_epi16(quant, 1); { - __m256i coeff256; - - // Setup global values - { - const __m128i round = _mm_load_si128((const __m128i *)round_ptr); - const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr); - const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr); - round256 = _mm256_castsi128_si256(round); - round256 = _mm256_permute4x64_epi64(round256, 0x54); - - quant256 = _mm256_castsi128_si256(quant); - quant256 = _mm256_permute4x64_epi64(quant256, 0x54); - - dequant256 = _mm256_castsi128_si256(dequant); - dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54); - } - - { - __m256i qcoeff256; - __m256i qtmp256; - coeff256 = load_tran_low(coeff_ptr + n_coeffs); - qcoeff256 = _mm256_abs_epi16(coeff256); - qcoeff256 = 
_mm256_adds_epi16(qcoeff256, round256); - qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); - qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); - store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); - coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); - store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); - } - - eob256 = scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256); - n_coeffs += 8 * 2; + const __m256i rnd = _mm256_set1_epi16((int16_t)1); + round = _mm256_add_epi16(round, rnd); + round = _mm256_srai_epi16(round, 1); } - // remove dc constants - dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31); - quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); - round256 = _mm256_permute2x128_si256(round256, round256, 0x31); +#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C + // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when + // calculating the zbin mask. + thr = _mm256_sub_epi16(thr, _mm256_set1_epi16(1)); + quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); +#else + quantize_fp_32x32_16_no_nzflag( + &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs, + qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max); +#endif + + n_coeffs += 8 * 2; - thr256 = _mm256_srai_epi16(dequant256, 1); + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_permute2x128_si256(thr, thr, 0x31); // AC only loop while (n_coeffs < 0) { - __m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs); - __m256i qcoeff256 = _mm256_abs_epi16(coeff256); - int32_t nzflag = - _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256)); - - if (nzflag) { - __m256i qtmp256; - qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); - qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); 
- qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); - store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); - coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); - store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); - eob256 = _mm256_max_epi16( - eob256, scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256)); - } else { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - } + quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); n_coeffs += 8 * 2; } - eob = _mm_max_epi16(_mm256_castsi256_si128(eob256), - _mm256_extracti128_si256(eob256, 1)); + *eob_ptr = get_max_eob(eob_max); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, + const __m256i *y, + int log_scale) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) { + const __m128i v = _mm_load_si128((const __m128i *)val_ptr); + const __m128i zero = _mm_setzero_si128(); + const __m128i dc = _mm_unpacklo_epi16(v, zero); + const __m128i ac = _mm_unpackhi_epi16(v, zero); + return _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static VPX_FORCE_INLINE void highbd_load_fp_values( + const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { + *round = highbd_init_256(round_ptr); + *quant = 
highbd_init_256(quant_ptr); + *dequant = highbd_init_256(dequant_ptr); +} + +static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( + const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) { + const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); + const __m256i packed_nz_mask_perm = + _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); + const __m256i iscan = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); + const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm); + return _mm256_max_epi16(eobmax, nz_iscan); +} + +static VPX_FORCE_INLINE void highbd_quantize_fp( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i tmp_rnd = _mm256_add_epi32(abs_coeff, *round); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0); + const __m256i abs_dq = _mm256_mullo_epi32(abs_q, *dequant); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)qcoeff_ptr, q); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq); + + *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask); +} + +void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int step = 8; + __m256i round, quant, dequant; + __m256i eob_max = _mm256_setzero_si256(); + (void)scan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; 
+ n_coeffs = -n_coeffs; + + // Setup global values + highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + + highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += step; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + + // AC only loop + while (n_coeffs < 0) { + highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += step; + } + + *eob_ptr = get_max_eob(eob_max); +} + +static VPX_FORCE_INLINE void highbd_quantize_fp_32x32( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr); + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0); + const __m256i abs_dq = + _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)qcoeff_ptr, q); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq); + + *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask); +} + +void vp9_highbd_quantize_fp_32x32_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t 
*qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + const int step = 8; + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); + (void)scan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, + &dequant); + thr = _mm256_srli_epi32(dequant, 2); + // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when + // calculating the zbin mask. + thr = _mm256_sub_epi32(thr, _mm256_set1_epi32(1)); + quant = _mm256_slli_epi32(quant, 1); + round = _mm256_srai_epi32(_mm256_add_epi32(round, _mm256_set1_epi32(1)), 1); + + highbd_quantize_fp_32x32(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += step; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_permute2x128_si256(thr, thr, 0x31); + + // AC only loop + while (n_coeffs < 0) { + highbd_quantize_fp_32x32( + &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs, + qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += step; + } - *eob_ptr = accumulate_eob(eob); + *eob_ptr = get_max_eob(eob_max); } +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c index 4bcadaa6a..c87723443 100644 --- a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c @@ -16,184 +16,110 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" 
void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - __m128i zero; + const __m128i zero = _mm_setzero_si128(); __m128i thr; int nzflag; - __m128i eob; + int index = 16; __m128i round, quant, dequant; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i eob; (void)scan; - coeff_ptr += n_coeffs; - iscan += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - - { - __m128i coeff0, coeff1; - - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } + // Setup global values. + load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + // Poor man's abs(). 
+ coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } + // Reinsert signs. 
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); thr = _mm_srai_epi16(dequant, 1); - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 
= _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } else { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - } - } + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + // Poor man's abs(). 
+ coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); + __m128i eob0; + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. 
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); } - n_coeffs += 8 * 2; - } - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); + index += 16; } + + *eob_ptr = accumulate_eob(eob); } diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c new file mode 100644 index 000000000..d35004e37 --- /dev/null +++ b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <tmmintrin.h> + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" + +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + __m128i thr; + int nzflag; + int index = 16; + __m128i round, quant, dequant; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i eob; + + (void)scan; + + // Setup global values. + load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + __m128i eob0; + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); + } + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one_s16 = _mm_set1_epi16(1); + __m128i thr; + int nzflag; + int index = 16; + __m128i round, quant, dequant; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i eob; + + (void)scan; + + // Setup global values. + load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + // The 32x32 halves round. 
+ round = _mm_add_epi16(round, one_s16); + round = _mm_srli_epi16(round, 1); + + // The 16x16 shifts by 16, the 32x32 shifts by 15. We want to use pmulhw so + // upshift quant to account for this. + quant = _mm_slli_epi16(quant, 1); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + // Get the abs value of qcoeff again so we can use shifts for division. + qcoeff0 = _mm_abs_epi16(qcoeff0); + qcoeff1 = _mm_abs_epi16(qcoeff1); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + // Divide by 2. + qcoeff0 = _mm_srli_epi16(qcoeff0, 1); + qcoeff1 = _mm_srli_epi16(qcoeff1, 1); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + thr = _mm_srai_epi16(dequant, 2); + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + // Get the abs value of qcoeff again so we can use shifts for division. + qcoeff0 = _mm_abs_epi16(qcoeff0); + qcoeff1 = _mm_abs_epi16(qcoeff1); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + // Divide by 2. + qcoeff0 = _mm_srli_epi16(qcoeff0, 1); + qcoeff1 = _mm_srli_epi16(qcoeff1, 1); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); + } + + if (nzflag) { + const __m128i eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm deleted file mode 100644 index 680acfec6..000000000 --- a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ /dev/null @@ -1,178 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. 
All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp9 - -%include "third_party/x86inc/x86inc.asm" -%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" - -SECTION_RODATA -pw_1: times 8 dw 1 - -SECTION .text - -%macro QUANTIZE_FP 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, round, quant, \ - qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - - ; actual quantize loop - setup pointers, rounders, etc. - movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant - mov r2, dequantmp -%ifidn %1, fp_32x32 - pcmpeqw m5, m5 - psrlw m5, 15 - paddw m1, m5 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - mova m3, [r2q] ; m3 = dequant - mov r3, qcoeffmp - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, fp_32x32 - psllw m2, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - - INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq - lea r5q, [r5q+ncoeffq*2] - INCREMENT_ELEMENTS_TRAN_LOW r3q, ncoeffq - INCREMENT_ELEMENTS_TRAN_LOW r4q, ncoeffq - neg ncoeffq - - ; get DC and first 15 AC coeffs - LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i] - LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i] - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpeqw m7, m7 - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - STORE_TRAN_LOW 8, r3q, ncoeffq, 6, 11, 12 - STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12 -%ifidn %1, fp_32x32 - pabsw m8, m8 - pabsw m13, 
m13 -%endif - pmullw m8, m3 ; r4[i] = r3[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; r4[i] = r3[i] * q -%ifidn %1, fp_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 - psrlw m0, m3, 2 -%else - psrlw m0, m3, 1 -%endif - STORE_TRAN_LOW 8, r4q, ncoeffq, 6, 11, 12 - STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12 - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m7 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - jz .accumulate_eob - -.ac_only_loop: - LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i] - LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i] - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - - pcmpgtw m7, m6, m0 - pcmpgtw m12, m11, m0 - pmovmskb r6d, m7 - pmovmskb r2d, m12 - - or r6, r2 - jz .skip_iter - - pcmpeqw m7, m7 - - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - STORE_TRAN_LOW 14, r3q, ncoeffq, 6, 11, 12 - STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12 -%ifidn %1, fp_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; r4[i] = r3[i] * q - pmullw m13, m3 ; r4[i] = r3[i] * q -%ifidn %1, fp_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif - STORE_TRAN_LOW 14, r4q, ncoeffq, 6, 11, 12 - STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12 - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m7 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, 
mmsize - jl .ac_only_loop - - jmp .accumulate_eob -.skip_iter: - STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq - STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8 - STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq - STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8 - add ncoeffq, mmsize - jl .ac_only_loop - -.accumulate_eob: - ; horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - pextrw r6, m8, 0 - mov [r2], r6w - RET -%endmacro - -INIT_XMM ssse3 -QUANTIZE_FP fp, 7 -QUANTIZE_FP fp_32x32, 7 diff --git a/libvpx/vp9/ratectrl_rtc.cc b/libvpx/vp9/ratectrl_rtc.cc index f4d7f7e9e..02e50a857 100644 --- a/libvpx/vp9/ratectrl_rtc.cc +++ b/libvpx/vp9/ratectrl_rtc.cc @@ -158,6 +158,8 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { } vp9_set_mb_mi(cm, cm->width, cm->height); cm->frame_type = frame_params.frame_type; + // This is needed to ensure key frame does not get unset in rc_get_svc_params. + cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0; cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 
1 : 0; cpi_->sf.use_nonrd_pick_mode = 1; if (cpi_->svc.number_spatial_layers == 1 && @@ -205,12 +207,16 @@ int VP9RateControlRTC::GetLoopfilterLevel() const { return lf->filter_level; } -signed char *VP9RateControlRTC::GetCyclicRefreshMap() const { - return cpi_->cyclic_refresh->map; -} +bool VP9RateControlRTC::GetSegmentationData( + VP9SegmentationData *segmentation_data) const { + if (!cpi_->cyclic_refresh->apply_cyclic_refresh) return false; -int *VP9RateControlRTC::GetDeltaQ() const { - return cpi_->cyclic_refresh->qindex_delta; + segmentation_data->segmentation_map = cpi_->segmentation_map; + segmentation_data->segmentation_map_size = + cpi_->common.mi_cols * cpi_->common.mi_rows; + segmentation_data->delta_q = cpi_->cyclic_refresh->qindex_delta; + segmentation_data->delta_q_size = 3u; + return true; } void VP9RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { diff --git a/libvpx/vp9/ratectrl_rtc.h b/libvpx/vp9/ratectrl_rtc.h index d2b9417ae..b209e4db6 100644 --- a/libvpx/vp9/ratectrl_rtc.h +++ b/libvpx/vp9/ratectrl_rtc.h @@ -58,6 +58,13 @@ struct VP9FrameParamsQpRTC { int temporal_layer_id; }; +struct VP9SegmentationData { + const uint8_t *segmentation_map; + size_t segmentation_map_size; + const int *delta_q; + size_t delta_q_size; +}; + // This interface allows using VP9 real-time rate control without initializing // the encoder. To use this interface, you need to link with libvpxrc.a. 
// @@ -110,8 +117,7 @@ class VP9RateControlRTC { // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; int GetLoopfilterLevel() const; - signed char *GetCyclicRefreshMap() const; - int *GetDeltaQ() const; + bool GetSegmentationData(VP9SegmentationData *segmentation_data) const; void ComputeQP(const VP9FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame void PostEncodeUpdate(uint64_t encoded_frame_size); diff --git a/libvpx/vp9/simple_encode.cc b/libvpx/vp9/simple_encode.cc index 654699e1b..f42912d35 100644 --- a/libvpx/vp9/simple_encode.cc +++ b/libvpx/vp9/simple_encode.cc @@ -744,10 +744,12 @@ static void UpdateGroupOfPicture(const VP9_COMP *cpi, int start_coding_index, } #define SET_STRUCT_VALUE(config, structure, ret, field) \ - if (strcmp(config.name, #field) == 0) { \ - structure->field = atoi(config.value); \ - ret = 1; \ - } + do { \ + if (strcmp(config.name, #field) == 0) { \ + structure->field = atoi(config.value); \ + ret = 1; \ + } \ + } while (false) static void UpdateEncodeConfig(const EncodeConfig &config, VP9EncoderConfig *oxcf) { diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c index 05ac9e169..dee175dc0 100644 --- a/libvpx/vp9/vp9_cx_iface.c +++ b/libvpx/vp9/vp9_cx_iface.c @@ -170,8 +170,8 @@ static vpx_codec_err_t update_error_state( static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg, const struct vp9_extracfg *extra_cfg) { - RANGE_CHECK(cfg, g_w, 1, 65535); // 16 bits available - RANGE_CHECK(cfg, g_h, 1, 65535); // 16 bits available + RANGE_CHECK(cfg, g_w, 1, 65536); // 16 bits available + RANGE_CHECK(cfg, g_h, 1, 65536); // 16 bits available RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000); RANGE_CHECK_HI(cfg, g_profile, 3); @@ -1014,6 +1014,7 @@ static vpx_codec_err_t ctrl_set_aq_mode(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg 
extra_cfg = ctx->extra_cfg; extra_cfg.aq_mode = CAST(VP9E_SET_AQ_MODE, args); + if (ctx->cpi->fixed_qp_onepass) extra_cfg.aq_mode = 0; return update_extra_cfg(ctx, &extra_cfg); } @@ -1357,8 +1358,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, unsigned int lib_flags = 0; YV12_BUFFER_CONFIG sd; int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts); - int64_t dst_end_time_stamp = - timebase_units_to_ticks(timestamp_ratio, pts + duration); size_t size, cx_data_sz; unsigned char *cx_data; @@ -1369,6 +1368,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1; if (img != NULL) { + const int64_t dst_end_time_stamp = + timebase_units_to_ticks(timestamp_ratio, pts + duration); res = image2yuvconfig(img, &sd); // Store the original flags in to the frame buffer. Will extract the @@ -1405,6 +1406,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // compute first pass stats if (img) { int ret; + int64_t dst_end_time_stamp; vpx_codec_cx_pkt_t fps_pkt; ENCODE_FRAME_RESULT encode_frame_result; vp9_init_encode_frame_result(&encode_frame_result); @@ -1430,6 +1432,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, #endif // !CONFIG_REALTIME_ONLY } else { ENCODE_FRAME_RESULT encode_frame_result; + int64_t dst_end_time_stamp; vp9_init_encode_frame_result(&encode_frame_result); while (cx_data_sz >= ctx->cx_data_sz / 2 && -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data, @@ -1525,9 +1528,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data += size; cx_data_sz -= size; - if (is_one_pass_cbr_svc(cpi) && - (cpi->svc.spatial_layer_id == - cpi->svc.number_spatial_layers - 1)) { + if (is_one_pass_svc(cpi) && (cpi->svc.spatial_layer_id == + cpi->svc.number_spatial_layers - 1)) { // Encoded all spatial layers; exit loop. 
break; } @@ -1950,6 +1952,24 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_quantizer_one_pass(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const int qp = va_arg(args, int); + vpx_codec_enc_cfg_t *cfg = &ctx->cfg; + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + vpx_codec_err_t res; + + if (qp < 0 || qp > 63) return VPX_CODEC_INVALID_PARAM; + + cfg->rc_min_quantizer = cfg->rc_max_quantizer = qp; + extra_cfg.aq_mode = 0; + cpi->fixed_qp_onepass = 1; + + res = update_extra_cfg(ctx, &extra_cfg); + return res; +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -2004,6 +2024,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter }, { VP9E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl }, { VP9E_SET_EXTERNAL_RATE_CONTROL, ctrl_set_external_rate_control }, + { VP9E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass }, // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c index 3c42c7dfe..bdfe21793 100644 --- a/libvpx/vp9/vp9_dx_iface.c +++ b/libvpx/vp9/vp9_dx_iface.c @@ -334,7 +334,6 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { const uint8_t *data_start = data; - const uint8_t *const data_end = data + data_sz; vpx_codec_err_t res; uint32_t frame_sizes[8]; int frame_count; @@ -362,6 +361,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, // Decode in serial mode. 
if (frame_count > 0) { + const uint8_t *const data_end = data + data_sz; int i; for (i = 0; i < frame_count; ++i) { @@ -379,6 +379,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, data_start += frame_size; } } else { + const uint8_t *const data_end = data + data_sz; while (data_start < data_end) { const uint32_t frame_size = (uint32_t)(data_end - data_start); const vpx_codec_err_t res = diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk index 92a7fddb9..9072628f2 100644 --- a/libvpx/vp9/vp9cx.mk +++ b/libvpx/vp9/vp9cx.mk @@ -111,8 +111,10 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c @@ -121,10 +123,6 @@ endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm -ifeq ($(VPX_ARCH_X86_64),yes) -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm -endif - VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c diff --git a/libvpx/vpx/vp8cx.h b/libvpx/vpx/vp8cx.h index a61238cb1..e0b679fbb 100644 --- a/libvpx/vpx/vp8cx.h +++ b/libvpx/vpx/vp8cx.h @@ -757,6 +757,16 @@ enum vp8e_enc_control_id { * Supported in codecs: VP8 */ VP8E_SET_RTC_EXTERNAL_RATECTRL, + + /*!\brief Codec control to set quantizer for the next frame. 
+ * + * This will turn off cyclic refresh. Only applicable to 1-pass without + * spatial layers. + * + * Supported in codecs: VP9 + * + */ + VP9E_SET_QUANTIZER_ONE_PASS, }; /*!\brief vpx 1-D scaling mode @@ -1085,6 +1095,8 @@ VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *) #define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) #define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL +VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int) +#define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS /*!\endcond */ /*! @} - end defgroup vp8_encoder */ diff --git a/libvpx/vpx/vpx_encoder.h b/libvpx/vpx/vpx_encoder.h index 21254bb54..efaf5ef36 100644 --- a/libvpx/vpx/vpx_encoder.h +++ b/libvpx/vpx/vpx_encoder.h @@ -115,14 +115,14 @@ typedef int64_t vpx_codec_pts_t; * support frame types that are codec specific (MPEG-1 D-frames for example) */ typedef uint32_t vpx_codec_frame_flags_t; -#define VPX_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */ +#define VPX_FRAME_IS_KEY 0x1u /**< frame is the start of a GOP */ /*!\brief frame can be dropped without affecting the stream (no future frame * depends on this one) */ -#define VPX_FRAME_IS_DROPPABLE 0x2 +#define VPX_FRAME_IS_DROPPABLE 0x2u /*!\brief frame should be decoded but will not be shown */ -#define VPX_FRAME_IS_INVISIBLE 0x4 +#define VPX_FRAME_IS_INVISIBLE 0x4u /*!\brief this is a fragment of the encoded frame */ -#define VPX_FRAME_IS_FRAGMENT 0x8 +#define VPX_FRAME_IS_FRAGMENT 0x8u /*!\brief Error Resilient flags * @@ -132,12 +132,13 @@ typedef uint32_t vpx_codec_frame_flags_t; */ typedef uint32_t vpx_codec_er_flags_t; /*!\brief Improve resiliency against losses of whole frames */ -#define VPX_ERROR_RESILIENT_DEFAULT 0x1 +#define VPX_ERROR_RESILIENT_DEFAULT 0x1u /*!\brief The frame partitions are independently decodable by the bool decoder, * meaning that partitions can be decoded even though earlier partitions have * been lost. 
Note that intra prediction is still done over the partition - * boundary. */ -#define VPX_ERROR_RESILIENT_PARTITIONS 0x2 + * boundary. + * \note This is only supported by VP8.*/ +#define VPX_ERROR_RESILIENT_PARTITIONS 0x2u /*!\brief Encoder output packet variants * diff --git a/libvpx/vpx/vpx_ext_ratectrl.h b/libvpx/vpx/vpx_ext_ratectrl.h index a193e5595..3c5fc8cfc 100644 --- a/libvpx/vpx/vpx_ext_ratectrl.h +++ b/libvpx/vpx/vpx_ext_ratectrl.h @@ -25,7 +25,27 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (1) +#define VPX_EXT_RATECTRL_ABI_VERSION (6) + +/*!\brief The control type of the inference API. + * In VPX_RC_QP mode, the external rate control model determines the + * quantization parameter (QP) for each frame. + * In VPX_RC_GOP mode, the external rate control model determines the + * group of picture (GOP) of the video sequence. + * In VPX_RC_RDMULT mode, the external rate control model determines the + * rate-distortion multiplier (rdmult) for the current frame. + * In VPX_RC_GOP_QP mode, the external rate control model determines + * both the QP and the GOP. + * In VPX_RC_GOP_QP_RDMULT mode, the external rate control model determines + * the QP, GOP and the rdmult. + */ +typedef enum vpx_rc_type { + VPX_RC_QP = 1 << 0, + VPX_RC_GOP = 1 << 1, + VPX_RC_RDMULT = 1 << 2, + VPX_RC_GOP_QP = VPX_RC_QP | VPX_RC_GOP, + VPX_RC_GOP_QP_RDMULT = VPX_RC_QP | VPX_RC_GOP | VPX_RC_RDMULT +} vpx_rc_type_t; /*!\brief Abstract rate control model handler * @@ -34,11 +54,27 @@ extern "C" { */ typedef void *vpx_rc_model_t; +/*!\brief A reserved value for the q index. + * If the external rate control model returns this value, + * the encoder will use the default q selected by libvpx's rate control + * system. + */ +#define VPX_DEFAULT_Q -1 + +/*!\brief A reserved value for the rdmult. 
+ * If the external rate control model returns this value, + * the encoder will use the default rdmult selected by libvpx's rate control + * system. + */ +#define VPX_DEFAULT_RDMULT -1 + /*!\brief Encode frame decision made by the external rate control model * * The encoder will receive the decision from the external rate control model * through get_encodeframe_decision() defined in vpx_rc_funcs_t. * + * If q_index = VPX_DEFAULT_Q, the encoder will use libvpx's default q. + * * If max_frame_size = 0, the encoding ignores max frame size limit. * If max_frame_size = -1, the encoding uses VP9's max frame size as the limit. * If the encoded frame size is larger than max_frame_size, the frame is @@ -67,7 +103,7 @@ typedef struct vpx_rc_encodeframe_info { int show_index; /**< display index, starts from zero*/ int coding_index; /**< coding index, starts from zero*/ /*! - * index in group of picture, starts from zero. + * index of the current frame in this group of picture, starts from zero. */ int gop_index; int ref_frame_coding_indexes[3]; /**< three reference frames' coding indices*/ @@ -77,6 +113,14 @@ typedef struct vpx_rc_encodeframe_info { * 1: Valid */ int ref_frame_valid_list[3]; + /*! + * The length of the current GOP. + */ + int gop_size; + /*! + * Whether the current GOP uses an alt ref. + */ + int use_alt_ref; } vpx_rc_encodeframe_info_t; /*!\brief Frame coding result @@ -258,6 +302,84 @@ typedef struct vpx_rc_config { int frame_rate_den; /**< denominator of frame rate */ } vpx_rc_config_t; +/*!\brief Information passed to the external rate control model to + * help make GOP decisions. + */ +typedef struct vpx_rc_gop_info { + /*! + * Minimum allowed gf interval, fixed for the whole clip. + * Note that it will be modified to match vp9's level constraints + * in the encoder. + * The level constraint is defined in vp9_encoder.c: + * const Vp9LevelSpec vp9_level_defs[VP9_LEVELS]. + */ + int min_gf_interval; + /*! 
+ * Maximum allowed gf interval, fixed for the whole clip. + */ + int max_gf_interval; + /*! + * Minimum allowed gf interval for the current GOP, determined + * by the encoder. + */ + int active_min_gf_interval; + /*! + * Maximum allowed gf interval for the current GOP, determined + * by the encoder. + */ + int active_max_gf_interval; + /*! + * Whether to allow the use of alt ref, determined by the encoder. + * It is fixed for the entire encode. + * See function "is_altref_enabled" in vp9_encoder.h. + */ + int allow_alt_ref; + /*! + * Is the current frame a key frame. + */ + int is_key_frame; + /*! + * Does the previous gop use alt ref or not. + */ + int last_gop_use_alt_ref; + /*! + * Current frame distance to the last keyframe, e.g., if Nth frame is a key, + * then the value of the N+1 th frame is 1. + */ + int frames_since_key; + /*! + * Current frame distance to the next keyframe, e.g. if Nth frame is a key, + * then the value of frame N - 1 is 1. + */ + int frames_to_key; + /*! + * Number of lookahead source frames. + */ + int lag_in_frames; + /*! + * Display index (temporal stamp) of this frame in the whole clip, + * starts from zero. + */ + int show_index; + /*! + * Coding index of this frame in the whole clip, starts from zero. + */ + int coding_index; + /*! + * The index of the current gop, starts from zero, resets to zero + * when a keyframe is set. + */ + int gop_global_index; +} vpx_rc_gop_info_t; + +/*!\brief The decision made by the external rate control model to set the + * group of picture. 
+ */ +typedef struct vpx_rc_gop_decision { + int gop_coding_frames; /**< The number of frames of this GOP */ + int use_alt_ref; /**< Whether to use alt ref for this GOP */ +} vpx_rc_gop_decision_t; + /*!\brief Create an external rate control model callback prototype * * This callback is invoked by the encoder to create an external rate control @@ -310,6 +432,32 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_result_t *encode_frame_result); +/*!\brief Get the GOP structure from the external rate control model. + * + * This callback is invoked by the encoder to get GOP decisions from + * the external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] gop_info information collected from the encoder + * \param[out] gop_decision GOP decision from the model + */ +typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info, + vpx_rc_gop_decision_t *gop_decision); + +/*!\brief Get the frame rdmult from the external rate control model. + * + * This callback is invoked by the encoder to get rdmult from + * the external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] frame_info information collected from the encoder + * \param[out] rdmult frame rate-distortion multiplier from the model + */ +typedef vpx_rc_status_t (*vpx_rc_get_frame_rdmult_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *frame_info, + int *rdmult); + /*!\brief Delete the external rate control model callback prototype * * This callback is invoked by the encoder to delete the external rate control @@ -328,6 +476,10 @@ typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)( */ typedef struct vpx_rc_funcs { /*! + * The rate control type of this API. + */ + vpx_rc_type_t rc_type; + /*! * Create an external rate control model. 
*/ vpx_rc_create_model_cb_fn_t create_model; @@ -344,6 +496,14 @@ typedef struct vpx_rc_funcs { */ vpx_rc_update_encodeframe_result_cb_fn_t update_encodeframe_result; /*! + * Get GOP decisions from the external rate control model. + */ + vpx_rc_get_gop_decision_cb_fn_t get_gop_decision; + /*! + * Get rdmult for the frame from the external rate control model. + */ + vpx_rc_get_frame_rdmult_cb_fn_t get_frame_rdmult; + /*! * Delete the external rate control model. */ vpx_rc_delete_model_cb_fn_t delete_model; diff --git a/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/libvpx/vpx_dsp/arm/fdct16x16_neon.c index 67f43246a..a458ecaa4 100644 --- a/libvpx/vpx_dsp/arm/fdct16x16_neon.c +++ b/libvpx/vpx_dsp/arm/fdct16x16_neon.c @@ -35,22 +35,23 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp3[16]; // Left half. - load(input, stride, temp0); - cross_input(temp0, temp1, 0); - vpx_fdct16x16_body(temp1, temp0); + load_cross(input, stride, temp0); + scale_input(temp0, temp1); + vpx_fdct8x16_body(temp1, temp0); // Right half. - load(input + 8, stride, temp1); - cross_input(temp1, temp2, 0); - vpx_fdct16x16_body(temp2, temp1); + load_cross(input + 8, stride, temp1); + scale_input(temp1, temp2); + vpx_fdct8x16_body(temp2, temp1); // Transpose top left and top right quarters into one contiguous location to // process to the top half. 
- transpose_8x8(&temp0[0], &temp2[0]); - transpose_8x8(&temp1[0], &temp2[8]); + + transpose_s16_8x8_new(&temp0[0], &temp2[0]); + transpose_s16_8x8_new(&temp1[0], &temp2[8]); partial_round_shift(temp2); - cross_input(temp2, temp3, 1); - vpx_fdct16x16_body(temp3, temp2); + cross_input(temp2, temp3); + vpx_fdct8x16_body(temp3, temp2); transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], &temp2[5], &temp2[6], &temp2[7]); transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], @@ -61,12 +62,13 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. - transpose_8x8(&temp0[8], &temp1[0]); + transpose_s16_8x8_new(&temp0[8], &temp1[0]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); partial_round_shift(temp1); - cross_input(temp1, temp0, 1); - vpx_fdct16x16_body(temp0, temp1); + cross_input(temp1, temp0); + vpx_fdct8x16_body(temp0, temp1); transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], &temp1[5], &temp1[6], &temp1[7]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], @@ -74,5 +76,58 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { store(output, temp1); store(output + 8, temp1 + 8); } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[16]; + int32x4_t left1[16], left2[16], left3[16], left4[16], right1[16], right2[16], + right3[16], right4[16]; + + // Left half. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); + + // right half. 
+ load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); + + // Transpose top left and top right quarters into one contiguous location to + // process to the top half. + + transpose_s32_8x8_2(left1, right1, left3, right3); + transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8); + + highbd_partial_round_shift(left3, right3); + highbd_cross_input(left3, right3, left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); + + // Transpose bottom left and bottom right quarters into one contiguous + // location to process to the bottom half. + + highbd_partial_round_shift(left4, right4); + highbd_cross_input(left4, right4, left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); + + transpose_s32_8x8_2(left1, right1, left3, right3); + transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8); + store16_s32(output, left3); + output += 4; + store16_s32(output, right3); + output += 4; + + store16_s32(output, left4); + output += 4; + store16_s32(output, right4); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 diff --git a/libvpx/vpx_dsp/arm/fdct16x16_neon.h b/libvpx/vpx_dsp/arm/fdct16x16_neon.h index 0dd21153f..43d820b6b 100644 --- a/libvpx/vpx_dsp/arm/fdct16x16_neon.h +++ b/libvpx/vpx_dsp/arm/fdct16x16_neon.h @@ -13,6 +13,8 @@ #include <arm_neon.h> +#include "fdct_neon.h" + static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) { b[0] = vld1q_s16(a); a += stride; @@ -72,45 +74,67 @@ static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) { // To maybe reduce register usage 
this could be combined with the load() step to // get the first 4 and last 4 values, cross those, then load the middle 8 values // and cross them. +static INLINE void scale_input(const int16x8_t *a /*[16]*/, + int16x8_t *b /*[16]*/) { + b[0] = vshlq_n_s16(a[0], 2); + b[1] = vshlq_n_s16(a[1], 2); + b[2] = vshlq_n_s16(a[2], 2); + b[3] = vshlq_n_s16(a[3], 2); + b[4] = vshlq_n_s16(a[4], 2); + b[5] = vshlq_n_s16(a[5], 2); + b[6] = vshlq_n_s16(a[6], 2); + b[7] = vshlq_n_s16(a[7], 2); + + b[8] = vshlq_n_s16(a[8], 2); + b[9] = vshlq_n_s16(a[9], 2); + b[10] = vshlq_n_s16(a[10], 2); + b[11] = vshlq_n_s16(a[11], 2); + b[12] = vshlq_n_s16(a[12], 2); + b[13] = vshlq_n_s16(a[13], 2); + b[14] = vshlq_n_s16(a[14], 2); + b[15] = vshlq_n_s16(a[15], 2); +} + static INLINE void cross_input(const int16x8_t *a /*[16]*/, - int16x8_t *b /*[16]*/, const int pass) { - if (pass == 0) { - b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2); - b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2); - b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2); - b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2); - b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2); - b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2); - b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2); - b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2); - - b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2); - b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2); - b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2); - b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2); - b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2); - b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2); - b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2); - b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2); - } else { - b[0] = vaddq_s16(a[0], a[15]); - b[1] = vaddq_s16(a[1], a[14]); - b[2] = vaddq_s16(a[2], a[13]); - b[3] = vaddq_s16(a[3], a[12]); - b[4] = vaddq_s16(a[4], a[11]); - b[5] = vaddq_s16(a[5], a[10]); - b[6] = vaddq_s16(a[6], a[9]); - b[7] = vaddq_s16(a[7], a[8]); - - b[8] = vsubq_s16(a[7], a[8]); - b[9] = vsubq_s16(a[6], a[9]); 
- b[10] = vsubq_s16(a[5], a[10]); - b[11] = vsubq_s16(a[4], a[11]); - b[12] = vsubq_s16(a[3], a[12]); - b[13] = vsubq_s16(a[2], a[13]); - b[14] = vsubq_s16(a[1], a[14]); - b[15] = vsubq_s16(a[0], a[15]); - } + int16x8_t *b /*[16]*/) { + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); +} + +static INLINE void load_cross(const int16_t *a, int stride, + int16x8_t *b /*[16]*/) { + b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride)); + b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride)); + b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride)); + b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride)); + b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride)); + b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride)); + b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride)); + b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride)); + + b[8] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride)); + b[9] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride)); + b[10] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride)); + b[11] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride)); + b[12] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride)); + b[13] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride)); + b[14] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride)); + b[15] = 
vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride)); } // Quarter round at the beginning of the second pass. Can't use vrshr (rounding) @@ -135,84 +159,9 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); } -// fdct_round_shift((a +/- b) * c) -static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, - const tran_high_t c, int16x8_t *add, - int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// fdct_round_shift(a * c0 +/- b * c1) -static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, - const tran_coef_t c0, - const tran_coef_t c1, int16x8_t *add, - int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0); - const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1); - const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1); - const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0); - const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); - const int16x4_t 
rounded3 = vqrshrn_n_s32(diff1, 14); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// Transpose 8x8 to a new location. Don't use transpose_neon.h because those -// are all in-place. -static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/, - int16x8_t *b /*[8]*/) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. - const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; -} - // Main body of fdct16x16. 
-static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, - int16x8_t *out /*[16]*/) { +static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, + int16x8_t *out /*[16]*/) { int16x8_t s[8]; int16x8_t x[4]; int16x8_t step[8]; @@ -237,16 +186,17 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]); - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]); + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); // Stage 2 // Re-using source s5/s6 // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]); + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); // Stage 3 x[0] = vaddq_s16(s[4], s[5]); @@ -255,12 +205,12 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, x[3] = vaddq_s16(s[7], s[6]); // Stage 4 - // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) - butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]); - // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) - butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]); + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, 
&out[2], &out[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); // step 2 // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" @@ -272,8 +222,8 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]); - butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]); + butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); // step 3 s[0] = vaddq_s16(in[8], s[3]); @@ -286,13 +236,15 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, s[7] = vaddq_s16(in[15], s[4]); // step 4 - // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) - // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) - butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]); + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) - butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]); + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); // step 5 step[0] = vaddq_s16(s[0], s[1]); @@ -305,23 +257,368 @@ static 
void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/, step[7] = vaddq_s16(s[7], s[6]); // step 6 - // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) - // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) - // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) - // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64) - // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64) - // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * - // cospi_22_64) - // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) - // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) - butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9], + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], &out[7]); - butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1], + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], &out[15]); - butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13], + + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], &out[3]); - butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5], + + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff(step[5], step[2], cospi_10_64, 
cospi_22_64, &out[5], &out[11]); } +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/, + int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + left[0] = vshll_n_s16(vget_low_s16(a[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(a[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(a[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(a[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(a[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(a[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(a[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(a[7]), 2); + left[8] = vshll_n_s16(vget_low_s16(a[8]), 2); + left[9] = vshll_n_s16(vget_low_s16(a[9]), 2); + left[10] = vshll_n_s16(vget_low_s16(a[10]), 2); + left[11] = vshll_n_s16(vget_low_s16(a[11]), 2); + left[12] = vshll_n_s16(vget_low_s16(a[12]), 2); + left[13] = vshll_n_s16(vget_low_s16(a[13]), 2); + left[14] = vshll_n_s16(vget_low_s16(a[14]), 2); + left[15] = vshll_n_s16(vget_low_s16(a[15]), 2); + + right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); + right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); + right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); + right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); + right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); + right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); + right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); + right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); + right[15] = vshll_n_s16(vget_high_s16(a[15]), 2); +} + +static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/, + int32x4_t *a_right /*[16]*/, + int32x4_t *b_left /*[16]*/, + int32x4_t *b_right /*[16]*/) { + b_left[0] = vaddq_s32(a_left[0], 
a_left[15]); + b_left[1] = vaddq_s32(a_left[1], a_left[14]); + b_left[2] = vaddq_s32(a_left[2], a_left[13]); + b_left[3] = vaddq_s32(a_left[3], a_left[12]); + b_left[4] = vaddq_s32(a_left[4], a_left[11]); + b_left[5] = vaddq_s32(a_left[5], a_left[10]); + b_left[6] = vaddq_s32(a_left[6], a_left[9]); + b_left[7] = vaddq_s32(a_left[7], a_left[8]); + + b_right[0] = vaddq_s32(a_right[0], a_right[15]); + b_right[1] = vaddq_s32(a_right[1], a_right[14]); + b_right[2] = vaddq_s32(a_right[2], a_right[13]); + b_right[3] = vaddq_s32(a_right[3], a_right[12]); + b_right[4] = vaddq_s32(a_right[4], a_right[11]); + b_right[5] = vaddq_s32(a_right[5], a_right[10]); + b_right[6] = vaddq_s32(a_right[6], a_right[9]); + b_right[7] = vaddq_s32(a_right[7], a_right[8]); + + b_left[8] = vsubq_s32(a_left[7], a_left[8]); + b_left[9] = vsubq_s32(a_left[6], a_left[9]); + b_left[10] = vsubq_s32(a_left[5], a_left[10]); + b_left[11] = vsubq_s32(a_left[4], a_left[11]); + b_left[12] = vsubq_s32(a_left[3], a_left[12]); + b_left[13] = vsubq_s32(a_left[2], a_left[13]); + b_left[14] = vsubq_s32(a_left[1], a_left[14]); + b_left[15] = vsubq_s32(a_left[0], a_left[15]); + + b_right[8] = vsubq_s32(a_right[7], a_right[8]); + b_right[9] = vsubq_s32(a_right[6], a_right[9]); + b_right[10] = vsubq_s32(a_right[5], a_right[10]); + b_right[11] = vsubq_s32(a_right[4], a_right[11]); + b_right[12] = vsubq_s32(a_right[3], a_right[12]); + b_right[13] = vsubq_s32(a_right[2], a_right[13]); + b_right[14] = vsubq_s32(a_right[1], a_right[14]); + b_right[15] = vsubq_s32(a_right[0], a_right[15]); +} + +static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + const int32x4_t one = vdupq_n_s32(1); + left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2); + left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2); + left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2); + left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2); + left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2); + left[5] = 
vshrq_n_s32(vaddq_s32(left[5], one), 2); + left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2); + left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2); + left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2); + left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2); + left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2); + left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2); + left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2); + left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2); + left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2); + left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2); + + right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2); + right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2); + right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2); + right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2); + right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2); + right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2); + right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2); + right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2); + right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2); + right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2); + right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2); + right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2); + right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2); + right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2); + right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2); + right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2); +} + +// Store 16 32x4 vectors, assuming stride == 16. 
+static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) { + vst1q_s32(a, b[0]); + a += 16; + vst1q_s32(a, b[1]); + a += 16; + vst1q_s32(a, b[2]); + a += 16; + vst1q_s32(a, b[3]); + a += 16; + vst1q_s32(a, b[4]); + a += 16; + vst1q_s32(a, b[5]); + a += 16; + vst1q_s32(a, b[6]); + a += 16; + vst1q_s32(a, b[7]); + a += 16; + vst1q_s32(a, b[8]); + a += 16; + vst1q_s32(a, b[9]); + a += 16; + vst1q_s32(a, b[10]); + a += 16; + vst1q_s32(a, b[11]); + a += 16; + vst1q_s32(a, b[12]); + a += 16; + vst1q_s32(a, b[13]); + a += 16; + vst1q_s32(a, b[14]); + a += 16; + vst1q_s32(a, b[15]); +} + +// Main body of fdct8x16 column +static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + int32x4_t sl[8]; + int32x4_t sr[8]; + int32x4_t xl[4]; + int32x4_t xr[4]; + int32x4_t inl[8]; + int32x4_t inr[8]; + int32x4_t stepl[8]; + int32x4_t stepr[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + sl[0] = vaddq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sr[1] = vaddq_s32(right[1], right[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sr[2] = vaddq_s32(right[2], right[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sr[3] = vaddq_s32(right[3], right[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sr[5] = vsubq_s32(right[2], right[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sr[6] = vsubq_s32(right[1], right[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[7] = vsubq_s32(right[0], right[7]); + + // Copy values 8-15 as we're storing in-place + inl[0] = left[8]; + inr[0] = right[8]; + inl[1] = left[9]; + inr[1] = right[9]; + inl[2] = left[10]; + inr[2] = right[10]; + inl[3] = left[11]; + inr[3] = right[11]; + inl[4] = left[12]; + inr[4] = right[12]; + inl[5] = left[13]; + inr[5] = right[13]; + inl[6] = left[14]; + inr[6] = right[14]; + inl[7] = 
left[15]; + inr[7] = right[15]; + + // fdct4(step, step); + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[8], &right[8]); + + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[4], &right[4], + &left[12], &right[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], + &sr[6], &sl[5], &sr[5]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], sl[5]); + xr[0] = vaddq_s32(sr[4], sr[5]); + xl[1] = vsubq_s32(sl[4], sl[5]); + xr[1] = vsubq_s32(sr[4], sr[5]); + xl[2] = vsubq_s32(sl[7], sl[6]); + xr[2] = vsubq_s32(sr[7], sr[6]); + xl[3] = vaddq_s32(sl[7], sl[6]); + xr[3] = vaddq_s32(sr[7], sr[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[2], &right[2], + &left[14], &right[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[10], &right[10], + &left[6], &right[6]); + + // step 2 + // From fwd_txfm.c: 
Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. + + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, + &sl[5], &sr[5], &sl[2], &sr[2]); + butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, + &sl[4], &sr[4], &sl[3], &sr[3]); + + // step 3 + sl[0] = vaddq_s32(inl[0], sl[3]); + sr[0] = vaddq_s32(inr[0], sr[3]); + sl[1] = vaddq_s32(inl[1], sl[2]); + sr[1] = vaddq_s32(inr[1], sr[2]); + xl[0] = vsubq_s32(inl[1], sl[2]); + xr[0] = vsubq_s32(inr[1], sr[2]); + xl[1] = vsubq_s32(inl[0], sl[3]); + xr[1] = vsubq_s32(inr[0], sr[3]); + xl[2] = vsubq_s32(inl[7], sl[4]); + xr[2] = vsubq_s32(inr[7], sr[4]); + xl[3] = vsubq_s32(inl[6], sl[5]); + xr[3] = vsubq_s32(inr[6], sr[5]); + sl[6] = vaddq_s32(inl[6], sl[5]); + sr[6] = vaddq_s32(inr[6], sr[5]); + sl[7] = vaddq_s32(inl[7], sl[4]); + sr[7] = vaddq_s32(inr[7], sr[4]); + + // step 4 + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, + cospi_24_64, &sl[6], &sr[6], &sl[1], + &sr[1]); + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, + cospi_8_64, &sl[2], &sr[2], &sl[5], + &sr[5]); + + // step 5 + stepl[0] 
= vaddq_s32(sl[0], sl[1]); + stepr[0] = vaddq_s32(sr[0], sr[1]); + stepl[1] = vsubq_s32(sl[0], sl[1]); + stepr[1] = vsubq_s32(sr[0], sr[1]); + stepl[2] = vaddq_s32(xl[1], sl[2]); + stepr[2] = vaddq_s32(xr[1], sr[2]); + stepl[3] = vsubq_s32(xl[1], sl[2]); + stepr[3] = vsubq_s32(xr[1], sr[2]); + stepl[4] = vsubq_s32(xl[2], sl[5]); + stepr[4] = vsubq_s32(xr[2], sr[5]); + stepl[5] = vaddq_s32(xl[2], sl[5]); + stepr[5] = vaddq_s32(xr[2], sr[5]); + stepl[6] = vsubq_s32(sl[7], sl[6]); + stepr[6] = vsubq_s32(sr[7], sr[6]); + stepl[7] = vaddq_s32(sl[7], sl[6]); + stepr[7] = vaddq_s32(sr[7], sr[6]); + + // step 6 + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/fdct32x32_neon.c 
b/libvpx/vpx_dsp/arm/fdct32x32_neon.c index de74e6630..d6818d2ec 100644 --- a/libvpx/vpx_dsp/arm/fdct32x32_neon.c +++ b/libvpx/vpx_dsp/arm/fdct32x32_neon.c @@ -15,6 +15,8 @@ #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/fdct32x32_neon.h" // Most gcc 4.9 distributions outside of Android do not generate correct code // for this function. @@ -32,1289 +34,6 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, #else -#define LOAD_INCREMENT(src, stride, dest, index) \ - do { \ - dest[index] = vld1q_s16(src); \ - src += stride; \ - } while (0) - -#define ADD_S16(src, index0, index1, dest, index3) \ - do { \ - dest[index3] = vaddq_s16(src[index0], src[index1]); \ - } while (0) - -#define ADD_SHIFT_S16(src, index0, index1) \ - do { \ - src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \ - } while (0) - -// Load, cross, and multiply by 4. Load the first 8 and last 8, then the -// middle -// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better? 
-static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { - const int16_t *a_end = a + 24 * stride; - int16x8_t c[8]; - - LOAD_INCREMENT(a, stride, b, 0); - LOAD_INCREMENT(a, stride, b, 1); - LOAD_INCREMENT(a, stride, b, 2); - LOAD_INCREMENT(a, stride, b, 3); - LOAD_INCREMENT(a, stride, b, 4); - LOAD_INCREMENT(a, stride, b, 5); - LOAD_INCREMENT(a, stride, b, 6); - LOAD_INCREMENT(a, stride, b, 7); - - LOAD_INCREMENT(a_end, stride, b, 24); - LOAD_INCREMENT(a_end, stride, b, 25); - LOAD_INCREMENT(a_end, stride, b, 26); - LOAD_INCREMENT(a_end, stride, b, 27); - LOAD_INCREMENT(a_end, stride, b, 28); - LOAD_INCREMENT(a_end, stride, b, 29); - LOAD_INCREMENT(a_end, stride, b, 30); - LOAD_INCREMENT(a_end, stride, b, 31); - - ADD_S16(b, 0, 31, c, 0); - ADD_S16(b, 1, 30, c, 1); - ADD_S16(b, 2, 29, c, 2); - ADD_S16(b, 3, 28, c, 3); - ADD_S16(b, 4, 27, c, 4); - ADD_S16(b, 5, 26, c, 5); - ADD_S16(b, 6, 25, c, 6); - ADD_S16(b, 7, 24, c, 7); - - ADD_SHIFT_S16(b, 7, 24); - ADD_SHIFT_S16(b, 6, 25); - ADD_SHIFT_S16(b, 5, 26); - ADD_SHIFT_S16(b, 4, 27); - ADD_SHIFT_S16(b, 3, 28); - ADD_SHIFT_S16(b, 2, 29); - ADD_SHIFT_S16(b, 1, 30); - ADD_SHIFT_S16(b, 0, 31); - - b[0] = vshlq_n_s16(c[0], 2); - b[1] = vshlq_n_s16(c[1], 2); - b[2] = vshlq_n_s16(c[2], 2); - b[3] = vshlq_n_s16(c[3], 2); - b[4] = vshlq_n_s16(c[4], 2); - b[5] = vshlq_n_s16(c[5], 2); - b[6] = vshlq_n_s16(c[6], 2); - b[7] = vshlq_n_s16(c[7], 2); - - LOAD_INCREMENT(a, stride, b, 8); - LOAD_INCREMENT(a, stride, b, 9); - LOAD_INCREMENT(a, stride, b, 10); - LOAD_INCREMENT(a, stride, b, 11); - LOAD_INCREMENT(a, stride, b, 12); - LOAD_INCREMENT(a, stride, b, 13); - LOAD_INCREMENT(a, stride, b, 14); - LOAD_INCREMENT(a, stride, b, 15); - LOAD_INCREMENT(a, stride, b, 16); - LOAD_INCREMENT(a, stride, b, 17); - LOAD_INCREMENT(a, stride, b, 18); - LOAD_INCREMENT(a, stride, b, 19); - LOAD_INCREMENT(a, stride, b, 20); - LOAD_INCREMENT(a, stride, b, 21); - LOAD_INCREMENT(a, stride, b, 22); - LOAD_INCREMENT(a, stride, b, 23); - 
- ADD_S16(b, 8, 23, c, 0); - ADD_S16(b, 9, 22, c, 1); - ADD_S16(b, 10, 21, c, 2); - ADD_S16(b, 11, 20, c, 3); - ADD_S16(b, 12, 19, c, 4); - ADD_S16(b, 13, 18, c, 5); - ADD_S16(b, 14, 17, c, 6); - ADD_S16(b, 15, 16, c, 7); - - ADD_SHIFT_S16(b, 15, 16); - ADD_SHIFT_S16(b, 14, 17); - ADD_SHIFT_S16(b, 13, 18); - ADD_SHIFT_S16(b, 12, 19); - ADD_SHIFT_S16(b, 11, 20); - ADD_SHIFT_S16(b, 10, 21); - ADD_SHIFT_S16(b, 9, 22); - ADD_SHIFT_S16(b, 8, 23); - - b[8] = vshlq_n_s16(c[0], 2); - b[9] = vshlq_n_s16(c[1], 2); - b[10] = vshlq_n_s16(c[2], 2); - b[11] = vshlq_n_s16(c[3], 2); - b[12] = vshlq_n_s16(c[4], 2); - b[13] = vshlq_n_s16(c[5], 2); - b[14] = vshlq_n_s16(c[6], 2); - b[15] = vshlq_n_s16(c[7], 2); -} - -#undef LOAD_INCREMENT -#undef ADD_S16 -#undef ADD_SHIFT_S16 - -#define STORE_S16(src, index, dest) \ - do { \ - store_s16q_to_tran_low(dest, src[index]); \ - dest += 8; \ - } while (0) - -// Store 32 16x8 values, assuming stride == 32. -// Slight twist: store horizontally in blocks of 8. -static INLINE void store(tran_low_t *a, const int16x8_t *b) { - STORE_S16(b, 0, a); - STORE_S16(b, 8, a); - STORE_S16(b, 16, a); - STORE_S16(b, 24, a); - STORE_S16(b, 1, a); - STORE_S16(b, 9, a); - STORE_S16(b, 17, a); - STORE_S16(b, 25, a); - STORE_S16(b, 2, a); - STORE_S16(b, 10, a); - STORE_S16(b, 18, a); - STORE_S16(b, 26, a); - STORE_S16(b, 3, a); - STORE_S16(b, 11, a); - STORE_S16(b, 19, a); - STORE_S16(b, 27, a); - STORE_S16(b, 4, a); - STORE_S16(b, 12, a); - STORE_S16(b, 20, a); - STORE_S16(b, 28, a); - STORE_S16(b, 5, a); - STORE_S16(b, 13, a); - STORE_S16(b, 21, a); - STORE_S16(b, 29, a); - STORE_S16(b, 6, a); - STORE_S16(b, 14, a); - STORE_S16(b, 22, a); - STORE_S16(b, 30, a); - STORE_S16(b, 7, a); - STORE_S16(b, 15, a); - STORE_S16(b, 23, a); - STORE_S16(b, 31, a); -} - -#undef STORE_S16 - -// fdct_round_shift((a +/- b) * c) -static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, - const tran_high_t constant, - int16x8_t *add, int16x8_t *sub) { - const 
int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// fdct_round_shift(a * c0 +/- b * c1) -static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, - const tran_coef_t constant0, - const tran_coef_t constant1, - int16x8_t *add, int16x8_t *sub) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0); - const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1); - const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1); - const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0); - const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1); - const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); - const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); - const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); - const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); - *add = vcombine_s16(rounded0, rounded1); - *sub = vcombine_s16(rounded2, rounded3); -} - -// Add 2 if positive, 1 if negative, and shift by 2. -// In practice, subtract the sign bit, then shift with rounding. 
-static INLINE int16x8_t sub_round_shift(const int16x8_t a) { - const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); - const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); - const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); - return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); -} - -static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - - // Stage 1: Done as part of the load. - - // Stage 2. - // Mini cross. X the first 16 values and the middle 8 of the second half. - a[0] = vaddq_s16(in[0], in[15]); - a[1] = vaddq_s16(in[1], in[14]); - a[2] = vaddq_s16(in[2], in[13]); - a[3] = vaddq_s16(in[3], in[12]); - a[4] = vaddq_s16(in[4], in[11]); - a[5] = vaddq_s16(in[5], in[10]); - a[6] = vaddq_s16(in[6], in[9]); - a[7] = vaddq_s16(in[7], in[8]); - - a[8] = vsubq_s16(in[7], in[8]); - a[9] = vsubq_s16(in[6], in[9]); - a[10] = vsubq_s16(in[5], in[10]); - a[11] = vsubq_s16(in[4], in[11]); - a[12] = vsubq_s16(in[3], in[12]); - a[13] = vsubq_s16(in[2], in[13]); - a[14] = vsubq_s16(in[1], in[14]); - a[15] = vsubq_s16(in[0], in[15]); - - a[16] = in[16]; - a[17] = in[17]; - a[18] = in[18]; - a[19] = in[19]; - - butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]); - butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]); - butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]); - butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]); - - a[28] = in[28]; - a[29] = in[29]; - a[30] = in[30]; - a[31] = in[31]; - - // Stage 3. 
- b[0] = vaddq_s16(a[0], a[7]); - b[1] = vaddq_s16(a[1], a[6]); - b[2] = vaddq_s16(a[2], a[5]); - b[3] = vaddq_s16(a[3], a[4]); - - b[4] = vsubq_s16(a[3], a[4]); - b[5] = vsubq_s16(a[2], a[5]); - b[6] = vsubq_s16(a[1], a[6]); - b[7] = vsubq_s16(a[0], a[7]); - - b[8] = a[8]; - b[9] = a[9]; - - butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]); - butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]); - - b[14] = a[14]; - b[15] = a[15]; - - b[16] = vaddq_s16(in[16], a[23]); - b[17] = vaddq_s16(in[17], a[22]); - b[18] = vaddq_s16(in[18], a[21]); - b[19] = vaddq_s16(in[19], a[20]); - - b[20] = vsubq_s16(in[19], a[20]); - b[21] = vsubq_s16(in[18], a[21]); - b[22] = vsubq_s16(in[17], a[22]); - b[23] = vsubq_s16(in[16], a[23]); - - b[24] = vsubq_s16(in[31], a[24]); - b[25] = vsubq_s16(in[30], a[25]); - b[26] = vsubq_s16(in[29], a[26]); - b[27] = vsubq_s16(in[28], a[27]); - - b[28] = vaddq_s16(in[28], a[27]); - b[29] = vaddq_s16(in[29], a[26]); - b[30] = vaddq_s16(in[30], a[25]); - b[31] = vaddq_s16(in[31], a[24]); - - // Stage 4. 
- a[0] = vaddq_s16(b[0], b[3]); - a[1] = vaddq_s16(b[1], b[2]); - a[2] = vsubq_s16(b[1], b[2]); - a[3] = vsubq_s16(b[0], b[3]); - - a[4] = b[4]; - - butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]); - - a[7] = b[7]; - - a[8] = vaddq_s16(b[8], b[11]); - a[9] = vaddq_s16(b[9], b[10]); - a[10] = vsubq_s16(b[9], b[10]); - a[11] = vsubq_s16(b[8], b[11]); - a[12] = vsubq_s16(b[15], b[12]); - a[13] = vsubq_s16(b[14], b[13]); - a[14] = vaddq_s16(b[14], b[13]); - a[15] = vaddq_s16(b[15], b[12]); - - a[16] = b[16]; - a[17] = b[17]; - - butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]); - butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]); - butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]); - butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]); - - a[22] = b[22]; - a[23] = b[23]; - a[24] = b[24]; - a[25] = b[25]; - - a[30] = b[30]; - a[31] = b[31]; - - // Stage 5. - butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]); - butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]); - - b[4] = vaddq_s16(a[4], a[5]); - b[5] = vsubq_s16(a[4], a[5]); - b[6] = vsubq_s16(a[7], a[6]); - b[7] = vaddq_s16(a[7], a[6]); - - b[8] = a[8]; - - butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]); - butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]); - - b[11] = a[11]; - b[12] = a[12]; - - b[15] = a[15]; - - b[16] = vaddq_s16(a[19], a[16]); - b[17] = vaddq_s16(a[18], a[17]); - b[18] = vsubq_s16(a[17], a[18]); - b[19] = vsubq_s16(a[16], a[19]); - b[20] = vsubq_s16(a[23], a[20]); - b[21] = vsubq_s16(a[22], a[21]); - b[22] = vaddq_s16(a[21], a[22]); - b[23] = vaddq_s16(a[20], a[23]); - b[24] = vaddq_s16(a[27], a[24]); - b[25] = vaddq_s16(a[26], a[25]); - b[26] = vsubq_s16(a[25], a[26]); - b[27] = vsubq_s16(a[24], a[27]); - b[28] = vsubq_s16(a[31], a[28]); - b[29] = vsubq_s16(a[30], a[29]); - b[30] = vaddq_s16(a[29], a[30]); - 
b[31] = vaddq_s16(a[28], a[31]); - - // Stage 6. - a[0] = b[0]; - a[1] = b[1]; - a[2] = b[2]; - a[3] = b[3]; - - butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]); - butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]); - - a[8] = vaddq_s16(b[8], b[9]); - a[9] = vsubq_s16(b[8], b[9]); - a[10] = vsubq_s16(b[11], b[10]); - a[11] = vaddq_s16(b[11], b[10]); - a[12] = vaddq_s16(b[12], b[13]); - a[13] = vsubq_s16(b[12], b[13]); - a[14] = vsubq_s16(b[15], b[14]); - a[15] = vaddq_s16(b[15], b[14]); - - a[16] = b[16]; - a[19] = b[19]; - a[20] = b[20]; - a[23] = b[23]; - a[24] = b[24]; - a[27] = b[27]; - a[28] = b[28]; - a[31] = b[31]; - - butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]); - butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]); - - butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]); - butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]); - - // Stage 7. 
- b[0] = a[0]; - b[1] = a[1]; - b[2] = a[2]; - b[3] = a[3]; - b[4] = a[4]; - b[5] = a[5]; - b[6] = a[6]; - b[7] = a[7]; - - butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]); - butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]); - butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]); - butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]); - - b[16] = vaddq_s16(a[16], a[17]); - b[17] = vsubq_s16(a[16], a[17]); - b[18] = vsubq_s16(a[19], a[18]); - b[19] = vaddq_s16(a[19], a[18]); - b[20] = vaddq_s16(a[20], a[21]); - b[21] = vsubq_s16(a[20], a[21]); - b[22] = vsubq_s16(a[23], a[22]); - b[23] = vaddq_s16(a[23], a[22]); - b[24] = vaddq_s16(a[24], a[25]); - b[25] = vsubq_s16(a[24], a[25]); - b[26] = vsubq_s16(a[27], a[26]); - b[27] = vaddq_s16(a[27], a[26]); - b[28] = vaddq_s16(a[28], a[29]); - b[29] = vsubq_s16(a[28], a[29]); - b[30] = vsubq_s16(a[31], a[30]); - b[31] = vaddq_s16(a[31], a[30]); - - // Final stage. 
- // Also compute partial rounding shift: - // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; - out[0] = sub_round_shift(b[0]); - out[16] = sub_round_shift(b[1]); - out[8] = sub_round_shift(b[2]); - out[24] = sub_round_shift(b[3]); - out[4] = sub_round_shift(b[4]); - out[20] = sub_round_shift(b[5]); - out[12] = sub_round_shift(b[6]); - out[28] = sub_round_shift(b[7]); - out[2] = sub_round_shift(b[8]); - out[18] = sub_round_shift(b[9]); - out[10] = sub_round_shift(b[10]); - out[26] = sub_round_shift(b[11]); - out[6] = sub_round_shift(b[12]); - out[22] = sub_round_shift(b[13]); - out[14] = sub_round_shift(b[14]); - out[30] = sub_round_shift(b[15]); - - butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]); - out[1] = sub_round_shift(a[1]); - out[31] = sub_round_shift(a[31]); - - butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]); - out[17] = sub_round_shift(a[17]); - out[15] = sub_round_shift(a[15]); - - butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]); - out[9] = sub_round_shift(a[9]); - out[23] = sub_round_shift(a[23]); - - butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]); - out[25] = sub_round_shift(a[25]); - out[7] = sub_round_shift(a[7]); - - butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]); - out[5] = sub_round_shift(a[5]); - out[27] = sub_round_shift(a[27]); - - butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]); - out[21] = sub_round_shift(a[21]); - out[11] = sub_round_shift(a[11]); - - butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]); - out[13] = sub_round_shift(a[13]); - out[19] = sub_round_shift(a[19]); - - butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]); - out[29] = sub_round_shift(a[29]); - out[3] = sub_round_shift(a[3]); -} - -#define PASS_THROUGH(src, dst, element) \ - do { \ - dst##_lo[element] = src##_lo[element]; \ - dst##_hi[element] = 
src##_hi[element]; \ - } while (0) - -#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = \ - vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ - b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ - vget_high_s16(a[right_index])); \ - } while (0) - -#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = \ - vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ - b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ - vget_high_s16(a[right_index])); \ - } while (0) - -#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ - do { \ - c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ - c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ - } while (0) - -#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ - do { \ - temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ - temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ - c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ - c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ - } while (0) - -#define ADD_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ - b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \ - } while (0) - -#define SUB_S32(a, left_index, right_index, b, b_index) \ - do { \ - b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ - b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ - } while (0) - -// Like butterfly_one_coeff, but don't narrow results. 
-static INLINE void butterfly_one_coeff_s16_s32( - const int16x8_t a, const int16x8_t b, const tran_high_t constant, - int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, - int32x4_t *sub_hi) { - const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); - const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); - const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); - const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); - const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); - *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); - *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); - *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); - *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); -} - -#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ - add_index, sub_index) \ - do { \ - butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ - &b##_lo[add_index], &b##_hi[add_index], \ - &b##_lo[sub_index], &b##_hi[sub_index]); \ - } while (0) - -// Like butterfly_one_coeff, but with s32. 
-static INLINE void butterfly_one_coeff_s32( - const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, - const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo, - int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { - const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant); - const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant); - const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant); - const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant); - const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant); - const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant); - *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); - *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); - *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); - *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); -} - -#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ - sub_index) \ - do { \ - butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ - a##_lo[right_index], a##_hi[right_index], \ - constant, &b##_lo[add_index], &b##_hi[add_index], \ - &b##_lo[sub_index], &b##_hi[sub_index]); \ - } while (0) - -// Like butterfly_two_coeff, but with s32. 
-static INLINE void butterfly_two_coeff_s32( - const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, - const int32x4_t b_hi, const int32_t constant0, const int32_t constant1, - int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, - int32x4_t *sub_hi) { - const int32x4_t a0 = vmulq_n_s32(a_lo, constant0); - const int32x4_t a1 = vmulq_n_s32(a_hi, constant0); - const int32x4_t a2 = vmulq_n_s32(a_lo, constant1); - const int32x4_t a3 = vmulq_n_s32(a_hi, constant1); - const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0); - const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0); - const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1); - const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1); - *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); - *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); - *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); - *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); -} - -#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ - right_constant, b, add_index, sub_index) \ - do { \ - butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ - a##_lo[right_index], a##_hi[right_index], \ - left_constant, right_constant, &b##_lo[add_index], \ - &b##_hi[add_index], &b##_lo[sub_index], \ - &b##_hi[sub_index]); \ - } while (0) - -// Add 1 if positive, 2 if negative, and shift by 2. -// In practice, add 1, then add the sign bit, then shift without rounding. 
-static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, - const int32x4_t a_hi) { - const int32x4_t one = vdupq_n_s32(1); - const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); - const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); - const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32); - const int16x4_t b_lo = - vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2); - const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi); - const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31); - const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32); - const int16x4_t b_hi = - vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2); - return vcombine_s16(b_lo, b_hi); -} - -static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - int32x4_t c_lo[32]; - int32x4_t c_hi[32]; - int32x4_t d_lo[32]; - int32x4_t d_hi[32]; - - // Stage 1. Done as part of the load for the first pass. 
- a[0] = vaddq_s16(in[0], in[31]); - a[1] = vaddq_s16(in[1], in[30]); - a[2] = vaddq_s16(in[2], in[29]); - a[3] = vaddq_s16(in[3], in[28]); - a[4] = vaddq_s16(in[4], in[27]); - a[5] = vaddq_s16(in[5], in[26]); - a[6] = vaddq_s16(in[6], in[25]); - a[7] = vaddq_s16(in[7], in[24]); - a[8] = vaddq_s16(in[8], in[23]); - a[9] = vaddq_s16(in[9], in[22]); - a[10] = vaddq_s16(in[10], in[21]); - a[11] = vaddq_s16(in[11], in[20]); - a[12] = vaddq_s16(in[12], in[19]); - a[13] = vaddq_s16(in[13], in[18]); - a[14] = vaddq_s16(in[14], in[17]); - a[15] = vaddq_s16(in[15], in[16]); - a[16] = vsubq_s16(in[15], in[16]); - a[17] = vsubq_s16(in[14], in[17]); - a[18] = vsubq_s16(in[13], in[18]); - a[19] = vsubq_s16(in[12], in[19]); - a[20] = vsubq_s16(in[11], in[20]); - a[21] = vsubq_s16(in[10], in[21]); - a[22] = vsubq_s16(in[9], in[22]); - a[23] = vsubq_s16(in[8], in[23]); - a[24] = vsubq_s16(in[7], in[24]); - a[25] = vsubq_s16(in[6], in[25]); - a[26] = vsubq_s16(in[5], in[26]); - a[27] = vsubq_s16(in[4], in[27]); - a[28] = vsubq_s16(in[3], in[28]); - a[29] = vsubq_s16(in[2], in[29]); - a[30] = vsubq_s16(in[1], in[30]); - a[31] = vsubq_s16(in[0], in[31]); - - // Stage 2. 
- b[0] = vaddq_s16(a[0], a[15]); - b[1] = vaddq_s16(a[1], a[14]); - b[2] = vaddq_s16(a[2], a[13]); - b[3] = vaddq_s16(a[3], a[12]); - b[4] = vaddq_s16(a[4], a[11]); - b[5] = vaddq_s16(a[5], a[10]); - b[6] = vaddq_s16(a[6], a[9]); - b[7] = vaddq_s16(a[7], a[8]); - - b[8] = vsubq_s16(a[7], a[8]); - b[9] = vsubq_s16(a[6], a[9]); - b[10] = vsubq_s16(a[5], a[10]); - b[11] = vsubq_s16(a[4], a[11]); - b[12] = vsubq_s16(a[3], a[12]); - b[13] = vsubq_s16(a[2], a[13]); - b[14] = vsubq_s16(a[1], a[14]); - b[15] = vsubq_s16(a[0], a[15]); - - b[16] = a[16]; - b[17] = a[17]; - b[18] = a[18]; - b[19] = a[19]; - - butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); - butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); - butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); - butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); - - b[28] = a[28]; - b[29] = a[29]; - b[30] = a[30]; - b[31] = a[31]; - - // Stage 3. With extreme values for input this calculation rolls over int16_t. - // The sources for b[0] get added multiple times and, through testing, have - // been shown to overflow starting here. 
- ADD_S16_S32(b, 0, 7, c, 0); - ADD_S16_S32(b, 1, 6, c, 1); - ADD_S16_S32(b, 2, 5, c, 2); - ADD_S16_S32(b, 3, 4, c, 3); - SUB_S16_S32(b, 3, 4, c, 4); - SUB_S16_S32(b, 2, 5, c, 5); - SUB_S16_S32(b, 1, 6, c, 6); - SUB_S16_S32(b, 0, 7, c, 7); - - a[8] = b[8]; - a[9] = b[9]; - - BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); - BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); - - a[14] = b[14]; - a[15] = b[15]; - - ADD_S16_S32(b, 16, 23, c, 16); - ADD_S16_S32(b, 17, 22, c, 17); - ADD_S16_S32(b, 18, 21, c, 18); - ADD_S16_S32(b, 19, 20, c, 19); - SUB_S16_S32(b, 19, 20, c, 20); - SUB_S16_S32(b, 18, 21, c, 21); - SUB_S16_S32(b, 17, 22, c, 22); - SUB_S16_S32(b, 16, 23, c, 23); - SUB_S16_S32(b, 31, 24, c, 24); - SUB_S16_S32(b, 30, 25, c, 25); - SUB_S16_S32(b, 29, 26, c, 26); - SUB_S16_S32(b, 28, 27, c, 27); - ADD_S16_S32(b, 28, 27, c, 28); - ADD_S16_S32(b, 29, 26, c, 29); - ADD_S16_S32(b, 30, 25, c, 30); - ADD_S16_S32(b, 31, 24, c, 31); - - // Stage 4. - ADD_S32(c, 0, 3, d, 0); - ADD_S32(c, 1, 2, d, 1); - SUB_S32(c, 1, 2, d, 2); - SUB_S32(c, 0, 3, d, 3); - - PASS_THROUGH(c, d, 4); - - BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); - - PASS_THROUGH(c, d, 7); - - ADDW_S16_S32(c, 11, a, 8, d, 8); - ADDW_S16_S32(c, 10, a, 9, d, 9); - SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); - SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); - SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); - SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); - ADDW_S16_S32(c, 13, b, 14, d, 14); - ADDW_S16_S32(c, 12, b, 15, d, 15); - - PASS_THROUGH(c, d, 16); - PASS_THROUGH(c, d, 17); - - BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18); - BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19); - BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20); - BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21); - - PASS_THROUGH(c, d, 22); - PASS_THROUGH(c, d, 23); - PASS_THROUGH(c, d, 24); - PASS_THROUGH(c, d, 25); - - PASS_THROUGH(c, d, 30); - PASS_THROUGH(c, d, 31); - - 
// Stage 5. - BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); - BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3); - - ADD_S32(d, 4, 5, c, 4); - SUB_S32(d, 4, 5, c, 5); - SUB_S32(d, 7, 6, c, 6); - ADD_S32(d, 7, 6, c, 7); - - PASS_THROUGH(d, c, 8); - - BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9); - BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10); - - PASS_THROUGH(d, c, 11); - PASS_THROUGH(d, c, 12); - PASS_THROUGH(d, c, 15); - - ADD_S32(d, 16, 19, c, 16); - ADD_S32(d, 17, 18, c, 17); - SUB_S32(d, 17, 18, c, 18); - SUB_S32(d, 16, 19, c, 19); - SUB_S32(d, 23, 20, c, 20); - SUB_S32(d, 22, 21, c, 21); - ADD_S32(d, 22, 21, c, 22); - ADD_S32(d, 23, 20, c, 23); - ADD_S32(d, 24, 27, c, 24); - ADD_S32(d, 25, 26, c, 25); - SUB_S32(d, 25, 26, c, 26); - SUB_S32(d, 24, 27, c, 27); - SUB_S32(d, 31, 28, c, 28); - SUB_S32(d, 30, 29, c, 29); - ADD_S32(d, 30, 29, c, 30); - ADD_S32(d, 31, 28, c, 31); - - // Stage 6. - PASS_THROUGH(c, d, 0); - PASS_THROUGH(c, d, 1); - PASS_THROUGH(c, d, 2); - PASS_THROUGH(c, d, 3); - - BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7); - BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6); - - ADD_S32(c, 8, 9, d, 8); - SUB_S32(c, 8, 9, d, 9); - SUB_S32(c, 11, 10, d, 10); - ADD_S32(c, 11, 10, d, 11); - ADD_S32(c, 12, 13, d, 12); - SUB_S32(c, 12, 13, d, 13); - SUB_S32(c, 15, 14, d, 14); - ADD_S32(c, 15, 14, d, 15); - - PASS_THROUGH(c, d, 16); - PASS_THROUGH(c, d, 19); - PASS_THROUGH(c, d, 20); - PASS_THROUGH(c, d, 23); - PASS_THROUGH(c, d, 24); - PASS_THROUGH(c, d, 27); - PASS_THROUGH(c, d, 28); - PASS_THROUGH(c, d, 31); - - BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17); - BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18); - BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21); - BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22); - - // Stage 7. 
- PASS_THROUGH(d, c, 0); - PASS_THROUGH(d, c, 1); - PASS_THROUGH(d, c, 2); - PASS_THROUGH(d, c, 3); - PASS_THROUGH(d, c, 4); - PASS_THROUGH(d, c, 5); - PASS_THROUGH(d, c, 6); - PASS_THROUGH(d, c, 7); - - BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15); - BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14); - BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13); - BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12); - - ADD_S32(d, 16, 17, c, 16); - SUB_S32(d, 16, 17, c, 17); - SUB_S32(d, 19, 18, c, 18); - ADD_S32(d, 19, 18, c, 19); - ADD_S32(d, 20, 21, c, 20); - SUB_S32(d, 20, 21, c, 21); - SUB_S32(d, 23, 22, c, 22); - ADD_S32(d, 23, 22, c, 23); - ADD_S32(d, 24, 25, c, 24); - SUB_S32(d, 24, 25, c, 25); - SUB_S32(d, 27, 26, c, 26); - ADD_S32(d, 27, 26, c, 27); - ADD_S32(d, 28, 29, c, 28); - SUB_S32(d, 28, 29, c, 29); - SUB_S32(d, 31, 30, c, 30); - ADD_S32(d, 31, 30, c, 31); - - // Final stage. - // Roll rounding into this function so we can pass back int16x8. 
- - out[0] = add_round_shift_s32(c_lo[0], c_hi[0]); - out[16] = add_round_shift_s32(c_lo[1], c_hi[1]); - - out[8] = add_round_shift_s32(c_lo[2], c_hi[2]); - out[24] = add_round_shift_s32(c_lo[3], c_hi[3]); - out[4] = add_round_shift_s32(c_lo[4], c_hi[4]); - out[20] = add_round_shift_s32(c_lo[5], c_hi[5]); - out[12] = add_round_shift_s32(c_lo[6], c_hi[6]); - - out[28] = add_round_shift_s32(c_lo[7], c_hi[7]); - out[2] = add_round_shift_s32(c_lo[8], c_hi[8]); - out[18] = add_round_shift_s32(c_lo[9], c_hi[9]); - out[10] = add_round_shift_s32(c_lo[10], c_hi[10]); - - out[26] = add_round_shift_s32(c_lo[11], c_hi[11]); - out[6] = add_round_shift_s32(c_lo[12], c_hi[12]); - out[22] = add_round_shift_s32(c_lo[13], c_hi[13]); - out[14] = add_round_shift_s32(c_lo[14], c_hi[14]); - out[30] = add_round_shift_s32(c_lo[15], c_hi[15]); - - BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31); - out[1] = add_round_shift_s32(d_lo[1], d_hi[1]); - out[31] = add_round_shift_s32(d_lo[31], d_hi[31]); - - BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15); - out[17] = add_round_shift_s32(d_lo[17], d_hi[17]); - out[15] = add_round_shift_s32(d_lo[15], d_hi[15]); - - BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23); - out[9] = add_round_shift_s32(d_lo[9], d_hi[9]); - out[23] = add_round_shift_s32(d_lo[23], d_hi[23]); - - BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7); - out[25] = add_round_shift_s32(d_lo[25], d_hi[25]); - out[7] = add_round_shift_s32(d_lo[7], d_hi[7]); - - BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27); - out[5] = add_round_shift_s32(d_lo[5], d_hi[5]); - out[27] = add_round_shift_s32(d_lo[27], d_hi[27]); - - BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11); - out[21] = add_round_shift_s32(d_lo[21], d_hi[21]); - out[11] = add_round_shift_s32(d_lo[11], d_hi[11]); - - BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19); - out[13] = add_round_shift_s32(d_lo[13], d_hi[13]); - 
out[19] = add_round_shift_s32(d_lo[19], d_hi[19]); - - BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3); - out[29] = add_round_shift_s32(d_lo[29], d_hi[29]); - out[3] = add_round_shift_s32(d_lo[3], d_hi[3]); -} - -// Add 1 if positive, 2 if negative, and shift by 2. -// In practice, add 1, then add the sign bit, then shift without rounding. -static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { - const int16x8_t one = vdupq_n_s16(1); - const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); - const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); - const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); - return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2); -} - -static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { - int16x8_t a[32]; - int16x8_t b[32]; - - // Stage 1. Done as part of the load for the first pass. - a[0] = vaddq_s16(in[0], in[31]); - a[1] = vaddq_s16(in[1], in[30]); - a[2] = vaddq_s16(in[2], in[29]); - a[3] = vaddq_s16(in[3], in[28]); - a[4] = vaddq_s16(in[4], in[27]); - a[5] = vaddq_s16(in[5], in[26]); - a[6] = vaddq_s16(in[6], in[25]); - a[7] = vaddq_s16(in[7], in[24]); - a[8] = vaddq_s16(in[8], in[23]); - a[9] = vaddq_s16(in[9], in[22]); - a[10] = vaddq_s16(in[10], in[21]); - a[11] = vaddq_s16(in[11], in[20]); - a[12] = vaddq_s16(in[12], in[19]); - a[13] = vaddq_s16(in[13], in[18]); - a[14] = vaddq_s16(in[14], in[17]); - a[15] = vaddq_s16(in[15], in[16]); - a[16] = vsubq_s16(in[15], in[16]); - a[17] = vsubq_s16(in[14], in[17]); - a[18] = vsubq_s16(in[13], in[18]); - a[19] = vsubq_s16(in[12], in[19]); - a[20] = vsubq_s16(in[11], in[20]); - a[21] = vsubq_s16(in[10], in[21]); - a[22] = vsubq_s16(in[9], in[22]); - a[23] = vsubq_s16(in[8], in[23]); - a[24] = vsubq_s16(in[7], in[24]); - a[25] = vsubq_s16(in[6], in[25]); - a[26] = vsubq_s16(in[5], in[26]); - a[27] = vsubq_s16(in[4], in[27]); - a[28] = vsubq_s16(in[3], in[28]); - a[29] = vsubq_s16(in[2], in[29]); - a[30] = vsubq_s16(in[1], 
in[30]); - a[31] = vsubq_s16(in[0], in[31]); - - // Stage 2. - // For the "rd" version, all the values are rounded down after stage 2 to keep - // the values in 16 bits. - b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); - b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); - b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); - b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); - b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); - b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); - b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); - b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); - - b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); - b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); - b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); - b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); - b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); - b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); - b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); - b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); - - b[16] = add_round_shift_s16(a[16]); - b[17] = add_round_shift_s16(a[17]); - b[18] = add_round_shift_s16(a[18]); - b[19] = add_round_shift_s16(a[19]); - - butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); - butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); - butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); - butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); - b[20] = add_round_shift_s16(b[20]); - b[21] = add_round_shift_s16(b[21]); - b[22] = add_round_shift_s16(b[22]); - b[23] = add_round_shift_s16(b[23]); - b[24] = add_round_shift_s16(b[24]); - b[25] = add_round_shift_s16(b[25]); - b[26] = add_round_shift_s16(b[26]); - b[27] = add_round_shift_s16(b[27]); - - b[28] = add_round_shift_s16(a[28]); - b[29] = add_round_shift_s16(a[29]); - b[30] = add_round_shift_s16(a[30]); - b[31] = add_round_shift_s16(a[31]); - - // Stage 3. 
- a[0] = vaddq_s16(b[0], b[7]); - a[1] = vaddq_s16(b[1], b[6]); - a[2] = vaddq_s16(b[2], b[5]); - a[3] = vaddq_s16(b[3], b[4]); - - a[4] = vsubq_s16(b[3], b[4]); - a[5] = vsubq_s16(b[2], b[5]); - a[6] = vsubq_s16(b[1], b[6]); - a[7] = vsubq_s16(b[0], b[7]); - - a[8] = b[8]; - a[9] = b[9]; - - butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]); - butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]); - - a[14] = b[14]; - a[15] = b[15]; - - a[16] = vaddq_s16(b[16], b[23]); - a[17] = vaddq_s16(b[17], b[22]); - a[18] = vaddq_s16(b[18], b[21]); - a[19] = vaddq_s16(b[19], b[20]); - - a[20] = vsubq_s16(b[19], b[20]); - a[21] = vsubq_s16(b[18], b[21]); - a[22] = vsubq_s16(b[17], b[22]); - a[23] = vsubq_s16(b[16], b[23]); - - a[24] = vsubq_s16(b[31], b[24]); - a[25] = vsubq_s16(b[30], b[25]); - a[26] = vsubq_s16(b[29], b[26]); - a[27] = vsubq_s16(b[28], b[27]); - - a[28] = vaddq_s16(b[28], b[27]); - a[29] = vaddq_s16(b[29], b[26]); - a[30] = vaddq_s16(b[30], b[25]); - a[31] = vaddq_s16(b[31], b[24]); - - // Stage 4. 
- b[0] = vaddq_s16(a[0], a[3]); - b[1] = vaddq_s16(a[1], a[2]); - b[2] = vsubq_s16(a[1], a[2]); - b[3] = vsubq_s16(a[0], a[3]); - - b[4] = a[4]; - - butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]); - - b[7] = a[7]; - - b[8] = vaddq_s16(a[8], a[11]); - b[9] = vaddq_s16(a[9], a[10]); - b[10] = vsubq_s16(a[9], a[10]); - b[11] = vsubq_s16(a[8], a[11]); - b[12] = vsubq_s16(a[15], a[12]); - b[13] = vsubq_s16(a[14], a[13]); - b[14] = vaddq_s16(a[14], a[13]); - b[15] = vaddq_s16(a[15], a[12]); - - b[16] = a[16]; - b[17] = a[17]; - - butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]); - butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]); - butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]); - butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]); - - b[22] = a[22]; - b[23] = a[23]; - b[24] = a[24]; - b[25] = a[25]; - - b[30] = a[30]; - b[31] = a[31]; - - // Stage 5. - butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]); - butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]); - - a[4] = vaddq_s16(b[4], b[5]); - a[5] = vsubq_s16(b[4], b[5]); - a[6] = vsubq_s16(b[7], b[6]); - a[7] = vaddq_s16(b[7], b[6]); - - a[8] = b[8]; - - butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]); - butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]); - - a[11] = b[11]; - a[12] = b[12]; - - a[15] = b[15]; - - a[16] = vaddq_s16(b[19], b[16]); - a[17] = vaddq_s16(b[18], b[17]); - a[18] = vsubq_s16(b[17], b[18]); - a[19] = vsubq_s16(b[16], b[19]); - a[20] = vsubq_s16(b[23], b[20]); - a[21] = vsubq_s16(b[22], b[21]); - a[22] = vaddq_s16(b[21], b[22]); - a[23] = vaddq_s16(b[20], b[23]); - a[24] = vaddq_s16(b[27], b[24]); - a[25] = vaddq_s16(b[26], b[25]); - a[26] = vsubq_s16(b[25], b[26]); - a[27] = vsubq_s16(b[24], b[27]); - a[28] = vsubq_s16(b[31], b[28]); - a[29] = vsubq_s16(b[30], b[29]); - a[30] = vaddq_s16(b[29], b[30]); - 
a[31] = vaddq_s16(b[28], b[31]); - - // Stage 6. - b[0] = a[0]; - b[1] = a[1]; - b[2] = a[2]; - b[3] = a[3]; - - butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]); - butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]); - - b[8] = vaddq_s16(a[8], a[9]); - b[9] = vsubq_s16(a[8], a[9]); - b[10] = vsubq_s16(a[11], a[10]); - b[11] = vaddq_s16(a[11], a[10]); - b[12] = vaddq_s16(a[12], a[13]); - b[13] = vsubq_s16(a[12], a[13]); - b[14] = vsubq_s16(a[15], a[14]); - b[15] = vaddq_s16(a[15], a[14]); - - b[16] = a[16]; - b[19] = a[19]; - b[20] = a[20]; - b[23] = a[23]; - b[24] = a[24]; - b[27] = a[27]; - b[28] = a[28]; - b[31] = a[31]; - - butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]); - butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]); - - butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]); - butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]); - - // Stage 7. 
- a[0] = b[0]; - a[1] = b[1]; - a[2] = b[2]; - a[3] = b[3]; - a[4] = b[4]; - a[5] = b[5]; - a[6] = b[6]; - a[7] = b[7]; - - butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]); - butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]); - butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]); - butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]); - - a[16] = vaddq_s16(b[16], b[17]); - a[17] = vsubq_s16(b[16], b[17]); - a[18] = vsubq_s16(b[19], b[18]); - a[19] = vaddq_s16(b[19], b[18]); - a[20] = vaddq_s16(b[20], b[21]); - a[21] = vsubq_s16(b[20], b[21]); - a[22] = vsubq_s16(b[23], b[22]); - a[23] = vaddq_s16(b[23], b[22]); - a[24] = vaddq_s16(b[24], b[25]); - a[25] = vsubq_s16(b[24], b[25]); - a[26] = vsubq_s16(b[27], b[26]); - a[27] = vaddq_s16(b[27], b[26]); - a[28] = vaddq_s16(b[28], b[29]); - a[29] = vsubq_s16(b[28], b[29]); - a[30] = vsubq_s16(b[31], b[30]); - a[31] = vaddq_s16(b[31], b[30]); - - // Final stage. 
- out[0] = a[0]; - out[16] = a[1]; - out[8] = a[2]; - out[24] = a[3]; - out[4] = a[4]; - out[20] = a[5]; - out[12] = a[6]; - out[28] = a[7]; - out[2] = a[8]; - out[18] = a[9]; - out[10] = a[10]; - out[26] = a[11]; - out[6] = a[12]; - out[22] = a[13]; - out[14] = a[14]; - out[30] = a[15]; - - butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]); - butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17], - &out[15]); - butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]); - butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]); - butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]); - butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21], - &out[11]); - butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13], - &out[19]); - butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]); -} - -#undef PASS_THROUGH -#undef ADD_S16_S32 -#undef SUB_S16_S32 -#undef ADDW_S16_S32 -#undef SUBW_S16_S32 -#undef ADD_S32 -#undef SUB_S32 -#undef BUTTERFLY_ONE_S16_S32 -#undef BUTTERFLY_ONE_S32 -#undef BUTTERFLY_TWO_S32 - -// Transpose 8x8 to a new location. Don't use transpose_neon.h because those -// are all in-place. -// TODO(johannkoenig): share with other fdcts. -static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. 
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; -} - void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[32]; int16x8_t temp1[32]; @@ -1324,23 +43,27 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp5[32]; // Process in 8x32 columns. - load(input, stride, temp0); - dct_body_first_pass(temp0, temp1); + load_cross(input, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp1); - load(input + 8, stride, temp0); - dct_body_first_pass(temp0, temp2); + load_cross(input + 8, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp2); - load(input + 16, stride, temp0); - dct_body_first_pass(temp0, temp3); + load_cross(input + 16, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp3); - load(input + 24, stride, temp0); - dct_body_first_pass(temp0, temp4); + load_cross(input + 24, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. 
- transpose_8x8(&temp1[0], &temp0[0]); - transpose_8x8(&temp2[0], &temp0[8]); - transpose_8x8(&temp3[0], &temp0[16]); - transpose_8x8(&temp4[0], &temp0[24]); + transpose_s16_8x8_new(&temp1[0], &temp0[0]); + transpose_s16_8x8_new(&temp2[0], &temp0[8]); + transpose_s16_8x8_new(&temp3[0], &temp0[16]); + transpose_s16_8x8_new(&temp4[0], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1355,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output, temp5); // Second row of 8x32. - transpose_8x8(&temp1[8], &temp0[0]); - transpose_8x8(&temp2[8], &temp0[8]); - transpose_8x8(&temp3[8], &temp0[16]); - transpose_8x8(&temp4[8], &temp0[24]); + transpose_s16_8x8_new(&temp1[8], &temp0[0]); + transpose_s16_8x8_new(&temp2[8], &temp0[8]); + transpose_s16_8x8_new(&temp3[8], &temp0[16]); + transpose_s16_8x8_new(&temp4[8], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1373,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_8x8(&temp1[16], &temp0[0]); - transpose_8x8(&temp2[16], &temp0[8]); - transpose_8x8(&temp3[16], &temp0[16]); - transpose_8x8(&temp4[16], &temp0[24]); + transpose_s16_8x8_new(&temp1[16], &temp0[0]); + transpose_s16_8x8_new(&temp2[16], &temp0[8]); + transpose_s16_8x8_new(&temp3[16], &temp0[16]); + transpose_s16_8x8_new(&temp4[16], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1391,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 16 * 32, temp5); // Final row of 8x32. 
- transpose_8x8(&temp1[24], &temp0[0]); - transpose_8x8(&temp2[24], &temp0[8]); - transpose_8x8(&temp3[24], &temp0[16]); - transpose_8x8(&temp4[24], &temp0[24]); + transpose_s16_8x8_new(&temp1[24], &temp0[0]); + transpose_s16_8x8_new(&temp2[24], &temp0[8]); + transpose_s16_8x8_new(&temp3[24], &temp0[16]); + transpose_s16_8x8_new(&temp4[24], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -1419,23 +142,27 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int16x8_t temp5[32]; // Process in 8x32 columns. - load(input, stride, temp0); - dct_body_first_pass(temp0, temp1); + load_cross(input, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp1); - load(input + 8, stride, temp0); - dct_body_first_pass(temp0, temp2); + load_cross(input + 8, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp2); - load(input + 16, stride, temp0); - dct_body_first_pass(temp0, temp3); + load_cross(input + 16, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp3); - load(input + 24, stride, temp0); - dct_body_first_pass(temp0, temp4); + load_cross(input + 24, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_8x8(&temp1[0], &temp0[0]); - transpose_8x8(&temp2[0], &temp0[8]); - transpose_8x8(&temp3[0], &temp0[16]); - transpose_8x8(&temp4[0], &temp0[24]); + transpose_s16_8x8_new(&temp1[0], &temp0[0]); + transpose_s16_8x8_new(&temp2[0], &temp0[8]); + transpose_s16_8x8_new(&temp3[0], &temp0[16]); + transpose_s16_8x8_new(&temp4[0], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -1450,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output, temp5); // Second row of 8x32. 
- transpose_8x8(&temp1[8], &temp0[0]); - transpose_8x8(&temp2[8], &temp0[8]); - transpose_8x8(&temp3[8], &temp0[16]); - transpose_8x8(&temp4[8], &temp0[24]); + transpose_s16_8x8_new(&temp1[8], &temp0[0]); + transpose_s16_8x8_new(&temp2[8], &temp0[8]); + transpose_s16_8x8_new(&temp3[8], &temp0[16]); + transpose_s16_8x8_new(&temp4[8], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -1468,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_8x8(&temp1[16], &temp0[0]); - transpose_8x8(&temp2[16], &temp0[8]); - transpose_8x8(&temp3[16], &temp0[16]); - transpose_8x8(&temp4[16], &temp0[24]); + transpose_s16_8x8_new(&temp1[16], &temp0[0]); + transpose_s16_8x8_new(&temp2[16], &temp0[8]); + transpose_s16_8x8_new(&temp3[16], &temp0[16]); + transpose_s16_8x8_new(&temp4[16], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -1486,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 16 * 32, temp5); // Final row of 8x32. 
- transpose_8x8(&temp1[24], &temp0[0]); - transpose_8x8(&temp2[24], &temp0[8]); - transpose_8x8(&temp3[24], &temp0[16]); - transpose_8x8(&temp4[24], &temp0[24]); + transpose_s16_8x8_new(&temp1[24], &temp0[0]); + transpose_s16_8x8_new(&temp2[24], &temp0[8]); + transpose_s16_8x8_new(&temp3[24], &temp0[16]); + transpose_s16_8x8_new(&temp4[24], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -1503,5 +230,190 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, &temp5[29], &temp5[30], &temp5[31]); store(output + 24 * 32, temp5); } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32], + right3[32], right4[32]; + int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32], + left8[32], right8[32]; + int32x4_t temp1[32], temp2[32]; + + // Process in 8x32 columns. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + highbd_dct8x32_body_first_pass(left1, right1); + highbd_partial_sub_round_shift(left1, right1); + + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + highbd_dct8x32_body_first_pass(left2, right2); + highbd_partial_sub_round_shift(left2, right2); + + load_cross(input + 16, stride, temp0); + highbd_scale_input(temp0, left3, right3); + highbd_dct8x32_body_first_pass(left3, right3); + highbd_partial_sub_round_shift(left3, right3); + + load_cross(input + 24, stride, temp0); + highbd_scale_input(temp0, left4, right4); + highbd_dct8x32_body_first_pass(left4, right4); + highbd_partial_sub_round_shift(left4, right4); + + // Generate the top row by munging the first set of 8 from each one together. 
+ transpose_s32_8x8_2(left1, right1, temp1, temp2); + transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left5, right5); + highbd_dct8x32_body_second_pass(left5, right5); + highbd_partial_add_round_shift(left5, right5); + + // Second row of 8x32. + transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2); + transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left6, right6); + highbd_dct8x32_body_second_pass(left6, right6); + highbd_partial_add_round_shift(left6, right6); + + // Third row of 8x32 + transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2); + transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left7, right7); + highbd_dct8x32_body_second_pass(left7, right7); + highbd_partial_add_round_shift(left7, right7); + + // Final row of 8x32. 
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2); + transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left8, right8); + highbd_dct8x32_body_second_pass(left8, right8); + highbd_partial_add_round_shift(left8, right8); + + // Final transpose + transpose_s32_8x8_2(left5, right5, left1, right1); + transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2); + transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3); + transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4); + transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8); + transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8); + transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16); + transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16); + transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16); + transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16); + transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24); + transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24); + transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24); + transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24); + + store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4, + right4); +} + +void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32], + right3[32], right4[32]; + int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32], + left8[32], right8[32]; + int32x4_t temp1[32], temp2[32]; + + // Process in 
8x32 columns. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + highbd_dct8x32_body_first_pass(left1, right1); + highbd_partial_sub_round_shift(left1, right1); + + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + highbd_dct8x32_body_first_pass(left2, right2); + highbd_partial_sub_round_shift(left2, right2); + + load_cross(input + 16, stride, temp0); + highbd_scale_input(temp0, left3, right3); + highbd_dct8x32_body_first_pass(left3, right3); + highbd_partial_sub_round_shift(left3, right3); + + load_cross(input + 24, stride, temp0); + highbd_scale_input(temp0, left4, right4); + highbd_dct8x32_body_first_pass(left4, right4); + highbd_partial_sub_round_shift(left4, right4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s32_8x8_2(left1, right1, temp1, temp2); + transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left5, right5); + highbd_dct8x32_body_second_pass_rd(left5, right5); + + // Second row of 8x32. 
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2); + transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left6, right6); + highbd_dct8x32_body_second_pass_rd(left6, right6); + + // Third row of 8x32 + transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2); + transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left7, right7); + highbd_dct8x32_body_second_pass_rd(left7, right7); + + // Final row of 8x32. + transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2); + transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left8, right8); + highbd_dct8x32_body_second_pass_rd(left8, right8); + + // Final transpose + transpose_s32_8x8_2(left5, right5, left1, right1); + transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2); + transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3); + transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4); + transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8); + transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8); + transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16); + transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16); + transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16); + transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 
16); + transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24); + transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24); + transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24); + transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24); + + store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4, + right4); +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && // __GNUC__ == 4 && __GNUC_MINOR__ <= 9 diff --git a/libvpx/vpx_dsp/arm/fdct32x32_neon.h b/libvpx/vpx_dsp/arm/fdct32x32_neon.h new file mode 100644 index 000000000..3b9e64c6d --- /dev/null +++ b/libvpx/vpx_dsp/arm/fdct32x32_neon.h @@ -0,0 +1,2919 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" + +// Load & cross the first 8 and last 8, then the middle +static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) { + b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride)); + b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride)); + b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride)); + b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride)); + b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride)); + b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride)); + b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride)); + b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride)); + + b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride)); + b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride)); + b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride)); + b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride)); + b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride)); + b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride)); + b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride)); + b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride)); + + b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride)); + b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride)); + b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride)); + b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride)); + b[12] = 
vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride)); + b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride)); + b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride)); + b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride)); + + b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride)); + b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride)); + b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride)); + b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride)); + b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride)); + b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride)); + b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride)); + b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride)); +} + +#define STORE_S16(src, index, dest) \ + do { \ + store_s16q_to_tran_low(dest, src[index]); \ + dest += 8; \ + } while (0) + +// Store 32 16x8 values, assuming stride == 32. +// Slight twist: store horizontally in blocks of 8. 
+static INLINE void store(tran_low_t *a, const int16x8_t *b) { + STORE_S16(b, 0, a); + STORE_S16(b, 8, a); + STORE_S16(b, 16, a); + STORE_S16(b, 24, a); + STORE_S16(b, 1, a); + STORE_S16(b, 9, a); + STORE_S16(b, 17, a); + STORE_S16(b, 25, a); + STORE_S16(b, 2, a); + STORE_S16(b, 10, a); + STORE_S16(b, 18, a); + STORE_S16(b, 26, a); + STORE_S16(b, 3, a); + STORE_S16(b, 11, a); + STORE_S16(b, 19, a); + STORE_S16(b, 27, a); + STORE_S16(b, 4, a); + STORE_S16(b, 12, a); + STORE_S16(b, 20, a); + STORE_S16(b, 28, a); + STORE_S16(b, 5, a); + STORE_S16(b, 13, a); + STORE_S16(b, 21, a); + STORE_S16(b, 29, a); + STORE_S16(b, 6, a); + STORE_S16(b, 14, a); + STORE_S16(b, 22, a); + STORE_S16(b, 30, a); + STORE_S16(b, 7, a); + STORE_S16(b, 15, a); + STORE_S16(b, 23, a); + STORE_S16(b, 31, a); +} + +#undef STORE_S16 + +static INLINE void scale_input(const int16x8_t *in /*32*/, + int16x8_t *out /*32*/) { + out[0] = vshlq_n_s16(in[0], 2); + out[1] = vshlq_n_s16(in[1], 2); + out[2] = vshlq_n_s16(in[2], 2); + out[3] = vshlq_n_s16(in[3], 2); + out[4] = vshlq_n_s16(in[4], 2); + out[5] = vshlq_n_s16(in[5], 2); + out[6] = vshlq_n_s16(in[6], 2); + out[7] = vshlq_n_s16(in[7], 2); + + out[8] = vshlq_n_s16(in[8], 2); + out[9] = vshlq_n_s16(in[9], 2); + out[10] = vshlq_n_s16(in[10], 2); + out[11] = vshlq_n_s16(in[11], 2); + out[12] = vshlq_n_s16(in[12], 2); + out[13] = vshlq_n_s16(in[13], 2); + out[14] = vshlq_n_s16(in[14], 2); + out[15] = vshlq_n_s16(in[15], 2); + + out[16] = vshlq_n_s16(in[16], 2); + out[17] = vshlq_n_s16(in[17], 2); + out[18] = vshlq_n_s16(in[18], 2); + out[19] = vshlq_n_s16(in[19], 2); + out[20] = vshlq_n_s16(in[20], 2); + out[21] = vshlq_n_s16(in[21], 2); + out[22] = vshlq_n_s16(in[22], 2); + out[23] = vshlq_n_s16(in[23], 2); + + out[24] = vshlq_n_s16(in[24], 2); + out[25] = vshlq_n_s16(in[25], 2); + out[26] = vshlq_n_s16(in[26], 2); + out[27] = vshlq_n_s16(in[27], 2); + out[28] = vshlq_n_s16(in[28], 2); + out[29] = vshlq_n_s16(in[29], 2); + out[30] = vshlq_n_s16(in[30], 
2); + out[31] = vshlq_n_s16(in[31], 2); +} + +static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. + a[0] = vaddq_s16(in[0], in[15]); + a[1] = vaddq_s16(in[1], in[14]); + a[2] = vaddq_s16(in[2], in[13]); + a[3] = vaddq_s16(in[3], in[12]); + a[4] = vaddq_s16(in[4], in[11]); + a[5] = vaddq_s16(in[5], in[10]); + a[6] = vaddq_s16(in[6], in[9]); + a[7] = vaddq_s16(in[7], in[8]); + + a[8] = vsubq_s16(in[7], in[8]); + a[9] = vsubq_s16(in[6], in[9]); + a[10] = vsubq_s16(in[5], in[10]); + a[11] = vsubq_s16(in[4], in[11]); + a[12] = vsubq_s16(in[3], in[12]); + a[13] = vsubq_s16(in[2], in[13]); + a[14] = vsubq_s16(in[1], in[14]); + a[15] = vsubq_s16(in[0], in[15]); + + a[16] = in[16]; + a[17] = in[17]; + a[18] = in[18]; + a[19] = in[19]; + + butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27], + &a[20]); + butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26], + &a[21]); + butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25], + &a[22]); + butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24], + &a[23]); + + a[28] = in[28]; + a[29] = in[29]; + a[30] = in[30]; + a[31] = in[31]; + + // Stage 3. 
+ b[0] = vaddq_s16(a[0], a[7]); + b[1] = vaddq_s16(a[1], a[6]); + b[2] = vaddq_s16(a[2], a[5]); + b[3] = vaddq_s16(a[3], a[4]); + + b[4] = vsubq_s16(a[3], a[4]); + b[5] = vsubq_s16(a[2], a[5]); + b[6] = vsubq_s16(a[1], a[6]); + b[7] = vsubq_s16(a[0], a[7]); + + b[8] = a[8]; + b[9] = a[9]; + + butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]); + butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]); + + b[14] = a[14]; + b[15] = a[15]; + + b[16] = vaddq_s16(in[16], a[23]); + b[17] = vaddq_s16(in[17], a[22]); + b[18] = vaddq_s16(in[18], a[21]); + b[19] = vaddq_s16(in[19], a[20]); + + b[20] = vsubq_s16(in[19], a[20]); + b[21] = vsubq_s16(in[18], a[21]); + b[22] = vsubq_s16(in[17], a[22]); + b[23] = vsubq_s16(in[16], a[23]); + + b[24] = vsubq_s16(in[31], a[24]); + b[25] = vsubq_s16(in[30], a[25]); + b[26] = vsubq_s16(in[29], a[26]); + b[27] = vsubq_s16(in[28], a[27]); + + b[28] = vaddq_s16(in[28], a[27]); + b[29] = vaddq_s16(in[29], a[26]); + b[30] = vaddq_s16(in[30], a[25]); + b[31] = vaddq_s16(in[31], a[24]); + + // Stage 4. 
+ a[0] = vaddq_s16(b[0], b[3]); + a[1] = vaddq_s16(b[1], b[2]); + a[2] = vsubq_s16(b[1], b[2]); + a[3] = vsubq_s16(b[0], b[3]); + + a[4] = b[4]; + + butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]); + + a[7] = b[7]; + + a[8] = vaddq_s16(b[8], b[11]); + a[9] = vaddq_s16(b[9], b[10]); + a[10] = vsubq_s16(b[9], b[10]); + a[11] = vsubq_s16(b[8], b[11]); + a[12] = vsubq_s16(b[15], b[12]); + a[13] = vsubq_s16(b[14], b[13]); + a[14] = vaddq_s16(b[14], b[13]); + a[15] = vaddq_s16(b[15], b[12]); + + a[16] = b[16]; + a[17] = b[17]; + + butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]); + butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]); + butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]); + butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]); + + a[22] = b[22]; + a[23] = b[23]; + a[24] = b[24]; + a[25] = b[25]; + + a[30] = b[30]; + a[31] = b[31]; + + // Stage 5. + butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]); + butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]); + + b[4] = vaddq_s16(a[4], a[5]); + b[5] = vsubq_s16(a[4], a[5]); + b[6] = vsubq_s16(a[7], a[6]); + b[7] = vaddq_s16(a[7], a[6]); + + b[8] = a[8]; + + butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]); + butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]); + + b[11] = a[11]; + b[12] = a[12]; + + b[15] = a[15]; + + b[16] = vaddq_s16(a[19], a[16]); + b[17] = vaddq_s16(a[18], a[17]); + b[18] = vsubq_s16(a[17], a[18]); + b[19] = vsubq_s16(a[16], a[19]); + b[20] = vsubq_s16(a[23], a[20]); + b[21] = vsubq_s16(a[22], a[21]); + b[22] = vaddq_s16(a[21], a[22]); + b[23] = vaddq_s16(a[20], a[23]); + b[24] = vaddq_s16(a[27], a[24]); + b[25] = vaddq_s16(a[26], a[25]); + b[26] = vsubq_s16(a[25], a[26]); + b[27] = vsubq_s16(a[24], a[27]); + b[28] = vsubq_s16(a[31], a[28]); + b[29] = vsubq_s16(a[30], a[29]); + b[30] = 
vaddq_s16(a[29], a[30]); + b[31] = vaddq_s16(a[28], a[31]); + + // Stage 6. + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + + butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]); + butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]); + + a[8] = vaddq_s16(b[8], b[9]); + a[9] = vsubq_s16(b[8], b[9]); + a[10] = vsubq_s16(b[11], b[10]); + a[11] = vaddq_s16(b[11], b[10]); + a[12] = vaddq_s16(b[12], b[13]); + a[13] = vsubq_s16(b[12], b[13]); + a[14] = vsubq_s16(b[15], b[14]); + a[15] = vaddq_s16(b[15], b[14]); + + a[16] = b[16]; + a[19] = b[19]; + a[20] = b[20]; + a[23] = b[23]; + a[24] = b[24]; + a[27] = b[27]; + a[28] = b[28]; + a[31] = b[31]; + + butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]); + butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]); + + butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]); + butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]); + + // Stage 7. 
+ b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + b[4] = a[4]; + b[5] = a[5]; + b[6] = a[6]; + b[7] = a[7]; + + butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]); + butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]); + butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]); + butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]); + + b[16] = vaddq_s16(a[16], a[17]); + b[17] = vsubq_s16(a[16], a[17]); + b[18] = vsubq_s16(a[19], a[18]); + b[19] = vaddq_s16(a[19], a[18]); + b[20] = vaddq_s16(a[20], a[21]); + b[21] = vsubq_s16(a[20], a[21]); + b[22] = vsubq_s16(a[23], a[22]); + b[23] = vaddq_s16(a[23], a[22]); + b[24] = vaddq_s16(a[24], a[25]); + b[25] = vsubq_s16(a[24], a[25]); + b[26] = vsubq_s16(a[27], a[26]); + b[27] = vaddq_s16(a[27], a[26]); + b[28] = vaddq_s16(a[28], a[29]); + b[29] = vsubq_s16(a[28], a[29]); + b[30] = vsubq_s16(a[31], a[30]); + b[31] = vaddq_s16(a[31], a[30]); + + // Final stage. 
+ // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[0] = sub_round_shift_s16(b[0]); + out[16] = sub_round_shift_s16(b[1]); + out[8] = sub_round_shift_s16(b[2]); + out[24] = sub_round_shift_s16(b[3]); + out[4] = sub_round_shift_s16(b[4]); + out[20] = sub_round_shift_s16(b[5]); + out[12] = sub_round_shift_s16(b[6]); + out[28] = sub_round_shift_s16(b[7]); + out[2] = sub_round_shift_s16(b[8]); + out[18] = sub_round_shift_s16(b[9]); + out[10] = sub_round_shift_s16(b[10]); + out[26] = sub_round_shift_s16(b[11]); + out[6] = sub_round_shift_s16(b[12]); + out[22] = sub_round_shift_s16(b[13]); + out[14] = sub_round_shift_s16(b[14]); + out[30] = sub_round_shift_s16(b[15]); + + butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]); + out[1] = sub_round_shift_s16(a[1]); + out[31] = sub_round_shift_s16(a[31]); + + butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]); + out[17] = sub_round_shift_s16(a[17]); + out[15] = sub_round_shift_s16(a[15]); + + butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]); + out[9] = sub_round_shift_s16(a[9]); + out[23] = sub_round_shift_s16(a[23]); + + butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]); + out[25] = sub_round_shift_s16(a[25]); + out[7] = sub_round_shift_s16(a[7]); + + butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]); + out[5] = sub_round_shift_s16(a[5]); + out[27] = sub_round_shift_s16(a[27]); + + butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]); + out[21] = sub_round_shift_s16(a[21]); + out[11] = sub_round_shift_s16(a[11]); + + butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]); + out[13] = sub_round_shift_s16(a[13]); + out[19] = sub_round_shift_s16(a[19]); + + butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]); + out[29] = sub_round_shift_s16(a[29]); + out[3] = 
sub_round_shift_s16(a[3]); +} + +#define PASS_THROUGH(src, dst, element) \ + do { \ + dst##_lo[element] = src##_lo[element]; \ + dst##_hi[element] = src##_hi[element]; \ + } while (0) + +#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ + do { \ + c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ + c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ + } while (0) + +#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ + do { \ + temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ + temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ + c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ + c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ + } while (0) + +#define ADD_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define SUB_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ + add_index, sub_index) \ + do { \ + butterfly_one_coeff_s16_s32(a[left_index], 
a[right_index], constant, \ + &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ + sub_index) \ + do { \ + butterfly_one_coeff_s32_fast( \ + a##_lo[left_index], a##_hi[left_index], a##_lo[right_index], \ + a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ + right_constant, b, add_index, sub_index) \ + do { \ + butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + left_constant, right_constant, &b##_lo[add_index], \ + &b##_hi[add_index], &b##_lo[sub_index], \ + &b##_hi[sub_index]); \ + } while (0) + +static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + int32x4_t c_lo[32]; + int32x4_t c_hi[32]; + int32x4_t d_lo[32]; + int32x4_t d_hi[32]; + + // Stage 1. Done as part of the load for the first pass. 
+ a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. 
+ b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + + b[16] = a[16]; + b[17] = a[17]; + b[18] = a[18]; + b[19] = a[19]; + + butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]); + + b[28] = a[28]; + b[29] = a[29]; + b[30] = a[30]; + b[31] = a[31]; + + // Stage 3. With extreme values for input this calculation rolls over int16_t. + // The sources for b[0] get added multiple times and, through testing, have + // been shown to overflow starting here. 
+ ADD_S16_S32(b, 0, 7, c, 0); + ADD_S16_S32(b, 1, 6, c, 1); + ADD_S16_S32(b, 2, 5, c, 2); + ADD_S16_S32(b, 3, 4, c, 3); + SUB_S16_S32(b, 3, 4, c, 4); + SUB_S16_S32(b, 2, 5, c, 5); + SUB_S16_S32(b, 1, 6, c, 6); + SUB_S16_S32(b, 0, 7, c, 7); + + a[8] = b[8]; + a[9] = b[9]; + + BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); + BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); + + a[14] = b[14]; + a[15] = b[15]; + + ADD_S16_S32(b, 16, 23, c, 16); + ADD_S16_S32(b, 17, 22, c, 17); + ADD_S16_S32(b, 18, 21, c, 18); + ADD_S16_S32(b, 19, 20, c, 19); + SUB_S16_S32(b, 19, 20, c, 20); + SUB_S16_S32(b, 18, 21, c, 21); + SUB_S16_S32(b, 17, 22, c, 22); + SUB_S16_S32(b, 16, 23, c, 23); + SUB_S16_S32(b, 31, 24, c, 24); + SUB_S16_S32(b, 30, 25, c, 25); + SUB_S16_S32(b, 29, 26, c, 26); + SUB_S16_S32(b, 28, 27, c, 27); + ADD_S16_S32(b, 28, 27, c, 28); + ADD_S16_S32(b, 29, 26, c, 29); + ADD_S16_S32(b, 30, 25, c, 30); + ADD_S16_S32(b, 31, 24, c, 31); + + // Stage 4. + ADD_S32(c, 0, 3, d, 0); + ADD_S32(c, 1, 2, d, 1); + SUB_S32(c, 1, 2, d, 2); + SUB_S32(c, 0, 3, d, 3); + + PASS_THROUGH(c, d, 4); + + BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); + + PASS_THROUGH(c, d, 7); + + ADDW_S16_S32(c, 11, a, 8, d, 8); + ADDW_S16_S32(c, 10, a, 9, d, 9); + SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); + SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); + SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); + SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); + ADDW_S16_S32(c, 13, b, 14, d, 14); + ADDW_S16_S32(c, 12, b, 15, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 17); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19); + BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21); + + PASS_THROUGH(c, d, 22); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 25); + + PASS_THROUGH(c, d, 30); + PASS_THROUGH(c, d, 31); + + 
// Stage 5. + BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); + BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3); + + ADD_S32(d, 4, 5, c, 4); + SUB_S32(d, 4, 5, c, 5); + SUB_S32(d, 7, 6, c, 6); + ADD_S32(d, 7, 6, c, 7); + + PASS_THROUGH(d, c, 8); + + BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10); + + PASS_THROUGH(d, c, 11); + PASS_THROUGH(d, c, 12); + PASS_THROUGH(d, c, 15); + + ADD_S32(d, 16, 19, c, 16); + ADD_S32(d, 17, 18, c, 17); + SUB_S32(d, 17, 18, c, 18); + SUB_S32(d, 16, 19, c, 19); + SUB_S32(d, 23, 20, c, 20); + SUB_S32(d, 22, 21, c, 21); + ADD_S32(d, 22, 21, c, 22); + ADD_S32(d, 23, 20, c, 23); + ADD_S32(d, 24, 27, c, 24); + ADD_S32(d, 25, 26, c, 25); + SUB_S32(d, 25, 26, c, 26); + SUB_S32(d, 24, 27, c, 27); + SUB_S32(d, 31, 28, c, 28); + SUB_S32(d, 30, 29, c, 29); + ADD_S32(d, 30, 29, c, 30); + ADD_S32(d, 31, 28, c, 31); + + // Stage 6. + PASS_THROUGH(c, d, 0); + PASS_THROUGH(c, d, 1); + PASS_THROUGH(c, d, 2); + PASS_THROUGH(c, d, 3); + + BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7); + BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6); + + ADD_S32(c, 8, 9, d, 8); + SUB_S32(c, 8, 9, d, 9); + SUB_S32(c, 11, 10, d, 10); + ADD_S32(c, 11, 10, d, 11); + ADD_S32(c, 12, 13, d, 12); + SUB_S32(c, 12, 13, d, 13); + SUB_S32(c, 15, 14, d, 14); + ADD_S32(c, 15, 14, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 19); + PASS_THROUGH(c, d, 20); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 27); + PASS_THROUGH(c, d, 28); + PASS_THROUGH(c, d, 31); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17); + BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21); + BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22); + + // Stage 7. 
+ PASS_THROUGH(d, c, 0); + PASS_THROUGH(d, c, 1); + PASS_THROUGH(d, c, 2); + PASS_THROUGH(d, c, 3); + PASS_THROUGH(d, c, 4); + PASS_THROUGH(d, c, 5); + PASS_THROUGH(d, c, 6); + PASS_THROUGH(d, c, 7); + + BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15); + BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13); + BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12); + + ADD_S32(d, 16, 17, c, 16); + SUB_S32(d, 16, 17, c, 17); + SUB_S32(d, 19, 18, c, 18); + ADD_S32(d, 19, 18, c, 19); + ADD_S32(d, 20, 21, c, 20); + SUB_S32(d, 20, 21, c, 21); + SUB_S32(d, 23, 22, c, 22); + ADD_S32(d, 23, 22, c, 23); + ADD_S32(d, 24, 25, c, 24); + SUB_S32(d, 24, 25, c, 25); + SUB_S32(d, 27, 26, c, 26); + ADD_S32(d, 27, 26, c, 27); + ADD_S32(d, 28, 29, c, 28); + SUB_S32(d, 28, 29, c, 29); + SUB_S32(d, 31, 30, c, 30); + ADD_S32(d, 31, 30, c, 31); + + // Final stage. + // Roll rounding into this function so we can pass back int16x8. 
+ + out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]); + out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]); + + out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]); + out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]); + out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]); + out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]); + out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]); + + out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]); + out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]); + out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]); + out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]); + + out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]); + out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]); + out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]); + out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]); + out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]); + + BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31); + out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]); + out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15); + out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]); + out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23); + out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]); + out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]); + + BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7); + out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]); + out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]); + + BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27); + out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]); + out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]); + + BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11); + out[21] = add_round_shift_s32_narrow(d_lo[21], 
d_hi[21]); + out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]); + + BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19); + out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]); + out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]); + + BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3); + out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]); + out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]); +} + +static INLINE void dct_body_second_pass_rd(const int16x8_t *in, + int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1. Done as part of the load for the first pass. + a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. 
+ b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); + b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); + b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); + b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); + b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); + b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); + b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); + b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); + + b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); + b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); + b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); + b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); + b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); + b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); + b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); + b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); + + b[16] = add_round_shift_s16(a[16]); + b[17] = add_round_shift_s16(a[17]); + b[18] = add_round_shift_s16(a[18]); + b[19] = add_round_shift_s16(a[19]); + + butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]); + b[20] = add_round_shift_s16(b[20]); + b[21] = add_round_shift_s16(b[21]); + b[22] = add_round_shift_s16(b[22]); + b[23] = add_round_shift_s16(b[23]); + b[24] = add_round_shift_s16(b[24]); + b[25] = add_round_shift_s16(b[25]); + b[26] = add_round_shift_s16(b[26]); + b[27] = add_round_shift_s16(b[27]); + + b[28] = add_round_shift_s16(a[28]); + b[29] = add_round_shift_s16(a[29]); + b[30] = add_round_shift_s16(a[30]); + b[31] = add_round_shift_s16(a[31]); + + // Stage 3. 
+ a[0] = vaddq_s16(b[0], b[7]); + a[1] = vaddq_s16(b[1], b[6]); + a[2] = vaddq_s16(b[2], b[5]); + a[3] = vaddq_s16(b[3], b[4]); + + a[4] = vsubq_s16(b[3], b[4]); + a[5] = vsubq_s16(b[2], b[5]); + a[6] = vsubq_s16(b[1], b[6]); + a[7] = vsubq_s16(b[0], b[7]); + + a[8] = b[8]; + a[9] = b[9]; + + butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]); + butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]); + + a[14] = b[14]; + a[15] = b[15]; + + a[16] = vaddq_s16(b[16], b[23]); + a[17] = vaddq_s16(b[17], b[22]); + a[18] = vaddq_s16(b[18], b[21]); + a[19] = vaddq_s16(b[19], b[20]); + + a[20] = vsubq_s16(b[19], b[20]); + a[21] = vsubq_s16(b[18], b[21]); + a[22] = vsubq_s16(b[17], b[22]); + a[23] = vsubq_s16(b[16], b[23]); + + a[24] = vsubq_s16(b[31], b[24]); + a[25] = vsubq_s16(b[30], b[25]); + a[26] = vsubq_s16(b[29], b[26]); + a[27] = vsubq_s16(b[28], b[27]); + + a[28] = vaddq_s16(b[28], b[27]); + a[29] = vaddq_s16(b[29], b[26]); + a[30] = vaddq_s16(b[30], b[25]); + a[31] = vaddq_s16(b[31], b[24]); + + // Stage 4. 
+ b[0] = vaddq_s16(a[0], a[3]); + b[1] = vaddq_s16(a[1], a[2]); + b[2] = vsubq_s16(a[1], a[2]); + b[3] = vsubq_s16(a[0], a[3]); + + b[4] = a[4]; + + butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]); + + b[7] = a[7]; + + b[8] = vaddq_s16(a[8], a[11]); + b[9] = vaddq_s16(a[9], a[10]); + b[10] = vsubq_s16(a[9], a[10]); + b[11] = vsubq_s16(a[8], a[11]); + b[12] = vsubq_s16(a[15], a[12]); + b[13] = vsubq_s16(a[14], a[13]); + b[14] = vaddq_s16(a[14], a[13]); + b[15] = vaddq_s16(a[15], a[12]); + + b[16] = a[16]; + b[17] = a[17]; + + butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]); + butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]); + butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]); + butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]); + + b[22] = a[22]; + b[23] = a[23]; + b[24] = a[24]; + b[25] = a[25]; + + b[30] = a[30]; + b[31] = a[31]; + + // Stage 5. + butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]); + butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]); + + a[4] = vaddq_s16(b[4], b[5]); + a[5] = vsubq_s16(b[4], b[5]); + a[6] = vsubq_s16(b[7], b[6]); + a[7] = vaddq_s16(b[7], b[6]); + + a[8] = b[8]; + + butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]); + butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]); + + a[11] = b[11]; + a[12] = b[12]; + + a[15] = b[15]; + + a[16] = vaddq_s16(b[19], b[16]); + a[17] = vaddq_s16(b[18], b[17]); + a[18] = vsubq_s16(b[17], b[18]); + a[19] = vsubq_s16(b[16], b[19]); + a[20] = vsubq_s16(b[23], b[20]); + a[21] = vsubq_s16(b[22], b[21]); + a[22] = vaddq_s16(b[21], b[22]); + a[23] = vaddq_s16(b[20], b[23]); + a[24] = vaddq_s16(b[27], b[24]); + a[25] = vaddq_s16(b[26], b[25]); + a[26] = vsubq_s16(b[25], b[26]); + a[27] = vsubq_s16(b[24], b[27]); + a[28] = vsubq_s16(b[31], b[28]); + a[29] = vsubq_s16(b[30], b[29]); + 
a[30] = vaddq_s16(b[29], b[30]); + a[31] = vaddq_s16(b[28], b[31]); + + // Stage 6. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + + butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]); + butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]); + + b[8] = vaddq_s16(a[8], a[9]); + b[9] = vsubq_s16(a[8], a[9]); + b[10] = vsubq_s16(a[11], a[10]); + b[11] = vaddq_s16(a[11], a[10]); + b[12] = vaddq_s16(a[12], a[13]); + b[13] = vsubq_s16(a[12], a[13]); + b[14] = vsubq_s16(a[15], a[14]); + b[15] = vaddq_s16(a[15], a[14]); + + b[16] = a[16]; + b[19] = a[19]; + b[20] = a[20]; + b[23] = a[23]; + b[24] = a[24]; + b[27] = a[27]; + b[28] = a[28]; + b[31] = a[31]; + + butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]); + butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]); + + butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]); + butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]); + + // Stage 7. 
+ a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + a[4] = b[4]; + a[5] = b[5]; + a[6] = b[6]; + a[7] = b[7]; + + butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]); + butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]); + butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]); + butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]); + + a[16] = vaddq_s16(b[16], b[17]); + a[17] = vsubq_s16(b[16], b[17]); + a[18] = vsubq_s16(b[19], b[18]); + a[19] = vaddq_s16(b[19], b[18]); + a[20] = vaddq_s16(b[20], b[21]); + a[21] = vsubq_s16(b[20], b[21]); + a[22] = vsubq_s16(b[23], b[22]); + a[23] = vaddq_s16(b[23], b[22]); + a[24] = vaddq_s16(b[24], b[25]); + a[25] = vsubq_s16(b[24], b[25]); + a[26] = vsubq_s16(b[27], b[26]); + a[27] = vaddq_s16(b[27], b[26]); + a[28] = vaddq_s16(b[28], b[29]); + a[29] = vsubq_s16(b[28], b[29]); + a[30] = vsubq_s16(b[31], b[30]); + a[31] = vaddq_s16(b[31], b[30]); + + // Final stage. 
+ out[0] = a[0]; + out[16] = a[1]; + out[8] = a[2]; + out[24] = a[3]; + out[4] = a[4]; + out[20] = a[5]; + out[12] = a[6]; + out[28] = a[7]; + out[2] = a[8]; + out[18] = a[9]; + out[10] = a[10]; + out[26] = a[11]; + out[6] = a[12]; + out[22] = a[13]; + out[14] = a[14]; + out[30] = a[15]; + + butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]); + butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17], + &out[15]); + butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]); + butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]); + butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]); + butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21], + &out[11]); + butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13], + &out[19]); + butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]); +} + +#undef PASS_THROUGH +#undef ADD_S16_S32 +#undef SUB_S16_S32 +#undef ADDW_S16_S32 +#undef SUBW_S16_S32 +#undef ADD_S32 +#undef SUB_S32 +#undef BUTTERFLY_ONE_S16_S32 +#undef BUTTERFLY_ONE_S32 +#undef BUTTERFLY_TWO_S32 + +#if CONFIG_VP9_HIGHBITDEPTH + +// Store 32 32x4 vectors, assuming stride == 32. 
+static INLINE void store32x32_s32( + tran_low_t *a, const int32x4_t *l1 /*[16]*/, const int32x4_t *r1 /*[16]*/, + const int32x4_t *l2 /*[16]*/, const int32x4_t *r2 /*[16]*/, + const int32x4_t *l3 /*[16]*/, const int32x4_t *r3 /*[16]*/, + const int32x4_t *l4 /*[16]*/, const int32x4_t *r4 /*[16]*/) { + int i; + for (i = 0; i < 32; i++) { + vst1q_s32(a, l1[i]); + vst1q_s32(a + 4, r1[i]); + vst1q_s32(a + 8, l2[i]); + vst1q_s32(a + 12, r2[i]); + vst1q_s32(a + 16, l3[i]); + vst1q_s32(a + 20, r3[i]); + vst1q_s32(a + 24, l4[i]); + vst1q_s32(a + 28, r4[i]); + a += 32; + } +} + +static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/, + int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + left[0] = vshll_n_s16(vget_low_s16(a[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(a[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(a[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(a[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(a[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(a[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(a[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(a[7]), 2); + left[8] = vshll_n_s16(vget_low_s16(a[8]), 2); + left[9] = vshll_n_s16(vget_low_s16(a[9]), 2); + left[10] = vshll_n_s16(vget_low_s16(a[10]), 2); + left[11] = vshll_n_s16(vget_low_s16(a[11]), 2); + left[12] = vshll_n_s16(vget_low_s16(a[12]), 2); + left[13] = vshll_n_s16(vget_low_s16(a[13]), 2); + left[14] = vshll_n_s16(vget_low_s16(a[14]), 2); + left[15] = vshll_n_s16(vget_low_s16(a[15]), 2); + left[16] = vshll_n_s16(vget_low_s16(a[16]), 2); + left[17] = vshll_n_s16(vget_low_s16(a[17]), 2); + left[18] = vshll_n_s16(vget_low_s16(a[18]), 2); + left[19] = vshll_n_s16(vget_low_s16(a[19]), 2); + left[20] = vshll_n_s16(vget_low_s16(a[20]), 2); + left[21] = vshll_n_s16(vget_low_s16(a[21]), 2); + left[22] = vshll_n_s16(vget_low_s16(a[22]), 2); + left[23] = vshll_n_s16(vget_low_s16(a[23]), 2); + left[24] = vshll_n_s16(vget_low_s16(a[24]), 2); + left[25] = vshll_n_s16(vget_low_s16(a[25]), 2); + left[26] = 
vshll_n_s16(vget_low_s16(a[26]), 2); + left[27] = vshll_n_s16(vget_low_s16(a[27]), 2); + left[28] = vshll_n_s16(vget_low_s16(a[28]), 2); + left[29] = vshll_n_s16(vget_low_s16(a[29]), 2); + left[30] = vshll_n_s16(vget_low_s16(a[30]), 2); + left[31] = vshll_n_s16(vget_low_s16(a[31]), 2); + + right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); + right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); + right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); + right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); + right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); + right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); + right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); + right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); + right[15] = vshll_n_s16(vget_high_s16(a[15]), 2); + right[16] = vshll_n_s16(vget_high_s16(a[16]), 2); + right[17] = vshll_n_s16(vget_high_s16(a[17]), 2); + right[18] = vshll_n_s16(vget_high_s16(a[18]), 2); + right[19] = vshll_n_s16(vget_high_s16(a[19]), 2); + right[20] = vshll_n_s16(vget_high_s16(a[20]), 2); + right[21] = vshll_n_s16(vget_high_s16(a[21]), 2); + right[22] = vshll_n_s16(vget_high_s16(a[22]), 2); + right[23] = vshll_n_s16(vget_high_s16(a[23]), 2); + right[24] = vshll_n_s16(vget_high_s16(a[24]), 2); + right[25] = vshll_n_s16(vget_high_s16(a[25]), 2); + right[26] = vshll_n_s16(vget_high_s16(a[26]), 2); + right[27] = vshll_n_s16(vget_high_s16(a[27]), 2); + right[28] = vshll_n_s16(vget_high_s16(a[28]), 2); + right[29] = vshll_n_s16(vget_high_s16(a[29]), 2); + right[30] = vshll_n_s16(vget_high_s16(a[30]), 2); + right[31] = vshll_n_s16(vget_high_s16(a[31]), 2); +} + +static INLINE void highbd_cross_input(const int32x4_t 
*a_left /*[32]*/, + int32x4_t *a_right /*[32]*/, + int32x4_t *b_left /*[32]*/, + int32x4_t *b_right /*[32]*/) { + // Stage 1. Done as part of the load for the first pass. + b_left[0] = vaddq_s32(a_left[0], a_left[31]); + b_left[1] = vaddq_s32(a_left[1], a_left[30]); + b_left[2] = vaddq_s32(a_left[2], a_left[29]); + b_left[3] = vaddq_s32(a_left[3], a_left[28]); + b_left[4] = vaddq_s32(a_left[4], a_left[27]); + b_left[5] = vaddq_s32(a_left[5], a_left[26]); + b_left[6] = vaddq_s32(a_left[6], a_left[25]); + b_left[7] = vaddq_s32(a_left[7], a_left[24]); + b_left[8] = vaddq_s32(a_left[8], a_left[23]); + b_left[9] = vaddq_s32(a_left[9], a_left[22]); + b_left[10] = vaddq_s32(a_left[10], a_left[21]); + b_left[11] = vaddq_s32(a_left[11], a_left[20]); + b_left[12] = vaddq_s32(a_left[12], a_left[19]); + b_left[13] = vaddq_s32(a_left[13], a_left[18]); + b_left[14] = vaddq_s32(a_left[14], a_left[17]); + b_left[15] = vaddq_s32(a_left[15], a_left[16]); + + b_right[0] = vaddq_s32(a_right[0], a_right[31]); + b_right[1] = vaddq_s32(a_right[1], a_right[30]); + b_right[2] = vaddq_s32(a_right[2], a_right[29]); + b_right[3] = vaddq_s32(a_right[3], a_right[28]); + b_right[4] = vaddq_s32(a_right[4], a_right[27]); + b_right[5] = vaddq_s32(a_right[5], a_right[26]); + b_right[6] = vaddq_s32(a_right[6], a_right[25]); + b_right[7] = vaddq_s32(a_right[7], a_right[24]); + b_right[8] = vaddq_s32(a_right[8], a_right[23]); + b_right[9] = vaddq_s32(a_right[9], a_right[22]); + b_right[10] = vaddq_s32(a_right[10], a_right[21]); + b_right[11] = vaddq_s32(a_right[11], a_right[20]); + b_right[12] = vaddq_s32(a_right[12], a_right[19]); + b_right[13] = vaddq_s32(a_right[13], a_right[18]); + b_right[14] = vaddq_s32(a_right[14], a_right[17]); + b_right[15] = vaddq_s32(a_right[15], a_right[16]); + + b_left[16] = vsubq_s32(a_left[15], a_left[16]); + b_left[17] = vsubq_s32(a_left[14], a_left[17]); + b_left[18] = vsubq_s32(a_left[13], a_left[18]); + b_left[19] = vsubq_s32(a_left[12], a_left[19]); + b_left[20] = 
vsubq_s32(a_left[11], a_left[20]); + b_left[21] = vsubq_s32(a_left[10], a_left[21]); + b_left[22] = vsubq_s32(a_left[9], a_left[22]); + b_left[23] = vsubq_s32(a_left[8], a_left[23]); + b_left[24] = vsubq_s32(a_left[7], a_left[24]); + b_left[25] = vsubq_s32(a_left[6], a_left[25]); + b_left[26] = vsubq_s32(a_left[5], a_left[26]); + b_left[27] = vsubq_s32(a_left[4], a_left[27]); + b_left[28] = vsubq_s32(a_left[3], a_left[28]); + b_left[29] = vsubq_s32(a_left[2], a_left[29]); + b_left[30] = vsubq_s32(a_left[1], a_left[30]); + b_left[31] = vsubq_s32(a_left[0], a_left[31]); + + b_right[16] = vsubq_s32(a_right[15], a_right[16]); + b_right[17] = vsubq_s32(a_right[14], a_right[17]); + b_right[18] = vsubq_s32(a_right[13], a_right[18]); + b_right[19] = vsubq_s32(a_right[12], a_right[19]); + b_right[20] = vsubq_s32(a_right[11], a_right[20]); + b_right[21] = vsubq_s32(a_right[10], a_right[21]); + b_right[22] = vsubq_s32(a_right[9], a_right[22]); + b_right[23] = vsubq_s32(a_right[8], a_right[23]); + b_right[24] = vsubq_s32(a_right[7], a_right[24]); + b_right[25] = vsubq_s32(a_right[6], a_right[25]); + b_right[26] = vsubq_s32(a_right[5], a_right[26]); + b_right[27] = vsubq_s32(a_right[4], a_right[27]); + b_right[28] = vsubq_s32(a_right[3], a_right[28]); + b_right[29] = vsubq_s32(a_right[2], a_right[29]); + b_right[30] = vsubq_s32(a_right[1], a_right[30]); + b_right[31] = vsubq_s32(a_right[0], a_right[31]); +} + +static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + + left[0] = add_round_shift_s32(left[0]); + left[1] = add_round_shift_s32(left[1]); + left[2] = add_round_shift_s32(left[2]); + left[3] = add_round_shift_s32(left[3]); + left[4] = add_round_shift_s32(left[4]); + left[5] = add_round_shift_s32(left[5]); + left[6] = add_round_shift_s32(left[6]); + left[7] = add_round_shift_s32(left[7]); + left[8] = 
add_round_shift_s32(left[8]); + left[9] = add_round_shift_s32(left[9]); + left[10] = add_round_shift_s32(left[10]); + left[11] = add_round_shift_s32(left[11]); + left[12] = add_round_shift_s32(left[12]); + left[13] = add_round_shift_s32(left[13]); + left[14] = add_round_shift_s32(left[14]); + left[15] = add_round_shift_s32(left[15]); + left[16] = add_round_shift_s32(left[16]); + left[17] = add_round_shift_s32(left[17]); + left[18] = add_round_shift_s32(left[18]); + left[19] = add_round_shift_s32(left[19]); + left[20] = add_round_shift_s32(left[20]); + left[21] = add_round_shift_s32(left[21]); + left[22] = add_round_shift_s32(left[22]); + left[23] = add_round_shift_s32(left[23]); + left[24] = add_round_shift_s32(left[24]); + left[25] = add_round_shift_s32(left[25]); + left[26] = add_round_shift_s32(left[26]); + left[27] = add_round_shift_s32(left[27]); + left[28] = add_round_shift_s32(left[28]); + left[29] = add_round_shift_s32(left[29]); + left[30] = add_round_shift_s32(left[30]); + left[31] = add_round_shift_s32(left[31]); + + right[0] = add_round_shift_s32(right[0]); + right[1] = add_round_shift_s32(right[1]); + right[2] = add_round_shift_s32(right[2]); + right[3] = add_round_shift_s32(right[3]); + right[4] = add_round_shift_s32(right[4]); + right[5] = add_round_shift_s32(right[5]); + right[6] = add_round_shift_s32(right[6]); + right[7] = add_round_shift_s32(right[7]); + right[8] = add_round_shift_s32(right[8]); + right[9] = add_round_shift_s32(right[9]); + right[10] = add_round_shift_s32(right[10]); + right[11] = add_round_shift_s32(right[11]); + right[12] = add_round_shift_s32(right[12]); + right[13] = add_round_shift_s32(right[13]); + right[14] = add_round_shift_s32(right[14]); + right[15] = add_round_shift_s32(right[15]); + right[16] = add_round_shift_s32(right[16]); + right[17] = add_round_shift_s32(right[17]); + right[18] = add_round_shift_s32(right[18]); + right[19] = add_round_shift_s32(right[19]); + right[20] = add_round_shift_s32(right[20]); + right[21] 
= add_round_shift_s32(right[21]); + right[22] = add_round_shift_s32(right[22]); + right[23] = add_round_shift_s32(right[23]); + right[24] = add_round_shift_s32(right[24]); + right[25] = add_round_shift_s32(right[25]); + right[26] = add_round_shift_s32(right[26]); + right[27] = add_round_shift_s32(right[27]); + right[28] = add_round_shift_s32(right[28]); + right[29] = add_round_shift_s32(right[29]); + right[30] = add_round_shift_s32(right[30]); + right[31] = add_round_shift_s32(right[31]); +} + +static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + + left[0] = sub_round_shift_s32(left[0]); + left[1] = sub_round_shift_s32(left[1]); + left[2] = sub_round_shift_s32(left[2]); + left[3] = sub_round_shift_s32(left[3]); + left[4] = sub_round_shift_s32(left[4]); + left[5] = sub_round_shift_s32(left[5]); + left[6] = sub_round_shift_s32(left[6]); + left[7] = sub_round_shift_s32(left[7]); + left[8] = sub_round_shift_s32(left[8]); + left[9] = sub_round_shift_s32(left[9]); + left[10] = sub_round_shift_s32(left[10]); + left[11] = sub_round_shift_s32(left[11]); + left[12] = sub_round_shift_s32(left[12]); + left[13] = sub_round_shift_s32(left[13]); + left[14] = sub_round_shift_s32(left[14]); + left[15] = sub_round_shift_s32(left[15]); + left[16] = sub_round_shift_s32(left[16]); + left[17] = sub_round_shift_s32(left[17]); + left[18] = sub_round_shift_s32(left[18]); + left[19] = sub_round_shift_s32(left[19]); + left[20] = sub_round_shift_s32(left[20]); + left[21] = sub_round_shift_s32(left[21]); + left[22] = sub_round_shift_s32(left[22]); + left[23] = sub_round_shift_s32(left[23]); + left[24] = sub_round_shift_s32(left[24]); + left[25] = sub_round_shift_s32(left[25]); + left[26] = sub_round_shift_s32(left[26]); + left[27] = sub_round_shift_s32(left[27]); + left[28] = sub_round_shift_s32(left[28]); + left[29] = 
sub_round_shift_s32(left[29]); + left[30] = sub_round_shift_s32(left[30]); + left[31] = sub_round_shift_s32(left[31]); + + right[0] = sub_round_shift_s32(right[0]); + right[1] = sub_round_shift_s32(right[1]); + right[2] = sub_round_shift_s32(right[2]); + right[3] = sub_round_shift_s32(right[3]); + right[4] = sub_round_shift_s32(right[4]); + right[5] = sub_round_shift_s32(right[5]); + right[6] = sub_round_shift_s32(right[6]); + right[7] = sub_round_shift_s32(right[7]); + right[8] = sub_round_shift_s32(right[8]); + right[9] = sub_round_shift_s32(right[9]); + right[10] = sub_round_shift_s32(right[10]); + right[11] = sub_round_shift_s32(right[11]); + right[12] = sub_round_shift_s32(right[12]); + right[13] = sub_round_shift_s32(right[13]); + right[14] = sub_round_shift_s32(right[14]); + right[15] = sub_round_shift_s32(right[15]); + right[16] = sub_round_shift_s32(right[16]); + right[17] = sub_round_shift_s32(right[17]); + right[18] = sub_round_shift_s32(right[18]); + right[19] = sub_round_shift_s32(right[19]); + right[20] = sub_round_shift_s32(right[20]); + right[21] = sub_round_shift_s32(right[21]); + right[22] = sub_round_shift_s32(right[22]); + right[23] = sub_round_shift_s32(right[23]); + right[24] = sub_round_shift_s32(right[24]); + right[25] = sub_round_shift_s32(right[25]); + right[26] = sub_round_shift_s32(right[26]); + right[27] = sub_round_shift_s32(right[27]); + right[28] = sub_round_shift_s32(right[28]); + right[29] = sub_round_shift_s32(right[29]); + right[30] = sub_round_shift_s32(right[30]); + right[31] = sub_round_shift_s32(right[31]); +} + +static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. 
+ al[0] = vaddq_s32(left[0], left[15]); + ar[0] = vaddq_s32(right[0], right[15]); + al[1] = vaddq_s32(left[1], left[14]); + ar[1] = vaddq_s32(right[1], right[14]); + al[2] = vaddq_s32(left[2], left[13]); + ar[2] = vaddq_s32(right[2], right[13]); + al[3] = vaddq_s32(left[3], left[12]); + ar[3] = vaddq_s32(right[3], right[12]); + al[4] = vaddq_s32(left[4], left[11]); + ar[4] = vaddq_s32(right[4], right[11]); + al[5] = vaddq_s32(left[5], left[10]); + ar[5] = vaddq_s32(right[5], right[10]); + al[6] = vaddq_s32(left[6], left[9]); + ar[6] = vaddq_s32(right[6], right[9]); + al[7] = vaddq_s32(left[7], left[8]); + ar[7] = vaddq_s32(right[7], right[8]); + + al[8] = vsubq_s32(left[7], left[8]); + ar[8] = vsubq_s32(right[7], right[8]); + al[9] = vsubq_s32(left[6], left[9]); + ar[9] = vsubq_s32(right[6], right[9]); + al[10] = vsubq_s32(left[5], left[10]); + ar[10] = vsubq_s32(right[5], right[10]); + al[11] = vsubq_s32(left[4], left[11]); + ar[11] = vsubq_s32(right[4], right[11]); + al[12] = vsubq_s32(left[3], left[12]); + ar[12] = vsubq_s32(right[3], right[12]); + al[13] = vsubq_s32(left[2], left[13]); + ar[13] = vsubq_s32(right[2], right[13]); + al[14] = vsubq_s32(left[1], left[14]); + ar[14] = vsubq_s32(right[1], right[14]); + al[15] = vsubq_s32(left[0], left[15]); + ar[15] = vsubq_s32(right[0], right[15]); + + al[16] = left[16]; + ar[16] = right[16]; + al[17] = left[17]; + ar[17] = right[17]; + al[18] = left[18]; + ar[18] = right[18]; + al[19] = left[19]; + ar[19] = right[19]; + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + 
al[28] = left[28]; + ar[28] = right[28]; + al[29] = left[29]; + ar[29] = right[29]; + al[30] = left[30]; + ar[30] = right[30]; + al[31] = left[31]; + ar[31] = right[31]; + + // Stage 3. + bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(left[16], al[23]); + br[16] = vaddq_s32(right[16], ar[23]); + bl[17] = vaddq_s32(left[17], al[22]); + br[17] = vaddq_s32(right[17], ar[22]); + bl[18] = vaddq_s32(left[18], al[21]); + br[18] = vaddq_s32(right[18], ar[21]); + bl[19] = vaddq_s32(left[19], al[20]); + br[19] = vaddq_s32(right[19], ar[20]); + + bl[20] = vsubq_s32(left[19], al[20]); + br[20] = vsubq_s32(right[19], ar[20]); + bl[21] = vsubq_s32(left[18], al[21]); + br[21] = vsubq_s32(right[18], ar[21]); + bl[22] = vsubq_s32(left[17], al[22]); + br[22] = vsubq_s32(right[17], ar[22]); + bl[23] = vsubq_s32(left[16], al[23]); + br[23] = vsubq_s32(right[16], ar[23]); + + bl[24] = vsubq_s32(left[31], al[24]); + br[24] = vsubq_s32(right[31], ar[24]); + bl[25] = vsubq_s32(left[30], al[25]); + br[25] = vsubq_s32(right[30], ar[25]); + bl[26] = vsubq_s32(left[29], al[26]); + br[26] = 
vsubq_s32(right[29], ar[26]); + bl[27] = vsubq_s32(left[28], al[27]); + br[27] = vsubq_s32(right[28], ar[27]); + + bl[28] = vaddq_s32(left[28], al[27]); + br[28] = vaddq_s32(right[28], ar[27]); + bl[29] = vaddq_s32(left[29], al[26]); + br[29] = vaddq_s32(right[29], ar[26]); + bl[30] = vaddq_s32(left[30], al[25]); + br[30] = vaddq_s32(right[30], ar[25]); + bl[31] = vaddq_s32(left[31], al[24]); + br[31] = vaddq_s32(right[31], ar[24]); + + // Stage 4. + al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], + &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], + &ar[19]); + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], + cospi_24_64, -cospi_8_64, &al[27], &ar[27], + &al[20], &ar[20]); + 
butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_24_64, -cospi_8_64, &al[26], &ar[26], + &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. + butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64, + cospi_24_64, &bl[2], &br[2], &bl[3], + &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64, + cospi_24_64, &bl[14], &br[14], &bl[9], + &br[9]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_24_64, -cospi_8_64, &bl[13], &br[13], + &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], 
ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. + al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64, + cospi_28_64, &al[4], &ar[4], &al[7], + &ar[7]); + butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64, + cospi_12_64, &al[5], &ar[5], &al[6], + &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], + &ar[17]); + 
butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], + cospi_28_64, -cospi_4_64, &al[29], &ar[29], + &al[18], &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_20_64, cospi_12_64, &al[26], &ar[26], + &al[21], &ar[21]); + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_12_64, -cospi_20_64, &al[25], + &ar[25], &al[22], &ar[22]); + + // Stage 7. + bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64, + cospi_30_64, &bl[8], &br[8], &bl[15], + &br[15]); + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], + &br[14]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_10_64, cospi_22_64, &bl[10], &br[10], + &bl[13], &br[13]); + butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11], + cospi_26_64, cospi_6_64, &bl[11], &br[11], + &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + 
bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. + + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], + &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], + cospi_17_64, cospi_15_64, &al[17], &ar[17], + &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], + &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], + cospi_25_64, cospi_7_64, &al[25], &ar[25], + &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] 
= ar[7]; + + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], + &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_21_64, cospi_11_64, &al[21], &ar[21], + &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_13_64, cospi_19_64, &al[13], &ar[13], + &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23], + cospi_29_64, cospi_3_64, &al[29], &ar[29], + &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. 
+ al[0] = vaddq_s32(left[0], left[15]); + ar[0] = vaddq_s32(right[0], right[15]); + al[1] = vaddq_s32(left[1], left[14]); + ar[1] = vaddq_s32(right[1], right[14]); + al[2] = vaddq_s32(left[2], left[13]); + ar[2] = vaddq_s32(right[2], right[13]); + al[3] = vaddq_s32(left[3], left[12]); + ar[3] = vaddq_s32(right[3], right[12]); + al[4] = vaddq_s32(left[4], left[11]); + ar[4] = vaddq_s32(right[4], right[11]); + al[5] = vaddq_s32(left[5], left[10]); + ar[5] = vaddq_s32(right[5], right[10]); + al[6] = vaddq_s32(left[6], left[9]); + ar[6] = vaddq_s32(right[6], right[9]); + al[7] = vaddq_s32(left[7], left[8]); + ar[7] = vaddq_s32(right[7], right[8]); + + al[8] = vsubq_s32(left[7], left[8]); + ar[8] = vsubq_s32(right[7], right[8]); + al[9] = vsubq_s32(left[6], left[9]); + ar[9] = vsubq_s32(right[6], right[9]); + al[10] = vsubq_s32(left[5], left[10]); + ar[10] = vsubq_s32(right[5], right[10]); + al[11] = vsubq_s32(left[4], left[11]); + ar[11] = vsubq_s32(right[4], right[11]); + al[12] = vsubq_s32(left[3], left[12]); + ar[12] = vsubq_s32(right[3], right[12]); + al[13] = vsubq_s32(left[2], left[13]); + ar[13] = vsubq_s32(right[2], right[13]); + al[14] = vsubq_s32(left[1], left[14]); + ar[14] = vsubq_s32(right[1], right[14]); + al[15] = vsubq_s32(left[0], left[15]); + ar[15] = vsubq_s32(right[0], right[15]); + + al[16] = left[16]; + ar[16] = right[16]; + al[17] = left[17]; + ar[17] = right[17]; + al[18] = left[18]; + ar[18] = right[18]; + al[19] = left[19]; + ar[19] = right[19]; + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + 
al[28] = left[28]; + ar[28] = right[28]; + al[29] = left[29]; + ar[29] = right[29]; + al[30] = left[30]; + ar[30] = right[30]; + al[31] = left[31]; + ar[31] = right[31]; + + // Stage 3. + bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(left[16], al[23]); + br[16] = vaddq_s32(right[16], ar[23]); + bl[17] = vaddq_s32(left[17], al[22]); + br[17] = vaddq_s32(right[17], ar[22]); + bl[18] = vaddq_s32(left[18], al[21]); + br[18] = vaddq_s32(right[18], ar[21]); + bl[19] = vaddq_s32(left[19], al[20]); + br[19] = vaddq_s32(right[19], ar[20]); + + bl[20] = vsubq_s32(left[19], al[20]); + br[20] = vsubq_s32(right[19], ar[20]); + bl[21] = vsubq_s32(left[18], al[21]); + br[21] = vsubq_s32(right[18], ar[21]); + bl[22] = vsubq_s32(left[17], al[22]); + br[22] = vsubq_s32(right[17], ar[22]); + bl[23] = vsubq_s32(left[16], al[23]); + br[23] = vsubq_s32(right[16], ar[23]); + + bl[24] = vsubq_s32(left[31], al[24]); + br[24] = vsubq_s32(right[31], ar[24]); + bl[25] = vsubq_s32(left[30], al[25]); + br[25] = vsubq_s32(right[30], ar[25]); + bl[26] = vsubq_s32(left[29], al[26]); + br[26] = 
vsubq_s32(right[29], ar[26]); + bl[27] = vsubq_s32(left[28], al[27]); + br[27] = vsubq_s32(right[28], ar[27]); + + bl[28] = vaddq_s32(left[28], al[27]); + br[28] = vaddq_s32(right[28], ar[27]); + bl[29] = vaddq_s32(left[29], al[26]); + br[29] = vaddq_s32(right[29], ar[26]); + bl[30] = vaddq_s32(left[30], al[25]); + br[30] = vaddq_s32(right[30], ar[25]); + bl[31] = vaddq_s32(left[31], al[24]); + br[31] = vaddq_s32(right[31], ar[24]); + + // Stage 4. + al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], + &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], + &ar[19]); + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], + cospi_24_64, -cospi_8_64, &al[27], &ar[27], + &al[20], &ar[20]); + 
butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_24_64, -cospi_8_64, &al[26], &ar[26], + &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. + butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64, + cospi_24_64, &bl[2], &br[2], &bl[3], + &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64, + cospi_24_64, &bl[14], &br[14], &bl[9], + &br[9]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_24_64, -cospi_8_64, &bl[13], &br[13], + &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], 
ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. + al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64, + cospi_28_64, &al[4], &ar[4], &al[7], + &ar[7]); + butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64, + cospi_12_64, &al[5], &ar[5], &al[6], + &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], + &ar[17]); + 
butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], + cospi_28_64, -cospi_4_64, &al[29], &ar[29], + &al[18], &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_20_64, cospi_12_64, &al[26], &ar[26], + &al[21], &ar[21]); + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_12_64, -cospi_20_64, &al[25], + &ar[25], &al[22], &ar[22]); + + // Stage 7. + bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64, + cospi_30_64, &bl[8], &br[8], &bl[15], + &br[15]); + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], + &br[14]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_10_64, cospi_22_64, &bl[10], &br[10], + &bl[13], &br[13]); + butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11], + cospi_26_64, cospi_6_64, &bl[11], &br[11], + &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + 
bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. + + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], + &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], + cospi_17_64, cospi_15_64, &al[17], &ar[17], + &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], + &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], + cospi_25_64, cospi_7_64, &al[25], &ar[25], + &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] 
= ar[7]; + + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], + &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_21_64, cospi_11_64, &al[21], &ar[21], + &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_13_64, cospi_19_64, &al[13], &ar[13], + &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23], + cospi_29_64, cospi_3_64, &al[29], &ar[29], + &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. 
+ al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15])); + ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15])); + al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14])); + ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14])); + al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13])); + ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13])); + al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12])); + ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12])); + al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11])); + ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11])); + al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10])); + ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10])); + al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9])); + ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9])); + al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8])); + ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8])); + + al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8])); + ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8])); + al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9])); + ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9])); + al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10])); + ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10])); + al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11])); + ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11])); + al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12])); + ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12])); + al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13])); + ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13])); + al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14])); + ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14])); + al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15])); + ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15])); + + 
al[16] = add_round_shift_s32(left[16]); + ar[16] = add_round_shift_s32(right[16]); + al[17] = add_round_shift_s32(left[17]); + ar[17] = add_round_shift_s32(right[17]); + al[18] = add_round_shift_s32(left[18]); + ar[18] = add_round_shift_s32(right[18]); + al[19] = add_round_shift_s32(left[19]); + ar[19] = add_round_shift_s32(right[19]); + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[20] = add_round_shift_s32(al[20]); + ar[20] = add_round_shift_s32(ar[20]); + al[21] = add_round_shift_s32(al[21]); + ar[21] = add_round_shift_s32(ar[21]); + al[22] = add_round_shift_s32(al[22]); + ar[22] = add_round_shift_s32(ar[22]); + al[23] = add_round_shift_s32(al[23]); + ar[23] = add_round_shift_s32(ar[23]); + al[24] = add_round_shift_s32(al[24]); + ar[24] = add_round_shift_s32(ar[24]); + al[25] = add_round_shift_s32(al[25]); + ar[25] = add_round_shift_s32(ar[25]); + al[26] = add_round_shift_s32(al[26]); + ar[26] = add_round_shift_s32(ar[26]); + al[27] = add_round_shift_s32(al[27]); + ar[27] = add_round_shift_s32(ar[27]); + + al[28] = add_round_shift_s32(left[28]); + ar[28] = add_round_shift_s32(right[28]); + al[29] = add_round_shift_s32(left[29]); + ar[29] = add_round_shift_s32(right[29]); + al[30] = add_round_shift_s32(left[30]); + ar[30] = add_round_shift_s32(right[30]); + al[31] = add_round_shift_s32(left[31]); + ar[31] = add_round_shift_s32(right[31]); + + // Stage 3. 
+ bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[16], al[23]); + br[16] = vaddq_s32(ar[16], ar[23]); + bl[17] = vaddq_s32(al[17], al[22]); + br[17] = vaddq_s32(ar[17], ar[22]); + bl[18] = vaddq_s32(al[18], al[21]); + br[18] = vaddq_s32(ar[18], ar[21]); + bl[19] = vaddq_s32(al[19], al[20]); + br[19] = vaddq_s32(ar[19], ar[20]); + + bl[20] = vsubq_s32(al[19], al[20]); + br[20] = vsubq_s32(ar[19], ar[20]); + bl[21] = vsubq_s32(al[18], al[21]); + br[21] = vsubq_s32(ar[18], ar[21]); + bl[22] = vsubq_s32(al[17], al[22]); + br[22] = vsubq_s32(ar[17], ar[22]); + bl[23] = vsubq_s32(al[16], al[23]); + br[23] = vsubq_s32(ar[16], ar[23]); + + bl[24] = vsubq_s32(al[31], al[24]); + br[24] = vsubq_s32(ar[31], ar[24]); + bl[25] = vsubq_s32(al[30], al[25]); + br[25] = vsubq_s32(ar[30], ar[25]); + bl[26] = vsubq_s32(al[29], al[26]); + br[26] = vsubq_s32(ar[29], ar[26]); + bl[27] = vsubq_s32(al[28], al[27]); + br[27] = vsubq_s32(ar[28], ar[27]); + + bl[28] = vaddq_s32(al[28], al[27]); + br[28] = vaddq_s32(ar[28], ar[27]); + bl[29] = vaddq_s32(al[29], al[26]); + br[29] = 
vaddq_s32(ar[29], ar[26]); + bl[30] = vaddq_s32(al[30], al[25]); + br[30] = vaddq_s32(ar[30], ar[25]); + bl[31] = vaddq_s32(al[31], al[24]); + br[31] = vaddq_s32(ar[31], ar[24]); + + // Stage 4. + al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]); + butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]); + butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64, + -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64, + -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; 
+ ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. + butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64, + &bl[2], &br[2], &bl[3], &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64, + &bl[14], &br[14], &bl[9], &br[9]); + butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64, + -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], 
al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. + al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64, + &al[4], &ar[4], &al[7], &ar[7]); + butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64, + &al[5], &ar[5], &al[6], &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]); + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64, + -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]); + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64, + cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64, + -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]); + + // Stage 7. 
+ bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64, + &bl[8], &br[8], &bl[15], &br[15]); + butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]); + butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64, + cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]); + butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64, + cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. 
+ left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64, + cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64, + cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64, + cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64, + cospi_19_64, &al[13], 
&ar[13], &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64, + cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/fdct_neon.c b/libvpx/vpx_dsp/arm/fdct4x4_neon.c index 2827791f1..3b9196fae 100644 --- a/libvpx/vpx_dsp/arm/fdct_neon.c +++ b/libvpx/vpx_dsp/arm/fdct4x4_neon.c @@ -18,10 +18,10 @@ #include "vpx_dsp/arm/fdct_neon.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct4x4_neon.h" void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { - int i; // input[M * stride] * 16 int16x4_t in[4]; in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4); @@ -34,9 +34,8 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1)); in[0] = vadd_s16(in[0], one); } - for (i = 0; i < 2; ++i) { - vpx_fdct4x4_pass1_neon(in); - } + vpx_fdct4x4_pass1_neon(in); + vpx_fdct4x4_pass2_neon(in); { // Not quite a rounding shift. Only add 1 despite shifting by 2. 
const int16x8_t one = vdupq_n_s16(1); @@ -48,3 +47,39 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, store_s16q_to_tran_low(final_output + 1 * 8, out_23); } } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + static const int32x4_t const_1000 = { 1, 0, 0, 0 }; + const int32x4_t const_one = vdupq_n_s32(1); + + // input[M * stride] * 16 + int32x4_t in[4]; + in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4); + + // If the very first value != 0, then add 1. + if (input[0] != 0) { + in[0] = vaddq_s32(in[0], const_1000); + } + + vpx_highbd_fdct4x4_pass1_neon(in); + vpx_highbd_fdct4x4_pass1_neon(in); + { + // Not quite a rounding shift. Only add 1 despite shifting by 2. + in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2); + in[1] = vshrq_n_s32(vaddq_s32(in[1], const_one), 2); + in[2] = vshrq_n_s32(vaddq_s32(in[2], const_one), 2); + in[3] = vshrq_n_s32(vaddq_s32(in[3], const_one), 2); + + vst1q_s32(final_output, in[0]); + vst1q_s32(final_output + 4, in[1]); + vst1q_s32(final_output + 8, in[2]); + vst1q_s32(final_output + 12, in[3]); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/arm/fdct4x4_neon.h b/libvpx/vpx_dsp/arm/fdct4x4_neon.h new file mode 100644 index 000000000..de3db9774 --- /dev/null +++ b/libvpx/vpx_dsp/arm/fdct4x4_neon.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ + +#include <arm_neon.h> + +static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) { + int16x4_t out[4]; + + const int16x8_t input_01 = vcombine_s16(in[0], in[1]); + const int16x8_t input_32 = vcombine_s16(in[3], in[2]); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // fdct_round_shift(s_0 +/- s_1) * cospi_16_64 + butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]); + + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) { + int16x4_t out[4]; + + const int16x8_t input_01 = vcombine_s16(in[0], in[1]); + const int16x8_t input_32 = vcombine_s16(in[3], in[2]); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // fdct_round_shift(s_0 +/- s_1) * cospi_16_64 + butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0], + &out[2]); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]); + + 
transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) { + int32x4_t out[4]; + // in_0 +/- in_3, in_1 +/- in_2 + const int32x4_t s_0 = vaddq_s32(in[0], in[3]); + const int32x4_t s_1 = vaddq_s32(in[1], in[2]); + const int32x4_t s_2 = vsubq_s32(in[1], in[2]); + const int32x4_t s_3 = vsubq_s32(in[0], in[3]); + + butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]); + + // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64 + // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64, + &out[1], &out[3]); + + transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/fdct8x8_neon.c b/libvpx/vpx_dsp/arm/fdct8x8_neon.c new file mode 100644 index 000000000..75ee6f223 --- /dev/null +++ b/libvpx/vpx_dsp/arm/fdct8x8_neon.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/fdct8x8_neon.h" + +void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + // stage 1 + int16x8_t in[8]; + in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + + vpx_fdct8x8_pass1_neon(in); + vpx_fdct8x8_pass2_neon(in); + { + // from vpx_dct_sse2.c + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15); + const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15); + const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15); + const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15); + const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15); + const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15); + const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15); + const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15); + in[0] = vhsubq_s16(in[0], sign_in0); + in[1] = vhsubq_s16(in[1], sign_in1); + in[2] = vhsubq_s16(in[2], sign_in2); + in[3] = vhsubq_s16(in[3], sign_in3); + in[4] = vhsubq_s16(in[4], sign_in4); + in[5] = vhsubq_s16(in[5], sign_in5); + in[6] = vhsubq_s16(in[6], sign_in6); + in[7] = vhsubq_s16(in[7], sign_in7); + // store results + store_s16q_to_tran_low(final_output + 0 * 8, in[0]); + store_s16q_to_tran_low(final_output + 1 * 8, in[1]); + 
store_s16q_to_tran_low(final_output + 2 * 8, in[2]); + store_s16q_to_tran_low(final_output + 3 * 8, in[3]); + store_s16q_to_tran_low(final_output + 4 * 8, in[4]); + store_s16q_to_tran_low(final_output + 5 * 8, in[5]); + store_s16q_to_tran_low(final_output + 6 * 8, in[6]); + store_s16q_to_tran_low(final_output + 7 * 8, in[7]); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + // input[M * stride] * 16 + int32x4_t left[8], right[8]; + int16x8_t in[8]; + in[0] = vld1q_s16(input + 0 * stride); + in[1] = vld1q_s16(input + 1 * stride); + in[2] = vld1q_s16(input + 2 * stride); + in[3] = vld1q_s16(input + 3 * stride); + in[4] = vld1q_s16(input + 4 * stride); + in[5] = vld1q_s16(input + 5 * stride); + in[6] = vld1q_s16(input + 6 * stride); + in[7] = vld1q_s16(input + 7 * stride); + + left[0] = vshll_n_s16(vget_low_s16(in[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(in[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(in[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(in[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(in[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(in[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(in[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(in[7]), 2); + right[0] = vshll_n_s16(vget_high_s16(in[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(in[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(in[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(in[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(in[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(in[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(in[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(in[7]), 2); + + vpx_highbd_fdct8x8_pass1_neon(left, right); + vpx_highbd_fdct8x8_pass2_neon(left, right); + { + left[0] = add_round_shift_half_s32(left[0]); + left[1] = add_round_shift_half_s32(left[1]); + left[2] = add_round_shift_half_s32(left[2]); + left[3] = add_round_shift_half_s32(left[3]); + left[4] = 
add_round_shift_half_s32(left[4]); + left[5] = add_round_shift_half_s32(left[5]); + left[6] = add_round_shift_half_s32(left[6]); + left[7] = add_round_shift_half_s32(left[7]); + right[0] = add_round_shift_half_s32(right[0]); + right[1] = add_round_shift_half_s32(right[1]); + right[2] = add_round_shift_half_s32(right[2]); + right[3] = add_round_shift_half_s32(right[3]); + right[4] = add_round_shift_half_s32(right[4]); + right[5] = add_round_shift_half_s32(right[5]); + right[6] = add_round_shift_half_s32(right[6]); + right[7] = add_round_shift_half_s32(right[7]); + + // store results + vst1q_s32(final_output, left[0]); + vst1q_s32(final_output + 4, right[0]); + vst1q_s32(final_output + 8, left[1]); + vst1q_s32(final_output + 12, right[1]); + vst1q_s32(final_output + 16, left[2]); + vst1q_s32(final_output + 20, right[2]); + vst1q_s32(final_output + 24, left[3]); + vst1q_s32(final_output + 28, right[3]); + vst1q_s32(final_output + 32, left[4]); + vst1q_s32(final_output + 36, right[4]); + vst1q_s32(final_output + 40, left[5]); + vst1q_s32(final_output + 44, right[5]); + vst1q_s32(final_output + 48, left[6]); + vst1q_s32(final_output + 52, right[6]); + vst1q_s32(final_output + 56, left[7]); + vst1q_s32(final_output + 60, right[7]); + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/arm/fdct8x8_neon.h b/libvpx/vpx_dsp/arm/fdct8x8_neon.h new file mode 100644 index 000000000..d8fa60044 --- /dev/null +++ b/libvpx/vpx_dsp/arm/fdct8x8_neon.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ + +#include <arm_neon.h> + +static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in, + int16x8_t *out) { + int16x8_t s[8], x[4], t[2]; + + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]); + + // Stage 3 + x[0] = vaddq_s16(s[4], t[0]); + x[1] = vsubq_s16(s[4], t[0]); + x[2] = vsubq_s16(s[7], t[1]); + x[3] = vaddq_s16(s[7], t[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]); +} + +static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t 
*in, + int16x8_t *out) { + int16x8_t s[8], x[4], t[2]; + + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1], + &t[0]); + + // Stage 3 + x[0] = vaddq_s16(s[4], t[0]); + x[1] = vsubq_s16(s[4], t[0]); + x[2] = vsubq_s16(s[7], t[1]); + x[3] = vaddq_s16(s[7], t[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]); +} + +static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { + int16x8_t out[8]; + vpx_fdct8x8_pass1_notranspose_neon(in, out); + // transpose 8x8 + transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], 
+ &out[6], &out[7]); + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; + in[4] = out[4]; + in[5] = out[5]; + in[6] = out[6]; + in[7] = out[7]; +} + +static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) { + int16x8_t out[8]; + vpx_fdct8x8_pass2_notranspose_neon(in, out); + // transpose 8x8 + transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; + in[4] = out[4]; + in[5] = out[5]; + in[6] = out[6]; + in[7] = out[7]; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], 
cospi_16_64, + &left[0], &right[0], &left[4], &right[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64, + &left[2], &right[2], &left[6], &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1], + &tr[1], &tl[0], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64, + &left[1], &right[1], &left[7], &right[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64, + &left[5], &right[5], &left[3], &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], 
right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[4], &right[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[2], &right[2], &left[6], + &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1], + &tr[1], &tl[0], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[1], &right[1], &left[7], + &right[7]); + + // out[5] = 
(tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[5], &right[5], &left[3], + &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, + int32x4_t *right) { + int32x4x2_t out[8]; + vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + left[0] = out[0].val[0]; + right[0] = out[0].val[1]; + left[1] = out[1].val[0]; + right[1] = out[1].val[1]; + left[2] = out[2].val[0]; + right[2] = out[2].val[1]; + left[3] = out[3].val[0]; + right[3] = out[3].val[1]; + left[4] = out[4].val[0]; + right[4] = out[4].val[1]; + left[5] = out[5].val[0]; + right[5] = out[5].val[1]; + left[6] = out[6].val[0]; + right[6] = out[6].val[1]; + left[7] = out[7].val[0]; + right[7] = out[7].val[1]; +} + +static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left, + int32x4_t *right) { + int32x4x2_t out[8]; + vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right); + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = 
right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + left[0] = out[0].val[0]; + right[0] = out[0].val[1]; + left[1] = out[1].val[0]; + right[1] = out[1].val[1]; + left[2] = out[2].val[0]; + right[2] = out[2].val[1]; + left[3] = out[3].val[0]; + right[3] = out[3].val[1]; + left[4] = out[4].val[0]; + right[4] = out[4].val[1]; + left[5] = out[5].val[0]; + right[5] = out[5].val[1]; + left[6] = out[6].val[0]; + right[6] = out[6].val[1]; + left[7] = out[7].val[0]; + right[7] = out[7].val[1]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/fdct_neon.h b/libvpx/vpx_dsp/arm/fdct_neon.h index 28d7d86bf..193594e3d 100644 --- a/libvpx/vpx_dsp/arm/fdct_neon.h +++ b/libvpx/vpx_dsp/arm/fdct_neon.h @@ -13,201 +13,411 @@ #include <arm_neon.h> -static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) { - const int16x8_t input_01 = vcombine_s16(in[0], in[1]); - const int16x8_t input_32 = vcombine_s16(in[3], in[2]); - - // in_0 +/- in_3, in_1 +/- in_2 - const int16x8_t s_01 = vaddq_s16(input_01, input_32); - const int16x8_t s_32 = vsubq_s16(input_01, input_32); - - // step_0 +/- step_1, step_2 +/- step_3 - const int16x4_t s_0 = vget_low_s16(s_01); - const int16x4_t s_1 = vget_high_s16(s_01); - const int16x4_t s_2 = vget_high_s16(s_32); - const int16x4_t s_3 = vget_low_s16(s_32); - - // (s_0 +/- s_1) * cospi_16_64 - // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. 
- const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); - const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); - const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64); - const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64); - - // fdct_round_shift - int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); - int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS); - - // s_3 * cospi_8_64 + s_2 * cospi_24_64 - // s_3 * cospi_24_64 - s_2 * cospi_8_64 - const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64); - const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64); - - const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64); - const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64); - - // fdct_round_shift - int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); - int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS); - - transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); - - in[0] = out_0; - in[1] = out_1; - in[2] = out_2; - in[3] = out_3; -} - -static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in, - int16x8_t *out) { - const int16x8_t v_s0 = vaddq_s16(in[0], in[7]); - const int16x8_t v_s1 = vaddq_s16(in[1], in[6]); - const int16x8_t v_s2 = vaddq_s16(in[2], in[5]); - const int16x8_t v_s3 = vaddq_s16(in[3], in[4]); - const int16x8_t v_s4 = vsubq_s16(in[3], in[4]); - const int16x8_t v_s5 = vsubq_s16(in[2], in[5]); - const int16x8_t v_s6 = vsubq_s16(in[1], in[6]); - const int16x8_t v_s7 = vsubq_s16(in[0], in[7]); - // fdct4(step, step); - int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); - int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); - int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); - int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); - // fdct4(step, step); - int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), 
vget_high_s16(v_x1)); - int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64); - int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64); - int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64); - int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64); - v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64); - v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64); - v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); - v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); - v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); - v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out[0] = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 - out[2] = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 - out[4] = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 - out[6] = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 - } - // Stage 2 - v_x0 = vsubq_s16(v_s6, v_s5); - v_x1 = vaddq_s16(v_s6, v_s5); - v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = 
vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x8_t ab = vcombine_s16(a, b); - const int16x8_t cd = vcombine_s16(c, d); - // Stage 3 - v_x0 = vaddq_s16(v_s4, ab); - v_x1 = vsubq_s16(v_s4, ab); - v_x2 = vsubq_s16(v_s7, cd); - v_x3 = vaddq_s16(v_s7, cd); - } - // Stage 4 - v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64); - v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64); - v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64); - v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64); - v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64); - v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64); - v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64); - v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64); - v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64); - v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64); - v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out[1] = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 - out[3] = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 - out[5] = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 - out[7] = vcombine_s16(f, h); // 34 35 36 37 74 75 76 
77 - } -} - -static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { - int16x8_t out[8]; - vpx_fdct8x8_pass1_notranspose_neon(in, out); - // transpose 8x8 - // Can't use transpose_s16_8x8() because the values are arranged in two 4x8 - // columns. - { - // 00 01 02 03 40 41 42 43 - // 10 11 12 13 50 51 52 53 - // 20 21 22 23 60 61 62 63 - // 30 31 32 33 70 71 72 73 - // 04 05 06 07 44 45 46 47 - // 14 15 16 17 54 55 56 57 - // 24 25 26 27 64 65 66 67 - // 34 35 36 37 74 75 76 77 - const int32x4x2_t r02_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2])); - const int32x4x2_t r13_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3])); - const int32x4x2_t r46_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6])); - const int32x4x2_t r57_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7])); - const int16x8x2_t r01_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), - vreinterpretq_s16_s32(r13_s32.val[0])); - const int16x8x2_t r23_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), - vreinterpretq_s16_s32(r13_s32.val[1])); - const int16x8x2_t r45_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), - vreinterpretq_s16_s32(r57_s32.val[0])); - const int16x8x2_t r67_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), - vreinterpretq_s16_s32(r57_s32.val[1])); - in[0] = r01_s16.val[0]; - in[1] = r01_s16.val[1]; - in[2] = r23_s16.val[0]; - in[3] = r23_s16.val[1]; - in[4] = r45_s16.val[0]; - in[5] = r45_s16.val[1]; - in[6] = r67_s16.val[0]; - in[7] = r67_s16.val[1]; - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulh_s16 operation on half vector +// can be slightly less accurate, adequate for 
pass1 +static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a, + const int16x4_t b, + const tran_coef_t constant, + int16x4_t *add, + int16x4_t *sub) { + int16x4_t c = vdup_n_s16(2 * constant); + *add = vqrdmulh_s16(vadd_s16(a, b), c); + *sub = vqrdmulh_s16(vsub_s16(a, b), c); } + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulh_s16 operation on full vector +// can be slightly less accurate, adequate for pass1 +static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a, + const int16x8_t b, + const tran_coef_t constant, + int16x8_t *add, + int16x8_t *sub) { + int16x8_t c = vdupq_n_s16(2 * constant); + *add = vqrdmulhq_s16(vaddq_s16(a, b), c); + *sub = vqrdmulhq_s16(vsubq_s16(a, b), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns full 32-bit values, high/low +static INLINE void butterfly_one_coeff_s16_s32_fast( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + int32x4_t c = vdupq_n_s32(constant << 17); + const int16x4_t a_lo = vget_low_s16(a); + const int16x4_t a_hi = vget_high_s16(a); + const int16x4_t b_lo = vget_low_s16(b); + const int16x4_t b_hi = vget_high_s16(b); + *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c); + *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c); + *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c); + *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns full 32-bit values, high/low +static INLINE void butterfly_one_coeff_s16_s32_fast_narrow( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int16x8_t *add, int16x8_t *sub) { + int32x4_t 
add_lo, add_hi, sub_lo, sub_hi; + butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo, + &sub_hi); + *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi)); + *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi)); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns full 32-bit values, high/low +static INLINE void butterfly_one_coeff_s16_s32_fast_half( + const int16x4_t a, const int16x4_t b, const tran_coef_t constant, + int32x4_t *add, int32x4_t *sub) { + int32x4_t c = vdupq_n_s32(constant << 17); + *add = vqrdmulhq_s32(vaddl_s16(a, b), c); + *sub = vqrdmulhq_s32(vsubl_s16(a, b), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on half vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns narrowed down 16-bit values +static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half( + const int16x4_t a, const int16x4_t b, const tran_coef_t constant, + int16x4_t *add, int16x4_t *sub) { + int32x4_t add32, sub32; + butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32); + *add = vmovn_s32(add32); + *sub = vmovn_s32(sub32); +} + +// fdct_round_shift((a +/- b) * c) +// Original Variant that performs normal implementation on full vector +// fully accurate does 32-bit processing, takes 16-bit values +static INLINE void butterfly_one_coeff_s16_s32( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, 
vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +// fdct_round_shift((a +/- b) * c) +// Original Variant that performs normal implementation on full vector +// fully accurate does 32-bit processing, takes 16-bit values +// returns narrowed down 16-bit values +static INLINE void butterfly_one_coeff_s16_s32_narrow( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int16x8_t *add, int16x8_t *sub) { + int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi; + butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo, + &sub32_hi); + *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi)); + *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi)); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values, +// high/low +static INLINE void butterfly_one_coeff_s32_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmulq_n_s32(a_lo, constant); + const int32x4_t a2 = vmulq_n_s32(a_hi, constant); + const int32x4_t a3 = vmulq_n_s32(a_lo, constant); + const int32x4_t a4 = vmulq_n_s32(a_hi, constant); + *add_lo = vmlaq_n_s32(a1, b_lo, constant); + *add_hi = vmlaq_n_s32(a2, b_hi, constant); + *sub_lo = vmlsq_n_s32(a3, b_lo, constant); + *sub_hi = vmlsq_n_s32(a4, b_hi, constant); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values, +// high/low +static INLINE 
void butterfly_one_coeff_s32_fast_half(const int32x4_t a, + const int32x4_t b, + const tran_coef_t constant, + int32x4_t *add, + int32x4_t *sub) { + const int32x4_t c = vdupq_n_s32(constant << 17); + *add = vqrdmulhq_s32(vaddq_s32(a, b), c); + *sub = vqrdmulhq_s32(vsubq_s32(a, b), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values, +// high/low +static INLINE void butterfly_one_coeff_s32_fast( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t c = vdupq_n_s32(constant << 17); + *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c); + *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c); + *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c); + *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on half vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_s64_narrow_half( + const int32x4_t a, const int32x4_t b, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) { + const int32x2_t a_lo = vget_low_s32(a); + const int32x2_t a_hi = vget_high_s32(a); + const int32x2_t b_lo = vget_low_s32(b); + const int32x2_t b_hi = vget_high_s32(b); + + const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1); + const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1); + const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2); + const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2); + + const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2); + const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2); + const int64x2_t diff_lo = 
vmlsl_n_s32(axc1_64_lo, b_lo, constant1); + const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1); + + *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS), + vrshrn_n_s64(sum_hi, DCT_CONST_BITS)); + *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS), + vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_s64_narrow( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + // ac1/ac2 hold the following values: + // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1, + // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1 + // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2, + // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2 + int64x2_t ac1[4]; + int64x2_t ac2[4]; + int64x2_t sum[4]; + int64x2_t diff[4]; + + ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1); + ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1); + ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1); + ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1); + ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2); + ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2); + ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2); + ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2); + + sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2); + sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2); + sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2); + sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2); + *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS), + vrshrn_n_s64(sum[1], DCT_CONST_BITS)); + *add_hi = 
vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS), + vrshrn_n_s64(sum[3], DCT_CONST_BITS)); + + diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1); + diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1); + diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1); + diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1); + *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS), + vrshrn_n_s64(diff[1], DCT_CONST_BITS)); + *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS), + vrshrn_n_s64(diff[3], DCT_CONST_BITS)); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s16_s32_noround( + const int16x4_t a_lo, const int16x4_t a_hi, const int16x4_t b_lo, + const int16x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmull_n_s16(a_lo, constant1); + const int32x4_t a2 = vmull_n_s16(a_hi, constant1); + const int32x4_t a3 = vmull_n_s16(a_lo, constant2); + const int32x4_t a4 = vmull_n_s16(a_hi, constant2); + *add_lo = vmlal_n_s16(a1, b_lo, constant2); + *add_hi = vmlal_n_s16(a2, b_hi, constant2); + *sub_lo = vmlsl_n_s16(a3, b_lo, constant1); + *sub_hi = vmlsl_n_s16(a4, b_hi, constant1); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = 
vmulq_n_s32(a_lo, constant1); + const int32x4_t a2 = vmulq_n_s32(a_hi, constant1); + const int32x4_t a3 = vmulq_n_s32(a_lo, constant2); + const int32x4_t a4 = vmulq_n_s32(a_hi, constant2); + *add_lo = vmlaq_n_s32(a1, b_lo, constant2); + *add_hi = vmlaq_n_s32(a2, b_hi, constant2); + *sub_lo = vmlsq_n_s32(a3, b_lo, constant1); + *sub_hi = vmlsq_n_s32(a4, b_hi, constant1); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on half vector +// more accurate does 32-bit processing, takes and returns 16-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_half(const int16x4_t a, + const int16x4_t b, + const tran_coef_t constant1, + const tran_coef_t constant2, + int16x4_t *add, int16x4_t *sub) { + const int32x4_t a1 = vmull_n_s16(a, constant1); + const int32x4_t a2 = vmull_n_s16(a, constant2); + const int32x4_t sum = vmlal_n_s16(a1, b, constant2); + const int32x4_t diff = vmlsl_n_s16(a2, b, constant1); + *add = vqrshrn_n_s32(sum, DCT_CONST_BITS); + *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 16-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t constant1, + const tran_coef_t constant2, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1); + const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1); + const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2); + const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2); + const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2); + const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2); + const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1); + const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), 
constant1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmulq_n_s32(a_lo, constant1); + const int32x4_t a2 = vmulq_n_s32(a_hi, constant1); + const int32x4_t a3 = vmulq_n_s32(a_lo, constant2); + const int32x4_t a4 = vmulq_n_s32(a_hi, constant2); + const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2); + const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2); + const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1); + const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. 
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { + const int16x8_t one = vdupq_n_s16(1); + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift and round, +// return narrowed results +static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo, + const int32x4_t a_hi) { + const int32x4_t one = vdupq_n_s32(1); + const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); + const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); + const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32); + const int16x4_t b_lo = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2); + const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi); + const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31); + const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32); + const int16x4_t b_hi = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2); + return vcombine_s16(b_lo, b_hi); +} + +// Add 1 if negative, and shift by 1. +// In practice, add the sign bit, then shift and round +static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. 
+static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) { + const int32x4_t one = vdupq_n_s32(1); + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. +static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) { + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. +static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2); +} + #endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/fdct_partial_neon.c b/libvpx/vpx_dsp/arm/fdct_partial_neon.c index 0a1cdca41..718dba0d9 100644 --- a/libvpx/vpx_dsp/arm/fdct_partial_neon.c +++ b/libvpx/vpx_dsp/arm/fdct_partial_neon.c @@ -101,3 +101,68 @@ void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, output[0] = (tran_low_t)(sum >> 3); output[1] = 0; } + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + int32_t sum; + + int r = 0; + do { + const int16x8_t a = vld1q_s16(input); + const int16x8_t b = vld1q_s16(input + 8); + input += stride; + partial_sum[0] = vaddw_s16(partial_sum[0], 
vget_low_s16(a)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(b)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(b)); + r++; + } while (r < 16); + + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]); + partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]); + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]); + sum = horizontal_add_int32x4(partial_sum[0]); + + output[0] = (tran_low_t)(sum >> 1); + output[1] = 0; +} + +void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + + int32_t sum; + + int r = 0; + do { + const int16x8_t a0 = vld1q_s16(input); + const int16x8_t a1 = vld1q_s16(input + 8); + const int16x8_t a2 = vld1q_s16(input + 16); + const int16x8_t a3 = vld1q_s16(input + 24); + input += stride; + partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a0)); + partial_sum[0] = vaddw_s16(partial_sum[0], vget_high_s16(a0)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_low_s16(a1)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a1)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(a2)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_high_s16(a2)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_low_s16(a3)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(a3)); + r++; + } while (r < 32); + + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]); + partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]); + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]); + sum = horizontal_add_int32x4(partial_sum[0]); + + output[0] = (tran_low_t)(sum >> 3); + output[1] = 0; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/libvpx/vpx_dsp/arm/fwd_txfm_neon.c deleted file mode 100644 index d9161c6d3..000000000 --- 
a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/txfm_common.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/arm/idct_neon.h" -#include "vpx_dsp/arm/fdct_neon.h" -#include "vpx_dsp/arm/mem_neon.h" - -void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, - int stride) { - int i; - // stage 1 - int16x8_t in[8]; - in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); - in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); - in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); - in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); - in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); - in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); - in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); - in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); - for (i = 0; i < 2; ++i) { - vpx_fdct8x8_pass1_neon(in); - } // for - { - // from vpx_dct_sse2.c - // Post-condition (division by two) - // division of two 16 bits signed numbers using shifts - // n / 2 = (n - (n >> 15)) >> 1 - const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15); - const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15); - const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15); - const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15); - const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15); - const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15); - const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15); - const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15); - in[0] = 
vhsubq_s16(in[0], sign_in0); - in[1] = vhsubq_s16(in[1], sign_in1); - in[2] = vhsubq_s16(in[2], sign_in2); - in[3] = vhsubq_s16(in[3], sign_in3); - in[4] = vhsubq_s16(in[4], sign_in4); - in[5] = vhsubq_s16(in[5], sign_in5); - in[6] = vhsubq_s16(in[6], sign_in6); - in[7] = vhsubq_s16(in[7], sign_in7); - // store results - store_s16q_to_tran_low(final_output + 0 * 8, in[0]); - store_s16q_to_tran_low(final_output + 1 * 8, in[1]); - store_s16q_to_tran_low(final_output + 2 * 8, in[2]); - store_s16q_to_tran_low(final_output + 3 * 8, in[3]); - store_s16q_to_tran_low(final_output + 4 * 8, in[4]); - store_s16q_to_tran_low(final_output + 5 * 8, in[5]); - store_s16q_to_tran_low(final_output + 6 * 8, in[6]); - store_s16q_to_tran_low(final_output + 7 * 8, in[7]); - } -} diff --git a/libvpx/vpx_dsp/arm/hadamard_neon.c b/libvpx/vpx_dsp/arm/hadamard_neon.c index 523a63c6f..f6b6d7e3c 100644 --- a/libvpx/vpx_dsp/arm/hadamard_neon.c +++ b/libvpx/vpx_dsp/arm/hadamard_neon.c @@ -114,3 +114,45 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, coeff += 8; } } + +void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int i; + + /* Rearrange 32x32 to 16x64 and remove stride. + * Top left first. */ + vpx_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); + /* Top right. */ + vpx_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride, + coeff + 256); + /* Bottom left. */ + vpx_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride, + coeff + 512); + /* Bottom right. 
*/ + vpx_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride, + coeff + 768); + + for (i = 0; i < 256; i += 8) { + const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0); + const int16x8_t a1 = load_tran_low_to_s16q(coeff + 256); + const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512); + const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768); + + const int16x8_t b0 = vhaddq_s16(a0, a1); + const int16x8_t b1 = vhsubq_s16(a0, a1); + const int16x8_t b2 = vhaddq_s16(a2, a3); + const int16x8_t b3 = vhsubq_s16(a2, a3); + + const int16x8_t c0 = vhaddq_s16(b0, b2); + const int16x8_t c1 = vhaddq_s16(b1, b3); + const int16x8_t c2 = vhsubq_s16(b0, b2); + const int16x8_t c3 = vhsubq_s16(b1, b3); + + store_s16q_to_tran_low(coeff + 0, c0); + store_s16q_to_tran_low(coeff + 256, c1); + store_s16q_to_tran_low(coeff + 512, c2); + store_s16q_to_tran_low(coeff + 768, c3); + + coeff += 8; + } +} diff --git a/libvpx/vpx_dsp/arm/highbd_quantize_neon.c b/libvpx/vpx_dsp/arm/highbd_quantize_neon.c new file mode 100644 index 000000000..b9f72a94c --- /dev/null +++ b/libvpx/vpx_dsp/arm/highbd_quantize_neon.c @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( + const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1, + tran_low_t *dqcoeff_ptr) { + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +} + +static VPX_FORCE_INLINE void highbd_quantize_8_neon( + const int32x4_t coeff_0, const int32x4_t coeff_1, const int32x4_t zbin, + const int32x4_t round, const int32x4_t quant, const int32x4_t quant_shift, + int32x4_t *qcoeff_0, int32x4_t *qcoeff_1) { + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0_sign = vshrq_n_s32(coeff_0, 31); + const int32x4_t coeff_1_sign = vshrq_n_s32(coeff_1, 31); + const int32x4_t coeff_0_abs = vabsq_s32(coeff_0); + const int32x4_t coeff_1_abs = vabsq_s32(coeff_1); + + // Calculate 2 masks of elements outside the bin + const int32x4_t zbin_mask_0 = + vreinterpretq_s32_u32(vcgeq_s32(coeff_0_abs, zbin)); + const int32x4_t zbin_mask_1 = vreinterpretq_s32_u32( + vcgeq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(zbin), 1))); + + // Get the rounded values + const int32x4_t rounded_0 = vaddq_s32(coeff_0_abs, round); + const int32x4_t rounded_1 = + vaddq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(round), 1)); + + // (round * (quant << 15) * 2) >> 16 == (round * quant) + int32x4_t qcoeff_tmp_0 = vqdmulhq_s32(rounded_0, quant); + int32x4_t qcoeff_tmp_1 = + vqdmulhq_s32(rounded_1, vdupq_lane_s32(vget_low_s32(quant), 1)); + + // Add rounded values + qcoeff_tmp_0 = vaddq_s32(qcoeff_tmp_0, rounded_0); + qcoeff_tmp_1 = vaddq_s32(qcoeff_tmp_1, rounded_1); + + // (round * (quant_shift << 15) * 2) >> 16 == (round * quant_shift) + qcoeff_tmp_0 = vqdmulhq_s32(qcoeff_tmp_0, quant_shift); + qcoeff_tmp_1 = + vqdmulhq_s32(qcoeff_tmp_1, vdupq_lane_s32(vget_low_s32(quant_shift), 1)); + + // Restore the sign bit. 
+ qcoeff_tmp_0 = veorq_s32(qcoeff_tmp_0, coeff_0_sign); + qcoeff_tmp_1 = veorq_s32(qcoeff_tmp_1, coeff_1_sign); + qcoeff_tmp_0 = vsubq_s32(qcoeff_tmp_0, coeff_0_sign); + qcoeff_tmp_1 = vsubq_s32(qcoeff_tmp_1, coeff_1_sign); + + // Only keep the relevant coeffs + *qcoeff_0 = vandq_s32(qcoeff_tmp_0, zbin_mask_0); + *qcoeff_1 = vandq_s32(qcoeff_tmp_1, zbin_mask_1); +} + +static VPX_FORCE_INLINE int16x8_t +highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int32x4_t zbin, + const int32x4_t round, const int32x4_t quant, + const int32x4_t quant_shift, const int32x4_t dequant) { + int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1; + + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0 = vld1q_s32(coeff_ptr); + const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4); + highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift, + &qcoeff_0, &qcoeff_1); + + // Store the 32-bit qcoeffs + vst1q_s32(qcoeff_ptr, qcoeff_0); + vst1q_s32(qcoeff_ptr + 4, qcoeff_1); + + // Calculate and store the dqcoeffs + dqcoeff_0 = vmulq_s32(qcoeff_0, dequant); + dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1)); + + highbd_calculate_dqcoeff_and_store(dqcoeff_0, dqcoeff_1, dqcoeff_ptr); + + return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1)); +} + +void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + + // Only the first element of each vector is DC. + // High half has identical elements, but we can reconstruct it from the low + // half by duplicating the 2nd element. 
So we only need to pass a 4x32-bit + // vector + int32x4_t zbin = vmovl_s16(vld1_s16(zbin_ptr)); + int32x4_t round = vmovl_s16(vld1_s16(round_ptr)); + // Extend the quant, quant_shift vectors to ones of 32-bit elements + // scale to high-half, so we can use vqdmulhq_s32 + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); + int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 15); + int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); + + // Process first 8 values which include a dc component. + { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + n_coeffs -= 8; + + { + zbin = vdupq_lane_s32(vget_low_s32(zbin), 1); + round = vdupq_lane_s32(vget_low_s32(round), 1); + quant = vdupq_lane_s32(vget_low_s32(quant), 1); + quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1); + dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); + + do { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. 
+ eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + n_coeffs -= 8; + } while (n_coeffs > 0); + } + +#ifdef __aarch64__ + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)n_coeffs; + (void)scan; +} + +static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store_32x32( + int32x4_t dqcoeff_0, int32x4_t dqcoeff_1, tran_low_t *dqcoeff_ptr) { + // Add 1 if negative to round towards zero because the C uses division. 
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); + dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +} + +static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int32x4_t zbin, const int32x4_t round, + const int32x4_t quant, const int32x4_t quant_shift, + const int32x4_t dequant) { + int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1; + + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0 = vld1q_s32(coeff_ptr); + const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4); + highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift, + &qcoeff_0, &qcoeff_1); + + // Store the 32-bit qcoeffs + vst1q_s32(qcoeff_ptr, qcoeff_0); + vst1q_s32(qcoeff_ptr + 4, qcoeff_1); + + // Calculate and store the dqcoeffs + dqcoeff_0 = vmulq_s32(qcoeff_0, dequant); + dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1)); + + highbd_calculate_dqcoeff_and_store_32x32(dqcoeff_0, dqcoeff_1, dqcoeff_ptr); + + return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1)); +} + +void vpx_highbd_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + int i; + + // Only the first element of each vector is DC. + // High half has identical elements, but we can reconstruct it from the low + // half by duplicating the 2nd element. 
So we only need to pass a 4x32-bit + // vector + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1); + // Extend the quant, quant_shift vectors to ones of 32-bit elements + // scale to high-half, so we can use vqdmulhq_s32 + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); + int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16); + int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); + + // Process first 8 values which include a dc component. + { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + { + zbin = vdupq_lane_s32(vget_low_s32(zbin), 1); + round = vdupq_lane_s32(vget_low_s32(round), 1); + quant = vdupq_lane_s32(vget_low_s32(quant), 1); + quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1); + dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); + + for (i = 1; i < 32 * 32 / 8; ++i) { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. 
+ eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + } + +#ifdef __aarch64__ + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)n_coeffs; + (void)scan; +} diff --git a/libvpx/vpx_dsp/arm/highbd_sad_neon.c b/libvpx/vpx_dsp/arm/highbd_sad_neon.c new file mode 100644 index 000000000..ecb52ce5a --- /dev/null +++ b/libvpx/vpx_dsp/arm/highbd_sad_neon.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int width, + int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + const uint16x4_t src_u16 = vld1_u16(src16_ptr + j); + const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j); + sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int width, + int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j); + const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j); + sum_abs_diff = + vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16)); + sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16), + vget_high_u16(ref_u16)); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, const uint8_t *second_pred, int width, int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = 
CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + const uint16x4_t a_u16 = vld1_u16(src16_ptr + j); + const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j); + const uint16x4_t c_u16 = vld1_u16(pred_ptr + j); + const uint16x4_t avg = vrhadd_u16(b_u16, c_u16); + sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred_ptr += width; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, const uint8_t *second_pred, int width, int height) { + int i, j; + uint32x4_t sum_abs_diff = vdupq_n_u32(0); + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j); + const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j); + const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j); + const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16); + sum_abs_diff = + vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg)); + sum_abs_diff = + vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg)); + } + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred_ptr += width; + } + + return horizontal_add_uint32x4(sum_abs_diff); +} + +#define highbd_sad4MxN(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + } + +#define highbd_sadMxN(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_neon( \ + const uint8_t *src_ptr, 
int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + } + +#define highbd_sad4MxN_avg(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ + second_pred, m, n); \ + } + +#define highbd_sadMxN_avg(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ + second_pred, m, n); \ + } + +#define highbd_sadMxNx4D(m, n) \ + void vpx_highbd_sad##m##x##n##x4d_neon( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \ + ref_array[i], ref_stride); \ + } \ + } + +/* clang-format off */ +// 4x4 +highbd_sad4MxN(4, 4) +highbd_sad4MxN_avg(4, 4) +highbd_sadMxNx4D(4, 4) + +// 4x8 +highbd_sad4MxN(4, 8) +highbd_sad4MxN_avg(4, 8) +highbd_sadMxNx4D(4, 8) + +// 8x4 +highbd_sadMxN(8, 4) +highbd_sadMxN_avg(8, 4) +highbd_sadMxNx4D(8, 4) + +// 8x8 +highbd_sadMxN(8, 8) +highbd_sadMxN_avg(8, 8) +highbd_sadMxNx4D(8, 8) + +// 8x16 +highbd_sadMxN(8, 16) +highbd_sadMxN_avg(8, 16) +highbd_sadMxNx4D(8, 16) + +// 16x8 +highbd_sadMxN(16, 8) +highbd_sadMxN_avg(16, 8) +highbd_sadMxNx4D(16, 8) + +// 16x16 +highbd_sadMxN(16, 16) +highbd_sadMxN_avg(16, 16) +highbd_sadMxNx4D(16, 16) + +// 16x32 +highbd_sadMxN(16, 32) +highbd_sadMxN_avg(16, 32) +highbd_sadMxNx4D(16, 32) + +// 32x16 +highbd_sadMxN(32, 16) +highbd_sadMxN_avg(32, 16) +highbd_sadMxNx4D(32, 16) + +// 32x32 +highbd_sadMxN(32, 32) +highbd_sadMxN_avg(32, 32) +highbd_sadMxNx4D(32, 32) + 
+// 32x64 +highbd_sadMxN(32, 64) +highbd_sadMxN_avg(32, 64) +highbd_sadMxNx4D(32, 64) + +// 64x32 +highbd_sadMxN(64, 32) +highbd_sadMxN_avg(64, 32) +highbd_sadMxNx4D(64, 32) + +// 64x64 +highbd_sadMxN(64, 64) +highbd_sadMxN_avg(64, 64) +highbd_sadMxNx4D(64, 64) + /* clang-format on */ diff --git a/libvpx/vpx_dsp/arm/highbd_variance_neon.c b/libvpx/vpx_dsp/arm/highbd_variance_neon.c new file mode 100644 index 000000000..96a35af01 --- /dev/null +++ b/libvpx/vpx_dsp/arm/highbd_variance_neon.c @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +static const uint8_t bilinear_filters[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + int w, int h, uint64_t *sse, + int64_t *sum) { + int i, j; + + if (w >= 8) { + int32x4_t sum_s32 = vdupq_n_s32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const int16x8_t src_s16 = vreinterpretq_s16_u16(vld1q_u16(&src_ptr[j])); + const int16x8_t ref_s16 = vreinterpretq_s16_u16(vld1q_u16(&ref_ptr[j])); + const int32x4_t diff1_s32 = + vsubl_s16(vget_low_s16(src_s16), vget_low_s16(ref_s16)); + const int32x4_t diff2_s32 = + vsubl_s16(vget_high_s16(src_s16), vget_high_s16(ref_s16)); + const uint32x4_t diff1_u32 = 
vreinterpretq_u32_s32(diff1_s32); + const uint32x4_t diff2_u32 = vreinterpretq_u32_s32(diff2_s32); + sum_s32 = vaddq_s32(sum_s32, diff1_s32); + sum_s32 = vaddq_s32(sum_s32, diff2_s32); + sse_u32 = vmlaq_u32(sse_u32, diff1_u32, diff1_u32); + sse_u32 = vmlaq_u32(sse_u32, diff2_u32, diff2_u32); + } + src_ptr += src_stride; + ref_ptr += ref_stride; + } + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_add_uint32x4(sse_u32); + } else { + int32x4_t sum_s32 = vdupq_n_s32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + assert(w >= 4); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 4) { + const int16x4_t src_s16 = vreinterpret_s16_u16(vld1_u16(&src_ptr[j])); + const int16x4_t ref_s16 = vreinterpret_s16_u16(vld1_u16(&ref_ptr[j])); + const int32x4_t diff_s32 = vsubl_s16(src_s16, ref_s16); + const uint32x4_t diff_u32 = vreinterpretq_u32_s32(diff_s32); + sum_s32 = vaddq_s32(sum_s32, diff_s32); + sse_u32 = vmlaq_u32(sse_u32, diff_u32, diff_u32); + } + src_ptr += src_stride; + ref_ptr += ref_stride; + } + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_add_uint32x4(sse_u32); + } +} + +static INLINE void highbd_variance64(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint64_t *sse, + int64_t *sum) { + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); + + if (w < 32 && h < 32) { + highbd_variance16(src_ptr, src_stride, ref_ptr, ref_stride, w, h, sse, sum); + } else { + uint64_t sse_long = 0; + int64_t sum_long = 0; + int k, l; + for (k = 0; k + 16 <= h; k += 16) { + for (l = 0; l + 16 <= w; l += 16) { + uint64_t sse_tmp = 0; + int64_t sum_tmp = 0; + highbd_variance16(src_ptr + l, src_stride, ref_ptr + l, ref_stride, 16, + 16, &sse_tmp, &sum_tmp); + sum_long += sum_tmp; + sse_long += sse_tmp; + } + src_ptr += 16 * src_stride; + ref_ptr += 16 * ref_stride; + } + *sum = sum_long; + *sse = sse_long; + } +} + +static INLINE void highbd_8_variance(const 
uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)sse_long; + *sum = (int)sum_long; +} + +static INLINE void highbd_10_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +} + +static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +} + +#define HIGHBD_VAR(W, H) \ + uint32_t vpx_highbd_8_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } + +#define HIGHBD_MSE(W, H) \ + uint32_t vpx_highbd_8_mse##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src_ptr, 
src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } + +static INLINE void highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + uint32_t i, j; + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); + + uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); + uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); + uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); + + if (output_width >= 8) { + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 8) { + const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); + const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); + uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); + uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + uint16x4_t out1_u16; + uint16x4_t out2_u16; + sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); + sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); + out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); + out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); + vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } else { + assert(output_width >= 4); + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 4) { + const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); + const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); + uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); + uint16x4_t out_u16; + sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); + out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); + vst1_u16(&output_ptr[j], out_u16); + } + // Next row... 
+ src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } +} + +static INLINE void highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + uint32_t i, j; + + uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); + uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); + uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); + + if (output_width >= 8) { + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 8) { + const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); + const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); + uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); + uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); + uint16x4_t out1_u16; + uint16x4_t out2_u16; + sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); + sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); + out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); + out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); + vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } else { + assert(output_width >= 4); + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 4) { + const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); + const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); + uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); + uint16x4_t out_u16; + sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); + out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); + vst1_u16(&output_ptr[j], out_u16); + } + // Next row... 
+ src_ptr += src_pixels_per_line; + output_ptr += output_width; + } + } +} + +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_10_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_12_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ + } + +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t 
vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ + H, temp2, W); \ + \ + return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ + H, temp2, W); \ + \ + return vpx_highbd_10_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, 
src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ + H, temp2, W); \ + \ + return vpx_highbd_12_variance##W##x##H##_neon( \ + CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ + } + +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + uint32x4_t one_u32 = vdupq_n_u32(1); + if (width >= 8) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 8) { + const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); + const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); + const uint32x4_t sum1_u32 = + vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); + const uint32x4_t sum2_u32 = + vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); + const uint16x4_t sum1_u16 = + vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); + const uint16x4_t sum2_u16 = + vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); + const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); + vst1q_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else { + assert(width >= 4); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 4) { + const uint16x4_t pred_u16 = vld1_u16(&pred[j]); + const uint16x4_t ref_u16 = vld1_u16(&ref[j]); + const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); + const uint16x4_t vcomp_pred = + vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); + vst1_u16(&comp_pred[j], vcomp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } +} + +/* All three forms of the variance are available in the same sizes. 
*/ +#define HIGHBD_VARIANCES(W, H) \ + HIGHBD_VAR(W, H) \ + HIGHBD_SUBPIX_VAR(W, H) \ + HIGHBD_SUBPIX_AVG_VAR(W, H) + +HIGHBD_VARIANCES(64, 64) +HIGHBD_VARIANCES(64, 32) +HIGHBD_VARIANCES(32, 64) +HIGHBD_VARIANCES(32, 32) +HIGHBD_VARIANCES(32, 16) +HIGHBD_VARIANCES(16, 32) +HIGHBD_VARIANCES(16, 16) +HIGHBD_VARIANCES(16, 8) +HIGHBD_VARIANCES(8, 16) +HIGHBD_VARIANCES(8, 8) +HIGHBD_VARIANCES(8, 4) +HIGHBD_VARIANCES(4, 8) +HIGHBD_VARIANCES(4, 4) + +HIGHBD_GET_VAR(8) +HIGHBD_GET_VAR(16) + +HIGHBD_MSE(16, 16) +HIGHBD_MSE(16, 8) +HIGHBD_MSE(8, 16) +HIGHBD_MSE(8, 8) diff --git a/libvpx/vpx_dsp/arm/mem_neon.h b/libvpx/vpx_dsp/arm/mem_neon.h index 50aaa94fe..19cfc7c7f 100644 --- a/libvpx/vpx_dsp/arm/mem_neon.h +++ b/libvpx/vpx_dsp/arm/mem_neon.h @@ -116,11 +116,11 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, ptrdiff_t stride) { uint32_t a; - uint32x2_t a_u32 = vdup_n_u32(0); + uint32x2_t a_u32; if (stride == 4) return vld1_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vset_lane_u32(a, a_u32, 0); + a_u32 = vdup_n_u32(a); memcpy(&a, buf, 4); a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u8_u32(a_u32); @@ -143,11 +143,11 @@ static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, ptrdiff_t stride) { uint32_t a; - uint32x4_t a_u32 = vdupq_n_u32(0); + uint32x4_t a_u32; if (stride == 4) return vld1q_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vsetq_lane_u32(a, a_u32, 0); + a_u32 = vdupq_n_u32(a); memcpy(&a, buf, 4); buf += stride; a_u32 = vsetq_lane_u32(a, a_u32, 1); @@ -201,4 +201,161 @@ static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { buf += stride; vst1_lane_u32((uint32_t *)buf, a_u32, 1); } + +static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3) { + *s0 = 
vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); +} + +static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); +} + +static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); +} + +static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p, + const uint8x16_t s0, const uint8x16_t s1, + const uint8x16_t s2, const uint8x16_t s3) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); +} + +static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); +} + +static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6, uint8x8_t *const s7) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); + s += p; + *s7 = vld1_u8(s); +} + +static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, 
const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); + s += p; + vst1_u8(s, s4); + s += p; + vst1_u8(s, s5); + s += p; + vst1_u8(s, s6); + s += p; + vst1_u8(s, s7); +} + +static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4, uint8x16_t *const s5, + uint8x16_t *const s6, uint8x16_t *const s7) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); + s += p; + *s5 = vld1q_u8(s); + s += p; + *s6 = vld1q_u8(s); + s += p; + *s7 = vld1q_u8(s); +} + +static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p, + const uint8x16_t s0, const uint8x16_t s1, + const uint8x16_t s2, const uint8x16_t s3, + const uint8x16_t s4, const uint8x16_t s5, + const uint8x16_t s6, const uint8x16_t s7) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); + s += p; + vst1q_u8(s, s4); + s += p; + vst1q_u8(s, s5); + s += p; + vst1q_u8(s, s6); + s += p; + vst1q_u8(s, s7); +} + #endif // VPX_VPX_DSP_ARM_MEM_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/quantize_neon.c b/libvpx/vpx_dsp/arm/quantize_neon.c index bd7818a07..9c227d560 100644 --- a/libvpx/vpx_dsp/arm/quantize_neon.c +++ b/libvpx/vpx_dsp/arm/quantize_neon.c @@ -17,20 +17,57 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, - tran_low_t *dqcoeff) { + tran_low_t *dqcoeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH const int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); const int32x4_t dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); -#if CONFIG_VP9_HIGHBITDEPTH - vst1q_s32(dqcoeff, dqcoeff_0); - vst1q_s32(dqcoeff + 4, dqcoeff_1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + 
vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); #else - vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1))); + vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant)); #endif // CONFIG_VP9_HIGHBITDEPTH } +static INLINE int16x8_t +quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16x8_t zbin, + const int16x8_t round, const int16x8_t quant, + const int16x8_t quant_shift, const int16x8_t dequant) { + // Load coeffs as 8 x 16-bit ints, take sign and abs values + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + // Calculate mask of elements outside the bin + const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + // Get the rounded values + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. 
+ qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + // Only keep the relevant coeffs + qcoeff = vandq_s16(qcoeff, zbin_mask); + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + + calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); + + return qcoeff; +} + void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, @@ -38,109 +75,59 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; - (void)scan; + + // Only the first element of each vector is DC. + int16x8_t zbin = vld1q_s16(zbin_ptr); + int16x8_t round = vld1q_s16(round_ptr); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. { - // Only the first element of each vector is DC. - const int16x8_t zbin = vld1q_s16(zbin_ptr); - const int16x8_t round = vld1q_s16(round_ptr); - const int16x8_t quant = vld1q_s16(quant_ptr); - const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); - const int16x8_t dequant = vld1q_s16(dequant_ptr); - // Add one because the eob does not index from 0. 
- const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 - qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant, + quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. 
eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } n_coeffs -= 8; { - const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]); - const int16x8_t round = vdupq_n_s16(round_ptr[1]); - const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); - const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); - const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + zbin = vdupq_lane_s16(vget_low_s16(zbin), 1); + round = vdupq_lane_s16(vget_low_s16(round), 1); + quant = vdupq_lane_s16(vget_low_s16(quant), 1); + quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1); + dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); do { - // Add one because the eob is not its index. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 - qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); - - // Restore the sign bit. 
- qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. eob_max = vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; - n_coeffs -= 8; } while (n_coeffs > 0); } @@ -156,6 +143,9 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)scan; } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { @@ -164,7 +154,7 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) { static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, const int16x8_t dequant, - tran_low_t *dqcoeff) { + tran_low_t *dqcoeff_ptr) { int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); int32x4_t dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); @@ -176,14 +166,51 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, #if CONFIG_VP9_HIGHBITDEPTH dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); - vst1q_s32(dqcoeff, dqcoeff_0); - vst1q_s32(dqcoeff + 4, dqcoeff_1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); #else - vst1q_s16(dqcoeff, + vst1q_s16(dqcoeff_ptr, vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1))); #endif // CONFIG_VP9_HIGHBITDEPTH } +static INLINE int16x8_t +quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t 
*dqcoeff_ptr, const int16x8_t zbin, + const int16x8_t round, const int16x8_t quant, + const int16x8_t quant_shift, const int16x8_t dequant) { + // Load coeffs as 8 x 16-bit ints, take sign and abs values + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + // Calculate mask of elements outside the bin + const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + // Get the rounded values + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + // Only keep the relevant coeffs + qcoeff = vandq_s16(qcoeff, zbin_mask); + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + + calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); + + return qcoeff; +} + // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -194,107 +221,57 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; - (void)scan; - (void)n_coeffs; // Because we will always calculate 32*32. + + // Only the first element of each vector is DC. 
+ int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. { - // Only the first element of each vector is DC. - const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); - const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - const int16x8_t quant = vld1q_s16(quant_ptr); - const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); - const int16x8_t dequant = vld1q_s16(dequant_ptr); - // Add one because the eob does not index from 0. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); - - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 - qcoeff = vqdmulhq_s16(qcoeff, quant_shift); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. 
eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } { - const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1); - const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1); - const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); - const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); - const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + zbin = vdupq_lane_s16(vget_low_s16(zbin), 1); + round = vdupq_lane_s16(vget_low_s16(round), 1); + quant = vdupq_lane_s16(vget_low_s16(quant), 1); + quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1); + dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); for (i = 1; i < 32 * 32 / 8; ++i) { - // Add one because the eob is not its index. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 - qcoeff = vqdmulhq_s16(qcoeff, quant_shift); - - // Restore the sign bit. 
- qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. eob_max = vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } } @@ -310,4 +287,8 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)n_coeffs; + (void)scan; } diff --git a/libvpx/vpx_dsp/arm/sad4d_neon.c b/libvpx/vpx_dsp/arm/sad4d_neon.c index 03f716c3d..5fc621aee 100644 --- a/libvpx/vpx_dsp/arm/sad4d_neon.c +++ b/libvpx/vpx_dsp/arm/sad4d_neon.c @@ -20,9 +20,9 @@ static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, const void *const buf1) { uint32_t a; - uint32x2_t aa = vdup_n_u32(0); + uint32x2_t aa; memcpy(&a, buf0, 4); - aa = vset_lane_u32(a, aa, 0); + aa = vdup_n_u32(a); memcpy(&a, buf1, 4); aa = vset_lane_u32(a, aa, 1); return vreinterpret_u8_u32(aa); @@ -237,8 +237,7 @@ void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, uint32x4_t *const sum) { @@ -270,7 +269,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, vst1q_u32(sad_array, vpaddq_u32(r0, 
r1)); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, uint16x8_t *const sum) { @@ -305,7 +304,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, sad_512_pel_final_neon(sum, sad_array); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -327,8 +326,7 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -386,7 +384,7 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64); } -#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -444,12 +442,11 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad_2048_pel_final_neon(sum, sad_array); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) //////////////////////////////////////////////////////////////////////////////// -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -554,7 +551,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } 
-#else +#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, @@ -649,4 +646,4 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, sad_4096_pel_final_neon(sum, sad_array); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) diff --git a/libvpx/vpx_dsp/arm/sad_neon.c b/libvpx/vpx_dsp/arm/sad_neon.c index b1509d883..ad575d4aa 100644 --- a/libvpx/vpx_dsp/arm/sad_neon.c +++ b/libvpx/vpx_dsp/arm/sad_neon.c @@ -21,9 +21,15 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); +#if defined(__ARM_FEATURE_DOTPROD) + const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); + const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); + return horizontal_add_uint32x4(dp); +#else uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); return horizontal_add_uint16x8(abs); +#endif } uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, @@ -33,13 +39,34 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); +#if defined(__ARM_FEATURE_DOTPROD) + const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg); + const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); + return horizontal_add_uint32x4(prod); +#else uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); return horizontal_add_uint16x8(abs); +#endif } uint32_t vpx_sad4x8_neon(const uint8_t 
*src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { +#if defined(__ARM_FEATURE_DOTPROD) + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t src2_u8 = + load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); + const uint8x16_t ref2_u8 = + load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); + const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, ref1_u8); + const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, ref2_u8); + prod = vdotq_u32(prod, sad1_u8, ones); + prod = vdotq_u32(prod, sad2_u8, ones); + return horizontal_add_uint32x4(prod); +#else int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < 8; i += 4) { @@ -52,11 +79,31 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, } return horizontal_add_uint16x8(abs); +#endif } uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred) { +#if defined(__ARM_FEATURE_DOTPROD) + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t src2_u8 = + load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); + const uint8x16_t ref2_u8 = + load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); + const uint8x16_t second_pred1_u8 = vld1q_u8(second_pred); + const uint8x16_t second_pred2_u8 = vld1q_u8(second_pred + 16); + const uint8x16_t avg1 = vrhaddq_u8(ref1_u8, second_pred1_u8); + const uint8x16_t avg2 = vrhaddq_u8(ref2_u8, second_pred2_u8); + const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, avg1); + const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, avg2); + prod = vdotq_u32(prod, sad1_u8, ones); + prod = vdotq_u32(prod, sad2_u8, ones); + return horizontal_add_uint32x4(prod); 
+#else int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < 8; i += 4) { @@ -72,8 +119,65 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, } return horizontal_add_uint16x8(abs); +#endif } +#if defined(__ARM_FEATURE_DOTPROD) +static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x2_t prod = vdup_n_u32(0); + const uint8x8_t ones = vdup_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(src_ptr); + const uint8x8_t b_u8 = vld1_u8(ref_ptr); + const uint8x8_t sad_u8 = vabd_u8(a_u8, b_u8); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdot_u32(prod, sad_u8, ones); + } + return prod; +} + +static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x2_t prod = vdup_n_u32(0); + const uint8x8_t ones = vdup_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(src_ptr); + const uint8x8_t b_u8 = vld1_u8(ref_ptr); + const uint8x8_t c_u8 = vld1_u8(second_pred); + const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); + const uint8x8_t sad_u8 = vabd_u8(a_u8, avg); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 8; + prod = vdot_u32(prod, sad_u8, ones); + } + return prod; +} + +#define SAD8XN(n) \ + uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x2_t prod = \ + sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return horizontal_add_uint32x2(prod); \ + } \ + \ + uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x2_t prod = \ + sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return horizontal_add_uint32x2(prod); \ + } + +#else // 
!defined(__ARM_FEATURE_DOTPROD) static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -124,11 +228,67 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } +#endif // defined(__ARM_FEATURE_DOTPROD) SAD8XN(4) SAD8XN(8) SAD8XN(16) +#if defined(__ARM_FEATURE_DOTPROD) +static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t src_u8 = vld1q_u8(src_ptr); + const uint8x16_t ref_u8 = vld1q_u8(ref_ptr); + const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdotq_u32(prod, sad_u8, ones); + } + return prod; +} + +static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(src_ptr); + const uint8x16_t b_u8 = vld1q_u8(ref_ptr); + const uint8x16_t c_u8 = vld1q_u8(second_pred); + const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); + const uint8x16_t sad_u8 = vabdq_u8(a_u8, avg); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + prod = vdotq_u32(prod, sad_u8, ones); + } + return prod; +} + +#define SAD16XN(n) \ + uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x4_t prod = \ + sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return horizontal_add_uint32x4(prod); \ + } \ + \ + uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, 
int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t prod = \ + sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return horizontal_add_uint32x4(prod); \ + } +#else // !defined(__ARM_FEATURE_DOTPROD) static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -182,11 +342,78 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } +#endif // defined(__ARM_FEATURE_DOTPROD) SAD16XN(8) SAD16XN(16) SAD16XN(32) +#if defined(__ARM_FEATURE_DOTPROD) +static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(src_ptr); + const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); + const uint8x16_t b_lo = vld1q_u8(ref_ptr); + const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); + const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, b_lo); + const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, b_hi); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdotq_u32(prod, sad_lo_u8, ones); + prod = vdotq_u32(prod, sad_hi_u8, ones); + } + return prod; +} + +static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(src_ptr); + const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); + const uint8x16_t b_lo = vld1q_u8(ref_ptr); + const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); + const uint8x16_t c_lo = vld1q_u8(second_pred); + const 
uint8x16_t c_hi = vld1q_u8(second_pred + 16); + const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); + const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); + const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, avg_lo); + const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, avg_hi); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 32; + prod = vdotq_u32(prod, sad_lo_u8, ones); + prod = vdotq_u32(prod, sad_hi_u8, ones); + } + return prod; +} + +#define SAD32XN(n) \ + uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x4_t prod = \ + sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return horizontal_add_uint32x4(prod); \ + } \ + \ + uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t prod = \ + sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return horizontal_add_uint32x4(prod); \ + } + +#else // defined(__ARM_FEATURE_DOTPROD) static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -250,11 +477,81 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ return horizontal_add_uint16x8(abs); \ } +#endif // defined(__ARM_FEATURE_DOTPROD) SAD32XN(16) SAD32XN(32) SAD32XN(64) +#if defined(__ARM_FEATURE_DOTPROD) +static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(src_ptr); + const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); + const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); + const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); + const uint8x16_t b_0 = 
vld1q_u8(ref_ptr); + const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); + const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); + const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + const uint8x16_t sad_0_u8 = vabdq_u8(a_0, b_0); + const uint8x16_t sad_1_u8 = vabdq_u8(a_1, b_1); + const uint8x16_t sad_2_u8 = vabdq_u8(a_2, b_2); + const uint8x16_t sad_3_u8 = vabdq_u8(a_3, b_3); + src_ptr += src_stride; + ref_ptr += ref_stride; + prod = vdotq_u32(prod, sad_0_u8, ones); + prod = vdotq_u32(prod, sad_1_u8, ones); + prod = vdotq_u32(prod, sad_2_u8, ones); + prod = vdotq_u32(prod, sad_3_u8, ones); + } + return prod; +} + +static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { + int i; + uint32x4_t prod = vdupq_n_u32(0); + const uint8x16_t ones = vdupq_n_u8(1); + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(src_ptr); + const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); + const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); + const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); + const uint8x16_t b_0 = vld1q_u8(ref_ptr); + const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); + const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); + const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + const uint8x16_t c_0 = vld1q_u8(second_pred); + const uint8x16_t c_1 = vld1q_u8(second_pred + 16); + const uint8x16_t c_2 = vld1q_u8(second_pred + 32); + const uint8x16_t c_3 = vld1q_u8(second_pred + 48); + const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); + const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); + const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); + const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); + const uint8x16_t sad_0_u8 = vabdq_u8(a_0, avg_0); + const uint8x16_t sad_1_u8 = vabdq_u8(a_1, avg_1); + const uint8x16_t sad_2_u8 = vabdq_u8(a_2, avg_2); + const uint8x16_t sad_3_u8 = vabdq_u8(a_3, avg_3); + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 64; + prod = vdotq_u32(prod, sad_0_u8, 
ones); + prod = vdotq_u32(prod, sad_1_u8, ones); + prod = vdotq_u32(prod, sad_2_u8, ones); + prod = vdotq_u32(prod, sad_3_u8, ones); + } + return prod; +} +#else // !defined(__ARM_FEATURE_DOTPROD) static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int height) { @@ -332,6 +629,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, return vpadalq_u16(sum, abs_1); } } +#endif // defined(__ARM_FEATURE_DOTPROD) #define SAD64XN(n) \ uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ diff --git a/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/libvpx/vpx_dsp/arm/subpel_variance_neon.c index a3befdc34..9328c3ed8 100644 --- a/libvpx/vpx_dsp/arm/subpel_variance_neon.c +++ b/libvpx/vpx_dsp/arm/subpel_variance_neon.c @@ -17,168 +17,474 @@ #include "vpx_dsp/variance.h" #include "vpx_dsp/arm/mem_neon.h" -static const uint8_t bilinear_filters[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - // Process a block exactly 4 wide and a multiple of 2 high. 
-static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i; - for (i = 0; i < output_height; i += 2) { - const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line); - const uint8x8_t src_1 = - load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line); - const uint16x8_t a = vmull_u8(src_0, f0); - const uint16x8_t b = vmlal_u8(a, src_1, f1); - const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(output_ptr, out); - src_ptr += 2 * src_pixels_per_line; - output_ptr += 8; - } +static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + i -= 2; + } while (i != 0); } // Process a block exactly 8 wide and any height. 
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i; - for (i = 0; i < output_height; ++i) { - const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); - const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); - const uint16x8_t a = vmull_u8(src_0, f0); - const uint16x8_t b = vmlal_u8(a, src_1, f1); - const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(output_ptr, out); - src_ptr += src_pixels_per_line; - output_ptr += 8; - } +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += src_stride; + dst_ptr += 8; + } while (--i != 0); } // Process a block which is a mutiple of 16 wide and any height. 
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { - const uint8x8_t f0 = vdup_n_u8(filter[0]); - const uint8x8_t f1 = vdup_n_u8(filter[1]); - unsigned int i, j; - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 16) { - const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); - const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); - const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); - const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); - const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); - const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); - const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); - const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); - vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi)); - } - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } +static void var_filter_block2d_bil_large(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = + vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1); + uint16x8_t blend_h = + vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1); + uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3); + uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3); + vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi)); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int 
dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16, + dst_height, filter_offset); +} +static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32, + dst_height, filter_offset); +} +static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64, + dst_height, filter_offset); +} + +static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_width, int dst_height) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t avg = vrhaddq_u8(s0, s1); + vst1q_u8(dst_ptr + j, avg); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); } -// 4xM filter writes an extra row to fdata because it processes two rows at a -// time. -#define SUB_PIXEL_VARIANCENXM(n, m) \ - uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint8_t temp0[n * (m + (n == 4 ? 
2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else { \ - var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ - bilinear_filters[y_offset]); \ - } \ - return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse); \ +#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } -SUB_PIXEL_VARIANCENXM(4, 4) -SUB_PIXEL_VARIANCENXM(4, 8) -SUB_PIXEL_VARIANCENXM(8, 4) -SUB_PIXEL_VARIANCENXM(8, 8) -SUB_PIXEL_VARIANCENXM(8, 16) -SUB_PIXEL_VARIANCENXM(16, 8) -SUB_PIXEL_VARIANCENXM(16, 16) -SUB_PIXEL_VARIANCENXM(16, 32) -SUB_PIXEL_VARIANCENXM(32, 16) -SUB_PIXEL_VARIANCENXM(32, 32) -SUB_PIXEL_VARIANCENXM(32, 64) -SUB_PIXEL_VARIANCENXM(64, 32) -SUB_PIXEL_VARIANCENXM(64, 64) - -// 4xM filter writes an extra row to fdata because it processes two rows at a -// time. 
-#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \ - uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ +#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \ + sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ + yoffset); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ 
+ uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 4x<h> blocks are processed two rows at a time, so require an extra row of +// padding. +SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4. 
+static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + + uint8x8_t p = vld1_u8(second_pred); + uint8x8_t avg = vrhadd_u8(blend_u8, p); + + vst1_u8(dst_ptr, avg); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + second_pred += 2 * 4; + i -= 2; + } while (i != 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8. +static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + + uint8x8_t p = vld1_u8(second_pred); + uint8x8_t avg = vrhadd_u8(blend_u8, p); + + vst1_u8(dst_ptr, avg); + + src_ptr += src_stride; + dst_ptr += 8; + second_pred += 8; + } while (--i > 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for large blocks. 
+static void avg_pred_var_filter_block2d_bil_large( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = + vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1); + uint16x8_t blend_h = + vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1); + uint8x16_t blend_u8 = + vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3)); + + uint8x16_t p = vld1q_u8(second_pred); + uint8x16_t avg = vrhaddq_u8(blend_u8, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16. +static void avg_pred_var_filter_block2d_bil_w16( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32. +static void avg_pred_var_filter_block2d_bil_w32( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); +} + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64. 
+static void avg_pred_var_filter_block2d_bil_w64( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 64, dst_height, + filter_offset, second_pred); +} + +// Combine averaging subpel filter with vpx_comp_avg_pred. +static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, + const uint8_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t avg = vrhaddq_u8(s0, s1); + + uint8x16_t p = vld1q_u8(second_pred); + avg = vrhaddq_u8(avg, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of vpx_comp_avg_pred for blocks having width >= 16. +static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, + int dst_width, int dst_height, + const uint8_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) 
+ assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s = vld1q_u8(src_ptr + j); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(s, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ const uint8_t *second_pred) { \ - uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ - bilinear_filters[y_offset]); \ - } else { \ - var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \ - bilinear_filters[x_offset]); \ - var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ - bilinear_filters[y_offset]); \ - } \ - \ - vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ - \ - return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \ + xoffset); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } + +#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, 
\ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + if (xoffset == 0) { \ + uint8_t tmp[w * h]; \ + if (yoffset == 0) { \ + avg_pred(src, tmp, source_stride, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \ + source_stride, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } else { \ + avg_pred_var_filter_block2d_bil_w##w( \ + src, tmp, source_stride, source_stride, h, yoffset, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \ + second_pred); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \ + xoffset, second_pred); \ + return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ 
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ } -SUB_PIXEL_AVG_VARIANCENXM(4, 4) -SUB_PIXEL_AVG_VARIANCENXM(4, 8) -SUB_PIXEL_AVG_VARIANCENXM(8, 4) -SUB_PIXEL_AVG_VARIANCENXM(8, 8) -SUB_PIXEL_AVG_VARIANCENXM(8, 16) -SUB_PIXEL_AVG_VARIANCENXM(16, 8) -SUB_PIXEL_AVG_VARIANCENXM(16, 16) -SUB_PIXEL_AVG_VARIANCENXM(16, 32) -SUB_PIXEL_AVG_VARIANCENXM(32, 16) -SUB_PIXEL_AVG_VARIANCENXM(32, 32) -SUB_PIXEL_AVG_VARIANCENXM(32, 64) -SUB_PIXEL_AVG_VARIANCENXM(64, 32) -SUB_PIXEL_AVG_VARIANCENXM(64, 64) +// 4x<h> blocks are processed two rows at a time, so require an extra row of +// padding. +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) diff --git a/libvpx/vpx_dsp/arm/subtract_neon.c b/libvpx/vpx_dsp/arm/subtract_neon.c index 612897e24..2c008e48a 100644 --- a/libvpx/vpx_dsp/arm/subtract_neon.c +++ b/libvpx/vpx_dsp/arm/subtract_neon.c @@ -79,3 +79,59 @@ void vpx_subtract_block_neon(int rows, int cols, int16_t *diff, } while (r); } } + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, + const uint8_t *src8_ptr, + 
ptrdiff_t src_stride, + const uint8_t *pred8_ptr, + ptrdiff_t pred_stride, int bd) { + int r = rows, c; + uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr); + (void)bd; + + if (cols >= 16) { + do { + for (c = 0; c < cols; c += 16) { + const uint16x8_t s0 = vld1q_u16(&src[c + 0]); + const uint16x8_t s1 = vld1q_u16(&src[c + 8]); + const uint16x8_t p0 = vld1q_u16(&pred[c + 0]); + const uint16x8_t p1 = vld1q_u16(&pred[c + 8]); + const uint16x8_t d0 = vsubq_u16(s0, p0); + const uint16x8_t d1 = vsubq_u16(s1, p1); + vst1q_s16(&diff_ptr[c + 0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff_ptr[c + 8], vreinterpretq_s16_u16(d1)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else if (cols >= 8) { + do { + for (c = 0; c < cols; c += 8) { + const uint16x8_t s = vld1q_u16(&src[c]); + const uint16x8_t p = vld1q_u16(&pred[c]); + const uint16x8_t d0 = vsubq_u16(s, p); + vst1q_s16(&diff_ptr[c], vreinterpretq_s16_u16(d0)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else if (cols >= 4) { + do { + for (c = 0; c < cols; c += 4) { + const uint16x4_t s = vld1_u16(&src[c]); + const uint16x4_t p = vld1_u16(&pred[c]); + const uint16x4_t v_diff = vsub_u16(s, p); + vst1_s16(&diff_ptr[c], vreinterpret_s16_u16(v_diff)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/arm/transpose_neon.h b/libvpx/vpx_dsp/arm/transpose_neon.h index c098ad31b..41d44f2b1 100644 --- a/libvpx/vpx_dsp/arm/transpose_neon.h +++ b/libvpx/vpx_dsp/arm/transpose_neon.h @@ -568,6 +568,40 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); } +// Transpose 8x8 to a new location. +static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) { + // Swap 16 bit elements. 
+ const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements. + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), + vreinterpretq_s32_s16(c3.val[0])); + const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), + vreinterpretq_s32_s16(c3.val[1])); + + // Swap 64 bit elements + const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); + const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); + const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); + const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + + b[0] = e0.val[0]; + b[1] = e1.val[0]; + b[2] = e2.val[0]; + b[3] = e3.val[0]; + b[4] = e0.val[1]; + b[5] = e1.val[1]; + b[6] = e2.val[1]; + b[7] = e3.val[1]; +} + static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, @@ -787,6 +821,51 @@ static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1, a7->val[1] = c7.val[1]; } +// Helper transpose function for highbd FDCT variants +static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/, + int32x4_t *right /*[8]*/, + int32x4_t *out_left /*[8]*/, + int32x4_t *out_right /*[8]*/) { + int32x4x2_t out[8]; + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] 
= left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + out_left[0] = out[0].val[0]; + out_left[1] = out[1].val[0]; + out_left[2] = out[2].val[0]; + out_left[3] = out[3].val[0]; + out_left[4] = out[4].val[0]; + out_left[5] = out[5].val[0]; + out_left[6] = out[6].val[0]; + out_left[7] = out[7].val[0]; + out_right[0] = out[0].val[1]; + out_right[1] = out[1].val[1]; + out_right[2] = out[2].val[1]; + out_right[3] = out[3].val[1]; + out_right[4] = out[4].val[1]; + out_right[5] = out[5].val[1]; + out_right[6] = out[6].val[1]; + out_right[7] = out[7].val[1]; +} + static INLINE void transpose_u8_16x8( const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, diff --git a/libvpx/vpx_dsp/arm/variance_neon.c b/libvpx/vpx_dsp/arm/variance_neon.c index 7b93f142b..3ccc4e807 100644 --- a/libvpx/vpx_dsp/arm/variance_neon.c +++ b/libvpx/vpx_dsp/arm/variance_neon.c @@ -19,345 +19,357 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1) +#if defined(__ARM_FEATURE_DOTPROD) // Process a block of width 4 four rows at a time. 
-static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i; - uint32x4_t sum_a = vdupq_n_u32(0); - uint32x4_t sum_b = vdupq_n_u32(0); +static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); - for (i = 0; i < h; i += 4) { - const uint8x16_t a = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t b = load_unaligned_u8q(ref_ptr, ref_stride); + int i = h; + do { + const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t abs_diff = vabdq_u8(a, b); + const uint8x16_t abs_diff = vabdq_u8(s, r); sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); - sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); src_ptr += 4 * src_stride; ref_ptr += 4 * ref_stride; - } + i -= 4; + } while (i != 0); - *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); *sse = horizontal_add_uint32x4(sse_u32); } -// Process a block of any size where the width is divisible by 16. -static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int w, - int h, uint32_t *sse, int *sum) { - int i, j; - uint32x4_t sum_a = vdupq_n_u32(0); - uint32x4_t sum_b = vdupq_n_u32(0); +// Process a block of width 8 two rows at a time. 
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); uint32x4_t sse_u32 = vdupq_n_u32(0); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - const uint8x16_t a = vld1q_u8(src_ptr + j); - const uint8x16_t b = vld1q_u8(ref_ptr + j); + int i = h; + do { + const uint8x16_t s = + vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); + const uint8x16_t r = + vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); - const uint8x16_t abs_diff = vabdq_u8(a, b); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 16 one row at a time. 
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1)); - sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1)); - } src_ptr += src_stride; ref_ptr += ref_stride; - } + } while (--i != 0); - *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b))); + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); *sse = horizontal_add_uint32x4(sse_u32); } -// Process a block of width 8 two rows at a time. -static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i = 0; - uint32x2_t sum_a = vdup_n_u32(0); - uint32x2_t sum_b = vdup_n_u32(0); - uint32x2_t sse_lo_u32 = vdup_n_u32(0); - uint32x2_t sse_hi_u32 = vdup_n_u32(0); +// Process a block of any size where the width is divisible by 16. 
+static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int w, int h, uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + int i = h; do { - const uint8x8_t a_0 = vld1_u8(src_ptr); - const uint8x8_t a_1 = vld1_u8(src_ptr + src_stride); - const uint8x8_t b_0 = vld1_u8(ref_ptr); - const uint8x8_t b_1 = vld1_u8(ref_ptr + ref_stride); - - const uint8x8_t abs_diff_0 = vabd_u8(a_0, b_0); - const uint8x8_t abs_diff_1 = vabd_u8(a_1, b_1); - sse_lo_u32 = vdot_u32(sse_lo_u32, abs_diff_0, abs_diff_0); - sse_hi_u32 = vdot_u32(sse_hi_u32, abs_diff_1, abs_diff_1); - - sum_a = vdot_u32(sum_a, a_0, vdup_n_u8(1)); - sum_b = vdot_u32(sum_b, b_0, vdup_n_u8(1)); - sum_a = vdot_u32(sum_a, a_1, vdup_n_u8(1)); - sum_b = vdot_u32(sum_b, b_1, vdup_n_u8(1)); - - src_ptr += src_stride + src_stride; - ref_ptr += ref_stride + ref_stride; - i += 2; - } while (i < h); + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + j += 16; + } while (j < w); - *sum = horizontal_add_int32x2(vreinterpret_s32_u32(vsub_u32(sum_a, sum_b))); - *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32)); + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); } -#else +static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} -// The variance helper functions use int16_t for sum. 
8 values are accumulated -// and then added (at which point they expand up to int32_t). To avoid overflow, -// there can be no more than 32767 / 255 ~= 128 values accumulated in each -// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32 -// rows = 128. Asserts have been added to each function to warn against reaching -// this limit. +static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} -// Process a block of width 4 four rows at a time. -static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i; +#else // !defined(__ARM_FEATURE_DOTPROD) + +// Process a block of width 4 two rows at a time. +static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); + int32x4_t sse_s32 = vdupq_n_s32(0); + int i = h; - // Since width is only 4, sum_s16 only loads a half row per loop. + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows. 
assert(h <= 256); - for (i = 0; i < h; i += 4) { - const uint8x16_t a_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t b_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint16x8_t diff_lo_u16 = - vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); - const uint16x8_t diff_hi_u16 = - vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); - - const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); - const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); - - sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); - sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); + do { + const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), - vget_low_s16(diff_lo_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), - vget_high_s16(diff_lo_s16)); + sum_s16 = vaddq_s16(sum_s16, diff); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), - vget_low_s16(diff_hi_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), - vget_high_s16(diff_hi_s16)); + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - } + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); *sum = horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sse = (uint32_t)horizontal_add_int32x4(sse_s32); } -// Process a block of any size where the width is divisible by 16. -static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int w, - int h, uint32_t *sse, int *sum) { - int i, j; +// Process a block of width 8 one row at a time. 
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); - - // The loop loads 16 values at a time but doubles them up when accumulating - // into sum_s16. - assert(w / 8 * h <= 128); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr + j); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr + j); - - const uint16x8_t diff_lo_u16 = - vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); - const uint16x8_t diff_hi_u16 = - vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8)); - - const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16); - const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16); - - sum_s16 = vaddq_s16(sum_s16, diff_lo_s16); - sum_s16 = vaddq_s16(sum_s16, diff_hi_s16); - - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16), - vget_low_s16(diff_lo_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16), - vget_high_s16(diff_lo_s16)); - - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16), - vget_low_s16(diff_hi_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), - vget_high_s16(diff_hi_s16)); - } + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int i = h; + + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128 + assert(h <= 128); + + do { + const uint8x8_t s = vld1_u8(src_ptr); + const uint8x8_t r = vld1_u8(ref_ptr); + const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); + + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + src_ptr += src_stride; ref_ptr += ref_stride; - } + } while (--i != 0); *sum = 
horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); } -// Process a block of width 8 two rows at a time. -static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, int h, - uint32_t *sse, int *sum) { - int i = 0; - int16x8_t sum_s16 = vdupq_n_s16(0); - int32x4_t sse_lo_s32 = vdupq_n_s32(0); - int32x4_t sse_hi_s32 = vdupq_n_s32(0); +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int i = h; - // Each column has it's own accumulator entry in sum_s16. + // Number of rows we can process before 'sum_s16' accumulators overflow: + // 32767 / 255 ~= 128, so 128 16-wide rows. 
assert(h <= 128); do { - const uint8x8_t a_0_u8 = vld1_u8(src_ptr); - const uint8x8_t a_1_u8 = vld1_u8(src_ptr + src_stride); - const uint8x8_t b_0_u8 = vld1_u8(ref_ptr); - const uint8x8_t b_1_u8 = vld1_u8(ref_ptr + ref_stride); - const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8); - const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8); - const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16); - const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16); - sum_s16 = vaddq_s16(sum_s16, diff_0_s16); - sum_s16 = vaddq_s16(sum_s16, diff_1_s16); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16), - vget_low_s16(diff_0_s16)); - sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16), - vget_low_s16(diff_1_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16), - vget_high_s16(diff_0_s16)); - sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16), - vget_high_s16(diff_1_s16)); - src_ptr += src_stride + src_stride; - ref_ptr += ref_stride + ref_stride; - i += 2; + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + const int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int16x8(vaddq_s16(sum_s16[0], sum_s16[1])); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +// Process a block of any size where 
the width is divisible by 16. +static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int w, int h, int h_limit, + unsigned int *sse, int *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit + // accumulator overflows. After hitting this limit we accumulate into 32-bit + // elements. + int h_tmp = h > h_limit ? h_limit : h; + + int i = 0; + do { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + do { + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + const int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + j += 16; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + i++; + } while (i < h_tmp); + + sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]); + sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]); + + h_tmp += h_limit; } while (i < h); - *sum = horizontal_add_int16x8(sum_s16); - *sse = horizontal_add_uint32x4( - vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32))); + *sum = horizontal_add_int32x4(sum_s32); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); } -#endif +static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + 
uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum); +} + +static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum); +} + +#endif // defined(__ARM_FEATURE_DOTPROD) void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { - variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); + variance_8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); } void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum); + variance_16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, sum); } -#define VARIANCENXM(n, m, shift) \ - unsigned int vpx_variance##n##x##m##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, unsigned int *sse) { \ - int sum; \ - if (n == 4) \ - variance_neon_w4x4(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ - &sum); \ - else if (n == 8) \ - variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ - &sum); \ - else \ - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, n, m, sse, \ - &sum); \ - if (n * m < 16 * 16) \ - return *sse - ((sum * sum) >> shift); \ - else \ - return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ +#define VARIANCE_WXH_NEON(w, h, shift) \ + unsigned int vpx_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } -VARIANCENXM(4, 4, 4) -VARIANCENXM(4, 8, 5) 
-VARIANCENXM(8, 4, 5) -VARIANCENXM(8, 8, 6) -VARIANCENXM(8, 16, 7) -VARIANCENXM(16, 8, 7) -VARIANCENXM(16, 16, 8) -VARIANCENXM(16, 32, 9) -VARIANCENXM(32, 16, 9) -VARIANCENXM(32, 32, 10) - -unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32, &sse1, - &sum1); - variance_neon_w16(src_ptr + (32 * src_stride), src_stride, - ref_ptr + (32 * ref_stride), ref_stride, 32, 32, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); -} +VARIANCE_WXH_NEON(4, 4, 4) +VARIANCE_WXH_NEON(4, 8, 5) -unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, - &sum1); - variance_neon_w16(src_ptr + (16 * src_stride), src_stride, - ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); -} +VARIANCE_WXH_NEON(8, 4, 5) +VARIANCE_WXH_NEON(8, 8, 6) +VARIANCE_WXH_NEON(8, 16, 7) -unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - - variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, - &sum1); - variance_neon_w16(src_ptr + (16 * src_stride), src_stride, - ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w16(src_ptr + (16 * 2 * src_stride), src_stride, - ref_ptr + (16 * 2 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w16(src_ptr + (16 * 3 * src_stride), 
src_stride, - ref_ptr + (16 * 3 * ref_stride), ref_stride, 64, 16, &sse2, - &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); -} +VARIANCE_WXH_NEON(16, 8, 7) +VARIANCE_WXH_NEON(16, 16, 8) +VARIANCE_WXH_NEON(16, 32, 9) + +VARIANCE_WXH_NEON(32, 16, 9) +VARIANCE_WXH_NEON(32, 32, 10) +VARIANCE_WXH_NEON(32, 64, 11) + +VARIANCE_WXH_NEON(64, 32, 11) +VARIANCE_WXH_NEON(64, 64, 12) -#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1) +#if defined(__ARM_FEATURE_DOTPROD) unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, @@ -421,7 +433,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return vget_lane_u32(sse, 0); } -#else +#else // !defined(__ARM_FEATURE_DOTPROD) unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, @@ -518,4 +530,4 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse)); } -#endif +#endif // defined(__ARM_FEATURE_DOTPROD) diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c index 06b58c438..b4cdd58c7 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c @@ -31,8 +31,9 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. 
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) +#if defined(__aarch64__) && \ + (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) + DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, @@ -53,9 +54,176 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1, - int8x8_t *a2, int8x8_t *a3, - int8x16_t *b, +#if defined(__ARM_FEATURE_MATMUL_INT8) + +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int32x4_t t0, t1, t2, t3; + int16x8_t t01, t23; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, permute_tbl); + t1 = convolve8_4_usdot(s1, filters, permute_tbl); + t2 = convolve8_4_usdot(s2, filters, permute_tbl); + t3 = convolve8_4_usdot(s3, filters, permute_tbl); + t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); + t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); + d01 = vqrshrun_n_s16(t01, 7); + d23 = vqrshrun_n_s16(t23, 7); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x3_t permute_tbl = 
vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, permute_tbl); + d1 = convolve8_8_usdot(s1, filters, permute_tbl); + d2 = convolve8_8_usdot(s2, filters, permute_tbl); + d3 = convolve8_8_usdot(s3, filters, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } +} + +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int32x4_t t0, t1, t2, t3; + int16x8_t t01, t23; + uint8x8_t d01, d23, dd01, dd23; + dd01 = vdup_n_u8(0); + dd23 = vdup_n_u8(0); + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, permute_tbl); + t1 = convolve8_4_usdot(s1, filters, permute_tbl); + t2 = convolve8_4_usdot(s2, filters, permute_tbl); + t3 = convolve8_4_usdot(s3, filters, permute_tbl); + t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); + t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); + d01 = vqrshrun_n_s16(t01, 7); + d23 = vqrshrun_n_s16(t23, 7); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * 
dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, permute_tbl); + d1 = convolve8_8_usdot(s1, filters, permute_tbl); + d2 = convolve8_8_usdot(s2, filters, permute_tbl); + d3 = convolve8_8_usdot(s3, filters, permute_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } +} + +static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b, const uint8x16_t permute_tbl) { /* Transpose 8-bit elements and concatenate result rows as follows: * a0: 00, 01, 02, 03, XX, XX, XX, XX @@ -70,13 +238,13 @@ static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1, * inline helper is called many times from the same parent function. 
*/ - int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } }; - *b = vqtbl2q_s8(samples, permute_tbl); + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b = vqtbl2q_u8(samples, permute_tbl); } -static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1, - int8x8_t *a2, int8x8_t *a3, - int8x16_t *b0, int8x16_t *b1, +static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b0, uint8x16_t *b1, const uint8x16x2_t permute_tbl) { /* Transpose 8-bit elements and concatenate result rows as follows: * a0: 00, 01, 02, 03, 04, 05, 06, 07 @@ -92,11 +260,364 @@ static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1, * inline helper is called many times from the same parent function. */ - int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } }; - *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); +} + +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int32x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + 
load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); + d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
*/ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } } +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int32x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. 
+ */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); + d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
*/ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + +#else // !defined(__ARM_FEATURE_MATMUL_INT8) + void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -125,33 +646,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int16x8_t t01, t23; uint8x8_t d01, d23; - s0 = vld1q_u8(src); - src += src_stride; - s1 = vld1q_u8(src); - src += src_stride; - s2 = vld1q_u8(src); - src += src_stride; - s3 = vld1q_u8(src); - src += src_stride; - - t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); d01 = vqrshrun_n_s16(t01, 7); d23 = vqrshrun_n_s16(t23, 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + 
store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -166,20 +676,18 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; do { - s0 = vld1q_u8(s + 0 * src_stride); - s1 = vld1q_u8(s + 1 * src_stride); - s2 = vld1q_u8(s + 2 * src_stride); - s3 = vld1q_u8(s + 3 * src_stride); + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); - d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); - d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); - d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); + d0 = + convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); + d1 = + convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); + d2 = + convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); + d3 = + convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; @@ -222,20 +730,12 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, dd01 = vdup_n_u8(0); dd23 = vdup_n_u8(0); - s0 = vld1q_u8(src); - src += src_stride; - s1 = vld1q_u8(src); - src += src_stride; - s2 = vld1q_u8(src); - src += src_stride; - s3 = vld1q_u8(src); - src += src_stride; - - t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl); + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + t0 = 
convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); d01 = vqrshrun_n_s16(t01, 7); @@ -243,17 +743,15 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -268,29 +766,25 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; do { - s0 = vld1q_u8(s + 0 * src_stride); - s1 = vld1q_u8(s + 1 * src_stride); - s2 = vld1q_u8(s + 2 * src_stride); - s3 = vld1q_u8(s + 3 * src_stride); - - d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl); - d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl); - d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl); - d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl); - - dd0 = vld1_u8(d + 0 * dst_stride); - dd1 = vld1_u8(d + 1 * dst_stride); - dd2 = vld1_u8(d + 2 * dst_stride); - dd3 = vld1_u8(d + 3 * dst_stride); + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = + 
convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); + d1 = + convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); + d2 = + convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); + d3 = + convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + d0 = vrhadd_u8(d0, dd0); d1 = vrhadd_u8(d1, dd1); d2 = vrhadd_u8(d2, dd2); d3 = vrhadd_u8(d3, dd3); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s += 8; d += 8; @@ -303,6 +797,49 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. 
+ */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b = vqtbl2q_s8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b0, + int8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); +} + void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -333,14 +870,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int32x4_t d0, d1, d2, d3; uint8x8_t d01, d23; - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - src += 4 * src_stride; - t4 = vld1_u8(src); - src += src_stride; - t5 = vld1_u8(src); - src += src_stride; - t6 = vld1_u8(src); - src += src_stride; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. 
*/ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -357,13 +888,13 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ - transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; @@ -375,7 +906,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); /* Merge new data into block from previous iteration. 
*/ samples_LUT.val[0] = s3456; @@ -384,22 +915,15 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); - + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. 
*/ @@ -409,6 +933,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s3456 = s78910; src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -426,14 +951,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s += 4 * src_stride; - t4 = vld1_u8(s); - s += src_stride; - t5 = vld1_u8(s); - s += src_stride; - t6 = vld1_u8(s); - s += src_stride; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -450,19 +969,19 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ - transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, tran_concat_tbl); - transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, tran_concat_tbl); - transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, tran_concat_tbl); - transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, tran_concat_tbl); - transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, tran_concat_tbl); - transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, tran_concat_tbl); do { @@ -475,7 +994,7 @@ void 
vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, tran_concat_tbl); /* Merge new data into block from previous iteration. */ @@ -491,18 +1010,16 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. 
*/ @@ -556,14 +1073,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int32x4_t d0, d1, d2, d3; uint8x8_t d01, d23, dd01, dd23; - load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - src += 4 * src_stride; - t4 = vld1_u8(src); - src += src_stride; - t5 = vld1_u8(src); - src += src_stride; - t6 = vld1_u8(src); - src += src_stride; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -580,13 +1091,13 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ - transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl); + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; @@ -598,7 +1109,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - 
transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl); + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); /* Merge new data into block from previous iteration. */ samples_LUT.val[0] = s3456; @@ -607,27 +1118,21 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters); - + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); dd01 = load_u8(dst + 0 * dst_stride, dst_stride); dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + d01 = vrhadd_u8(d01, dd01); d23 = vrhadd_u8(d23, dd23); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. 
*/ @@ -637,6 +1142,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s3456 = s78910; src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h > 0); } else { @@ -654,14 +1160,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s = src; d = dst; - load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); - s += 4 * src_stride; - t4 = vld1_u8(s); - s += src_stride; - t5 = vld1_u8(s); - s += src_stride; - t6 = vld1_u8(s); - s += src_stride; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); @@ -678,19 +1178,19 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ - transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi, + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, tran_concat_tbl); - transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi, + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, tran_concat_tbl); - transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi, + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, tran_concat_tbl); - transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi, + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi, + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, tran_concat_tbl); - transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi, + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, tran_concat_tbl); - transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi, + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, tran_concat_tbl); do { @@ -703,7 +1203,7 @@ void 
vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi, + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, tran_concat_tbl); /* Merge new data into block from previous iteration. */ @@ -719,28 +1219,23 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - - dd0 = vld1_u8(d + 0 * dst_stride); - dd1 = vld1_u8(d + 1 * dst_stride); - dd2 = vld1_u8(d + 2 * dst_stride); - dd3 = vld1_u8(d + 3 * dst_stride); + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + d0 = vrhadd_u8(d0, dd0); d1 = vrhadd_u8(d1, dd1); d2 = vrhadd_u8(d2, dd2); d3 = vrhadd_u8(d3, dd3); - vst1_u8(d + 0 * dst_stride, d0); - vst1_u8(d + 1 * dst_stride, d1); - vst1_u8(d + 2 * dst_stride, d2); - vst1_u8(d + 3 * dst_stride, d3); + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. 
*/ @@ -764,29 +1259,11 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#else - -static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, - const uint8x8_t s0, const uint8x8_t s1, - const uint8x8_t s2, const uint8x8_t s3, - const uint8x8_t s4, const uint8x8_t s5, - const uint8x8_t s6, const uint8x8_t s7) { - vst1_u8(s, s0); - s += p; - vst1_u8(s, s1); - s += p; - vst1_u8(s, s2); - s += p; - vst1_u8(s, s3); - s += p; - vst1_u8(s, s4); - s += p; - vst1_u8(s, s5); - s += p; - vst1_u8(s, s6); - s += p; - vst1_u8(s, s7); -} +#endif // defined(__ARM_FEATURE_MATMUL_INT8) + +#else // !(defined(__aarch64__) && + // (defined(__ARM_FEATURE_DOTPROD) || + // defined(__ARM_FEATURE_MATMUL_INT8))) void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -808,16 +1285,13 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, if (h == 4) { uint8x8_t d01, d23; - int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, - d1, d2, d3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; int16x8_t tt0, tt1, tt2, tt3; __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - filter4 = vdup_lane_s16(vget_high_s16(filters), 0); load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); @@ -849,14 +1323,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = vget_low_s16(tt2); s10 = vget_low_s16(tt3); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, 
s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -883,8 +1353,6 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, w -= 4; } while (w != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int width; const uint8_t *s; uint8x8_t t4, t5, t6, t7; @@ -927,14 +1395,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); transpose_u8_8x4(&t0, &t1, &t2, &t3); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0); @@ -1002,22 +1466,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, 
filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, - filter4); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, - filter4); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, - filter4); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, - filter3, filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); @@ -1061,8 +1517,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, if (h == 4) { uint8x8_t d01, d23; - int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, - d1, d2, d3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; int16x8_t tt0, tt1, tt2, tt3; uint32x4_t d0123 = vdupq_n_u32(0); @@ -1070,8 +1525,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - filter4 = vdup_lane_s16(vget_high_s16(filters), 0); load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); @@ -1103,14 +1556,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s9 = 
vget_low_s16(tt2); s10 = vget_low_s16(tt3); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -1140,8 +1589,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, w -= 4; } while (w != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int width; const uint8_t *s; uint8x8_t t4, t5, t6, t7; @@ -1186,14 +1633,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); transpose_u8_8x4(&t0, &t1, &t2, &t3); @@ -1276,22 +1719,14 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = 
vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, - filter4); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, - filter4); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, - filter4); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, - filter3, filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); @@ -1349,8 +1784,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; @@ -1387,14 +1820,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, 
s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -1417,8 +1846,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int height; const uint8_t *s; uint8_t *d; @@ -1469,14 +1896,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); vst1_u8(d, t0); d += dst_stride; @@ -1521,8 +1944,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, 
s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; uint32x4_t d0123 = vdupq_n_u32(0); @@ -1560,14 +1981,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); @@ -1598,8 +2015,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, h -= 4; } while (h != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int height; const uint8_t *s; uint8_t *d; @@ -1651,14 +2066,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, 
s9, filters); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vcombine_u8(t0, t1); d23 = vcombine_u8(t2, t3); @@ -1694,4 +2105,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } } -#endif +#endif // #if defined(__aarch64__) && + // (defined(__ARM_FEATURE_DOTPROD) || + // defined(__ARM_FEATURE_MATMUL_INT8)) diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h index 857b6d54e..ed7f18053 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h @@ -16,69 +16,12 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" -static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, - uint8x8_t *const s0, uint8x8_t *const s1, - uint8x8_t *const s2, uint8x8_t *const s3) { - *s0 = vld1_u8(s); - s += p; - *s1 = vld1_u8(s); - s += p; - *s2 = vld1_u8(s); - s += p; - *s3 = vld1_u8(s); -} - -static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p, - uint8x8_t *const s0, uint8x8_t *const s1, - uint8x8_t *const s2, uint8x8_t *const s3, - uint8x8_t *const s4, uint8x8_t *const s5, - uint8x8_t *const s6, uint8x8_t *const s7) { - *s0 = vld1_u8(s); - s += p; - *s1 = vld1_u8(s); - s += p; - *s2 = vld1_u8(s); - s += p; - *s3 = vld1_u8(s); - s += p; - *s4 = vld1_u8(s); - s += p; - *s5 = vld1_u8(s); - s += p; - *s6 = vld1_u8(s); - s += p; - *s7 = vld1_u8(s); -} +#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) -static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, - uint8x16_t *const s0, uint8x16_t *const s1, - uint8x16_t *const s2, uint8x16_t *const s3, - uint8x16_t *const s4, uint8x16_t *const s5, - uint8x16_t *const s6, uint8x16_t *const s7) { - *s0 = vld1q_u8(s); - s += p; - *s1 = vld1q_u8(s); - s += p; - *s2 = vld1q_u8(s); - s += p; - *s3 = vld1q_u8(s); - s += p; - *s4 = vld1q_u8(s); - s += p; - *s5 = vld1q_u8(s); - s += p; - *s6 = vld1q_u8(s); - s += p; - *s7 = vld1q_u8(s); -} - -#if defined(__aarch64__) && 
defined(__ARM_FEATURE_DOTPROD) && \ - (__ARM_FEATURE_DOTPROD == 1) - -static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo, - const int8x16_t samples_hi, - const int32x4_t correction, - const int8x8_t filters) { +static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int32x4_t correction, + const int8x8_t filters) { /* Sample range-clamping and permutation are performed by the caller. */ int32x4_t sum; @@ -90,11 +33,11 @@ static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo, return sum; } -static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x2_t permute_tbl) { +static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x2_t permute_tbl) { int8x16_t clamped_samples, permuted_samples[2]; int32x4_t sum; @@ -115,12 +58,12 @@ static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples, return sum; } -static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo, - const int8x16_t samples0_hi, - const int8x16_t samples1_lo, - const int8x16_t samples1_hi, - const int32x4_t correction, - const int8x8_t filters) { +static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int32x4_t correction, + const int8x8_t filters) { /* Sample range-clamping and permutation are performed by the caller. 
*/ int32x4_t sum0, sum1; int16x8_t sum; @@ -138,11 +81,11 @@ static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo, return vqrshrun_n_s16(sum, 7); } -static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples, - const int8x8_t filters, - const int32x4_t correction, - const uint8x16_t range_limit, - const uint8x16x3_t permute_tbl) { +static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { int8x16_t clamped_samples, permuted_samples[3]; int32x4_t sum0, sum1; int16x8_t sum; @@ -171,15 +114,98 @@ static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples, return vqrshrun_n_s16(sum, 7); } -#endif +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) + +#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) + +static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + /* Sample permutation is performed by the caller. */ + int32x4_t sum; + + sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); + + /* Narrowing and packing is performed by the caller. */ + return sum; +} + +static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + uint8x16_t permuted_samples[2]; + int32x4_t sum; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + + /* Accumulate dot product into 'correction' to account for range clamp. 
*/ + sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); + + /* Narrowing and packing is performed by the caller. */ + return sum; +} + +static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, + const uint8x16_t samples0_hi, + const uint8x16_t samples1_lo, + const uint8x16_t samples1_hi, + const int8x8_t filters) { + /* Sample permutation is performed by the caller. */ + int32x4_t sum0, sum1; + int16x8_t sum; + + /* First 4 output values. */ + sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); + sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); + /* Second 4 output values. */ + sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); + sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, 7); +} + +static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + uint8x16_t permuted_samples[3]; + int32x4_t sum0, sum1; + int16x8_t sum; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + /* First 4 output values. */ + sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + /* Second 4 output values. */ + sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); + + /* Narrow and re-pack. 
*/ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, 7); +} + +#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, const int16x4_t s6, const int16x4_t s7, - const int16x8_t filters, - const int16x4_t filter3, - const int16x4_t filter4) { + const int16x8_t filters) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); int16x4_t sum; @@ -190,8 +216,8 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, sum = vmla_lane_s16(sum, s5, filters_hi, 1); sum = vmla_lane_s16(sum, s6, filters_hi, 2); sum = vmla_lane_s16(sum, s7, filters_hi, 3); - sum = vqadd_s16(sum, vmul_s16(s3, filter3)); - sum = vqadd_s16(sum, vmul_s16(s4, filter4)); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filters_lo, 3)); + sum = vqadd_s16(sum, vmul_lane_s16(s4, filters_hi, 0)); return sum; } @@ -199,9 +225,7 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, const int16x8_t s6, const int16x8_t s7, - const int16x8_t filters, - const int16x8_t filter3, - const int16x8_t filter4) { + const int16x8_t filters) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); int16x8_t sum; @@ -212,15 +236,13 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, sum = vmlaq_lane_s16(sum, s5, filters_hi, 1); sum = vmlaq_lane_s16(sum, s6, filters_hi, 2); sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); - sum = vqaddq_s16(sum, vmulq_s16(s3, filter3)); - sum = vqaddq_s16(sum, vmulq_s16(s4, filter4)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0)); return vqrshrun_n_s16(sum, 7); } static 
INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, const int16x8_t filters) { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); int16x8_t ss[8]; ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); @@ -233,7 +255,7 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7])); return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7], - filters, filter3, filter4); + filters); } #endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c index 8edf8a66e..b8e3c5e54 100644 --- a/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c +++ b/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_ports/mem.h" @@ -38,8 +39,6 @@ static INLINE void scaledconvolve_horiz_w4( const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; if (x_q4 & SUBPEL_MASK) { const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t s[8], d; int16x8_t ss[4]; int16x4_t t[8], tt; @@ -61,7 +60,7 @@ static INLINE void scaledconvolve_horiz_w4( t[7] = vget_high_s16(ss[3]); tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], - filters, filter3, filter4); + filters); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0); } else { @@ -167,8 +166,6 @@ static INLINE void scaledconvolve_vert_w4( if (y_q4 & SUBPEL_MASK) { const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); - const int16x4_t filter3 = 
vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); uint8x8_t s[8], d; int16x4_t t[8], tt; @@ -183,8 +180,7 @@ static INLINE void scaledconvolve_vert_w4( t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); - tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters, - filter3, filter4); + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); } else { diff --git a/libvpx/vpx_dsp/avg.c b/libvpx/vpx_dsp/avg.c index 1c45e8a73..954015407 100644 --- a/libvpx/vpx_dsp/avg.c +++ b/libvpx/vpx_dsp/avg.c @@ -7,6 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ + +#include <assert.h> #include <stdlib.h> #include "./vpx_dsp_rtcd.h" @@ -344,6 +346,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height) { int idx; const int norm_factor = height >> 1; + assert(height >= 2); for (idx = 0; idx < 16; ++idx) { int i; hbuf[idx] = 0; diff --git a/libvpx/vpx_dsp/bitwriter.h b/libvpx/vpx_dsp/bitwriter.h index 04084af8f..5f1ee69ec 100644 --- a/libvpx/vpx_dsp/bitwriter.h +++ b/libvpx/vpx_dsp/bitwriter.h @@ -13,6 +13,7 @@ #include <stdio.h> +#include "vpx_ports/compiler_attributes.h" #include "vpx_ports/mem.h" #include "vpx_dsp/prob.h" @@ -35,7 +36,9 @@ typedef struct vpx_writer { void vpx_start_encode(vpx_writer *br, uint8_t *source); void vpx_stop_encode(vpx_writer *br); -static INLINE void vpx_write(vpx_writer *br, int bit, int probability) { +static INLINE VPX_NO_UNSIGNED_SHIFT_CHECK void vpx_write(vpx_writer *br, + int bit, + int probability) { unsigned int split; int count = br->count; unsigned int range = br->range; diff --git a/libvpx/vpx_dsp/loongarch/quantize_lsx.c 
b/libvpx/vpx_dsp/loongarch/quantize_lsx.c index 2fc33b06b..77be0bb4f 100644 --- a/libvpx/vpx_dsp/loongarch/quantize_lsx.c +++ b/libvpx/vpx_dsp/loongarch/quantize_lsx.c @@ -59,7 +59,6 @@ static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff, } static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, - __m128i zbin_mask0, __m128i zbin_mask1, const int16_t *scan, int index, __m128i zero) { const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); @@ -68,8 +67,6 @@ static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, __m128i scan1 = __lsx_vld(scan + index + 8, 0); __m128i eob0, eob1; - scan0 = __lsx_vsub_h(scan0, zbin_mask0); - scan1 = __lsx_vsub_h(scan1, zbin_mask1); eob0 = __lsx_vandn_v(zero_coeff0, scan0); eob1 = __lsx_vandn_v(zero_coeff1, scan1); return __lsx_vmax_h(eob0, eob1); @@ -138,7 +135,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, dequant = __lsx_vilvh_d(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); // AC only loop. 
while (index < n_coeffs) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -161,8 +158,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); index += 16; @@ -221,7 +217,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr); dequant = __lsx_vilvh_d(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); // AC only loop. for (index = 16; index < 32 * 32; index += 16) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -243,8 +239,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8 + index); - eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); } diff --git a/libvpx/vpx_dsp/loopfilter.c b/libvpx/vpx_dsp/loopfilter.c index 995602831..d6504aab1 100644 --- a/libvpx/vpx_dsp/loopfilter.c +++ b/libvpx/vpx_dsp/loopfilter.c @@ -159,7 +159,7 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } -static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, +static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, 
uint8_t *oq2, uint8_t *oq3) { @@ -232,8 +232,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } -static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t flat2, uint8_t *op7, uint8_t *op6, +static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint8_t *op7, uint8_t *op6, uint8_t *op5, uint8_t *op4, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, @@ -505,7 +505,7 @@ void vpx_highbd_lpf_vertical_4_dual_c( bd); } -static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, +static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, int bd) { @@ -584,8 +584,8 @@ void vpx_highbd_lpf_vertical_8_dual_c( bd); } -static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t flat2, uint16_t *op7, uint16_t *op6, +static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint16_t *op7, uint16_t *op6, uint16_t *op5, uint16_t *op4, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, diff --git a/libvpx/vpx_dsp/mips/macros_msa.h b/libvpx/vpx_dsp/mips/macros_msa.h index 3c2f50c79..d54ce5368 100644 --- a/libvpx/vpx_dsp/mips/macros_msa.h +++ b/libvpx/vpx_dsp/mips/macros_msa.h @@ -83,31 +83,33 @@ val_lh_m; \ }) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ - uint32_t val_lw_m; \ - \ - __asm__ __volatile__("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ - "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ - : [val_lw_m] "=&r"(val_lw_m) \ - : [psrc_lw_m] "r"(psrc_lw_m)); \ - \ - val_lw_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ + uint32_t val_lw_m; \ 
+ \ + __asm__ __volatile__( \ + "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ + "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ + : [val_lw_m] "=&r"(val_lw_m) \ + : [psrc_lw_m] "r"(psrc_lw_m)); \ + \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ - uint64_t val_ld_m = 0; \ - \ - __asm__ __volatile__("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ - "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ - : [val_ld_m] "=&r"(val_ld_m) \ - : [psrc_ld_m] "r"(psrc_ld_m)); \ - \ - val_ld_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint64_t val_ld_m = 0; \ + \ + __asm__ __volatile__( \ + "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ + "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ + : [val_ld_m] "=&r"(val_ld_m) \ + : [psrc_ld_m] "r"(psrc_ld_m)); \ + \ + val_ld_m; \ }) #else // !(__mips == 64) #define LD(psrc) \ diff --git a/libvpx/vpx_dsp/ppc/quantize_vsx.c b/libvpx/vpx_dsp/ppc/quantize_vsx.c index 7cdcbeb40..ab71f6e23 100644 --- a/libvpx/vpx_dsp/ppc/quantize_vsx.c +++ b/libvpx/vpx_dsp/ppc/quantize_vsx.c @@ -78,11 +78,10 @@ static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack); } -static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask, +static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, const int16_t *iscan_ptr, int index) { int16x8_t scan = vec_vsx_ld(index, iscan_ptr); bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16); - scan = vec_sub(scan, mask); return vec_andc(scan, zero_coeff); } @@ -139,8 +138,8 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); - eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), - nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, 
iscan_ptr, 16)); if (n_coeffs > 16) { int index = 16; @@ -177,10 +176,9 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); - eob = - vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); - eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), - nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, iscan_ptr, off2)); eob = vec_max(eob, eob2); index += 24; @@ -252,8 +250,8 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = vec_splat(dequant, 1); // remove DC from dequant vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr); - eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), - nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, iscan_ptr, 16)); do { int16x8_t coeff2, coeff2_abs, qcoeff2, eob2; @@ -286,9 +284,9 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr); vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr); - eob = vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); - eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), - nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, iscan_ptr, off2)); eob = vec_max(eob, eob2); // 24 int16_t is 48 bytes diff --git a/libvpx/vpx_dsp/psnr.c b/libvpx/vpx_dsp/psnr.c index 48bac0450..f0d4e927a 100644 --- a/libvpx/vpx_dsp/psnr.c +++ b/libvpx/vpx_dsp/psnr.c @@ 
-26,57 +26,44 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) { /* TODO(yaowu): The block_variance calls the unoptimized versions of variance() * and highbd_8_variance(). It should not. */ -static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, unsigned int *sse, - int *sum) { +static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h) { int i, j; - - *sum = 0; - *sse = 0; + int64_t sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } + + return sse; } #if CONFIG_VP9_HIGHBITDEPTH -static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, uint64_t *sse, int64_t *sum) { +static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h) { int i, j; + int64_t sse = 0; uint16_t *a = CONVERT_TO_SHORTPTR(a8); uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } -} -static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, unsigned int *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, - &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; + return sse; } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -85,26 +72,23 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, const int dw = width % 16; const int dh = height % 16; int64_t total_sse = 0; - unsigned int sse = 0; - int sum = 0; int x, y; if (dw > 0) { - encoder_variance(&a[width - dw], 
a_stride, &b[width - dw], b_stride, dw, - height, &sse, &sum); - total_sse += sse; + total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride, + dw, height); } if (dh > 0) { - encoder_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, width - dw, dh, - &sse, &sum); - total_sse += sse; + total_sse += + encoder_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; + unsigned int sse; for (x = 0; x < width / 16; ++x) { vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; @@ -146,22 +130,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, int x, y; const int dw = width % 16; const int dh = height % 16; - unsigned int sse = 0; - int sum = 0; if (dw > 0) { - encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], - b_stride, dw, height, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height); } if (dh > 0) { - encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; + unsigned int sse; for (x = 0; x < width / 16; ++x) { vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; diff --git a/libvpx/vpx_dsp/variance.c b/libvpx/vpx_dsp/variance.c index 30b55dcb4..ce1e8382b 100644 --- a/libvpx/vpx_dsp/variance.c +++ b/libvpx/vpx_dsp/variance.c @@ -549,9 +549,9 @@ HIGHBD_MSE(16, 8) HIGHBD_MSE(8, 16) HIGHBD_MSE(8, 8) -void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint16_t *pred, - int width, int height, const 
uint16_t *ref, - int ref_stride) { +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { int i, j; for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { diff --git a/libvpx/vpx_dsp/vpx_dsp.mk b/libvpx/vpx_dsp/vpx_dsp.mk index 13999af04..1fd9495cf 100644 --- a/libvpx/vpx_dsp/vpx_dsp.mk +++ b/libvpx/vpx_dsp/vpx_dsp.mk @@ -226,19 +226,19 @@ DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h ifeq ($(VPX_ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm endif -DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h -DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct4x4_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct8x8_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_dct32x32_lsx.c endif # !CONFIG_VP9_HIGHBITDEPTH @@ -326,11 +326,14 @@ DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.h DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c +DSP_SRCS-$(HAVE_AVX2) += x86/quantize_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_quantize_intrin_avx2.c +DSP_SRCS-$(HAVE_NEON) += 
arm/highbd_quantize_neon.c endif # avg @@ -374,6 +377,7 @@ DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/subtract_avx2.c DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm @@ -388,6 +392,9 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS @@ -425,6 +432,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index d3c668f9a..8725821b6 100644 --- a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -527,6 +527,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4_1 sse2 neon/; + specialize qw/vpx_highbd_fdct4x4_1 neon/; + $vpx_highbd_fdct4x4_1_neon=vpx_fdct4x4_1_neon; add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct8x8 neon sse2/; @@ -550,27 +552,29 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct32x32_1 sse2 neon/; add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct4x4 sse2/; 
+ specialize qw/vpx_highbd_fdct4x4 sse2 neon/; add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct8x8 sse2/; + specialize qw/vpx_highbd_fdct8x8 sse2 neon/; add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct8x8_1 neon/; $vpx_highbd_fdct8x8_1_neon=vpx_fdct8x8_1_neon; add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct16x16 sse2/; + specialize qw/vpx_highbd_fdct16x16 sse2 neon/; add_proto qw/void vpx_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vpx_highbd_fdct16x16_1 neon/; add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct32x32 sse2/; + specialize qw/vpx_highbd_fdct32x32 sse2 neon/; add_proto qw/void vpx_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_highbd_fdct32x32_rd sse2/; + specialize qw/vpx_highbd_fdct32x32_rd sse2 neon/; add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vpx_highbd_fdct32x32_1 neon/; } else { add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct4x4 neon sse2 msa lsx/; @@ -711,17 +715,17 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx lsx/; + specialize qw/vpx_quantize_b neon sse2 ssse3 
avx avx2 vsx lsx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx lsx/; + specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b sse2/; + specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_highbd_quantize_b_32x32 sse2/; + specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER @@ -730,7 +734,7 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") { # Block subtraction # add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vpx_subtract_block neon msa mmi sse2 vsx lsx/; +specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/; # # Single block SAD @@ -795,7 +799,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { 
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/; add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_hadamard_32x32 sse2 avx2/; + specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/vpx_highbd_hadamard_8x8 avx2/; @@ -819,7 +823,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/; add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; - specialize qw/vpx_hadamard_32x32 sse2 avx2/; + specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/; add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon msa/; @@ -935,46 +939,49 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Block subtraction # add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd"; + specialize qw/vpx_highbd_subtract_block neon avx2/; # # Single block SAD # add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad64x64 sse2/; + specialize qw/vpx_highbd_sad64x64 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad64x32 sse2/; + specialize qw/vpx_highbd_sad64x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x64 sse2/; + specialize qw/vpx_highbd_sad32x64 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t 
*src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x32 sse2/; + specialize qw/vpx_highbd_sad32x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad32x16 sse2/; + specialize qw/vpx_highbd_sad32x16 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x32 sse2/; + specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x16 sse2/; + specialize qw/vpx_highbd_sad16x16 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad16x8 sse2/; + specialize qw/vpx_highbd_sad16x8 sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad8x16 sse2/; + specialize qw/vpx_highbd_sad8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad8x8 sse2/; + specialize qw/vpx_highbd_sad8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - specialize qw/vpx_highbd_sad8x4 sse2/; + specialize qw/vpx_highbd_sad8x4 sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad4x8 neon/; add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride"; + specialize qw/vpx_highbd_sad4x4 neon/; # # Avg @@ -988,83 +995,85 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad64x64_avg sse2/; + specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad64x32_avg sse2/; + specialize qw/vpx_highbd_sad64x32_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x64_avg sse2/; + specialize qw/vpx_highbd_sad32x64_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x32_avg sse2/; + specialize qw/vpx_highbd_sad32x32_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad32x16_avg sse2/; + specialize qw/vpx_highbd_sad32x16_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x32_avg sse2/; + specialize qw/vpx_highbd_sad16x32_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t 
*second_pred"; - specialize qw/vpx_highbd_sad16x16_avg sse2/; + specialize qw/vpx_highbd_sad16x16_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad16x8_avg sse2/; + specialize qw/vpx_highbd_sad16x8_avg sse2 neon avx2/; add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad8x16_avg sse2/; + specialize qw/vpx_highbd_sad8x16_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad8x8_avg sse2/; + specialize qw/vpx_highbd_sad8x8_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - specialize qw/vpx_highbd_sad8x4_avg sse2/; + specialize qw/vpx_highbd_sad8x4_avg sse2 neon/; add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vpx_highbd_sad4x8_avg neon/; add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + specialize qw/vpx_highbd_sad4x4_avg neon/; # # Multi-block SAD, comparing a reference to N independent blocks # add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad64x64x4d sse2/; + specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const 
ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad64x32x4d sse2/; + specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x64x4d sse2/; + specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x32x4d sse2/; + specialize qw/vpx_highbd_sad32x32x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad32x16x4d sse2/; + specialize qw/vpx_highbd_sad32x16x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x32x4d sse2/; + specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x16x4d sse2/; + specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad16x8x4d sse2/; + specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/; add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad8x16x4d sse2/; + specialize qw/vpx_highbd_sad8x16x4d sse2 neon/; add_proto qw/void 
vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad8x8x4d sse2/; + specialize qw/vpx_highbd_sad8x8x4d sse2 neon/; add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad8x4x4d sse2/; + specialize qw/vpx_highbd_sad8x4x4d sse2 neon/; add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad4x8x4d sse2/; + specialize qw/vpx_highbd_sad4x8x4d sse2 neon/; add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; - specialize qw/vpx_highbd_sad4x4x4d sse2/; + specialize qw/vpx_highbd_sad4x4x4d sse2 neon/; # # Structured Similarity (SSIM) @@ -1232,369 +1241,397 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance64x64 sse2/; + specialize qw/vpx_highbd_12_variance64x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance64x32 sse2/; + specialize qw/vpx_highbd_12_variance64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x64 sse2/; + specialize qw/vpx_highbd_12_variance32x64 sse2 neon/; add_proto qw/unsigned int 
vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x32 sse2/; + specialize qw/vpx_highbd_12_variance32x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance32x16 sse2/; + specialize qw/vpx_highbd_12_variance32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x32 sse2/; + specialize qw/vpx_highbd_12_variance16x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x16 sse2/; + specialize qw/vpx_highbd_12_variance16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance16x8 sse2/; + specialize qw/vpx_highbd_12_variance16x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance8x16 sse2/; + specialize qw/vpx_highbd_12_variance8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_variance8x8 sse2/; + specialize qw/vpx_highbd_12_variance8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize 
qw/vpx_highbd_12_variance8x4 neon/; add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance4x8 neon/; add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance4x4 neon/; add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance64x64 sse2/; + specialize qw/vpx_highbd_10_variance64x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance64x32 sse2/; + specialize qw/vpx_highbd_10_variance64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x64 sse2/; + specialize qw/vpx_highbd_10_variance32x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x32 sse2/; + specialize qw/vpx_highbd_10_variance32x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance32x16 sse2/; + specialize qw/vpx_highbd_10_variance32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x32 sse2/; + specialize 
qw/vpx_highbd_10_variance16x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x16 sse2/; + specialize qw/vpx_highbd_10_variance16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance16x8 sse2/; + specialize qw/vpx_highbd_10_variance16x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance8x16 sse2/; + specialize qw/vpx_highbd_10_variance8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_variance8x8 sse2/; + specialize qw/vpx_highbd_10_variance8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance8x4 neon/; add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance4x8 neon/; add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance4x4 neon/; add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance64x64 sse2/; + specialize qw/vpx_highbd_8_variance64x64 sse2 neon/; add_proto qw/unsigned int 
vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance64x32 sse2/; + specialize qw/vpx_highbd_8_variance64x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x64 sse2/; + specialize qw/vpx_highbd_8_variance32x64 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x32 sse2/; + specialize qw/vpx_highbd_8_variance32x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance32x16 sse2/; + specialize qw/vpx_highbd_8_variance32x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x32 sse2/; + specialize qw/vpx_highbd_8_variance16x32 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x16 sse2/; + specialize qw/vpx_highbd_8_variance16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance16x8 sse2/; + specialize qw/vpx_highbd_8_variance16x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance8x16 
sse2/; + specialize qw/vpx_highbd_8_variance8x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_variance8x8 sse2/; + specialize qw/vpx_highbd_8_variance8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance8x4 neon/; add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance4x8 neon/; add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance4x4 neon/; add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_8_get16x16var sse2/; + specialize qw/vpx_highbd_8_get16x16var sse2 neon/; add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_8_get8x8var sse2/; + specialize qw/vpx_highbd_8_get8x8var sse2 neon/; add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_10_get16x16var sse2/; + specialize qw/vpx_highbd_10_get16x16var sse2 neon/; add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_10_get8x8var sse2/; + specialize qw/vpx_highbd_10_get8x8var sse2 neon/; add_proto qw/void vpx_highbd_12_get16x16var/, "const 
uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_12_get16x16var sse2/; + specialize qw/vpx_highbd_12_get16x16var sse2 neon/; add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_highbd_12_get8x8var sse2/; + specialize qw/vpx_highbd_12_get8x8var sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse16x16 sse2/; + specialize qw/vpx_highbd_8_mse16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_mse16x8 neon/; add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_mse8x16 neon/; add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_8_mse8x8 sse2/; + specialize qw/vpx_highbd_8_mse8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse16x16 sse2/; + specialize qw/vpx_highbd_10_mse16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_mse16x8 neon/; add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_mse8x16 neon/; add_proto qw/unsigned 
int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_10_mse8x8 sse2/; + specialize qw/vpx_highbd_10_mse8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse16x16 sse2/; + specialize qw/vpx_highbd_12_mse16x16 sse2 neon/; add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_mse16x8 neon/; add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_mse8x16 neon/; add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_highbd_12_mse8x8 sse2/; + specialize qw/vpx_highbd_12_mse8x8 sse2 neon/; add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride"; + specialize qw/vpx_highbd_comp_avg_pred neon sse2/; # # Subpixel Variance # add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2 neon/; add_proto qw/uint32_t 
vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int 
src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize 
qw/vpx_highbd_10_sub_pixel_variance8x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2/; + specialize 
qw/vpx_highbd_8_sub_pixel_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/; add_proto 
qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2/; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/; add_proto qw/uint32_t 
vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize 
qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2/; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const 
uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, 
int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2/; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const 
uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/; add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/; } # CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/x86/avg_intrin_avx2.c b/libvpx/vpx_dsp/x86/avg_intrin_avx2.c index 3f4f577a2..b2e01319d 100644 --- a/libvpx/vpx_dsp/x86/avg_intrin_avx2.c +++ b/libvpx/vpx_dsp/x86/avg_intrin_avx2.c @@ -104,7 +104,7 @@ void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); - src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride)); src32[0] = _mm256_cvtepi16_epi32(src16[0]); src32[1] = _mm256_cvtepi16_epi32(src16[1]); @@ -304,7 +304,7 @@ static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride, src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); - src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride)); hadamard_col8x2_avx2(src, 0); hadamard_col8x2_avx2(src, 1); diff --git a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c index 9da2f34c9..015c11a1f 100644 --- a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c +++ 
b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c @@ -164,7 +164,7 @@ unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) { s0 = _mm_add_epi32(s0, s1); s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8)); s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4)); - avg = _mm_cvtsi128_si32(s0); + avg = (unsigned int)_mm_cvtsi128_si32(s0); return (avg + 32) >> 6; } @@ -275,7 +275,7 @@ static INLINE void hadamard_8x8_sse2(const int16_t *src_diff, src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride)); hadamard_col8_sse2(src, 0); hadamard_col8_sse2(src, 1); diff --git a/libvpx/vpx_dsp/x86/convolve_avx2.h b/libvpx/vpx_dsp/x86/convolve_avx2.h index 99bc9637f..ebee964b1 100644 --- a/libvpx/vpx_dsp/x86/convolve_avx2.h +++ b/libvpx/vpx_dsp/x86/convolve_avx2.h @@ -129,9 +129,8 @@ static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1, static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1, __m128i *const dst_ptr_2, const __m256i *const src) { - *((uint32_t *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); - *((uint32_t *)(dst_ptr_2)) = - _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); + *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); + *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); } static INLINE __m256i mm256_round_epi32(const __m256i *const src, diff --git a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h index 3f158b5e4..f3a802029 100644 --- a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h +++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h @@ -89,7 +89,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) { const __m256i k__cospi_m21_p11 = 
pair256_set_epi16(-cospi_21_64, cospi_11_64); const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); - const __m256i kZero = _mm256_set1_epi16(0); + const __m256i kZero = _mm256_setzero_si256(); const __m256i kOne = _mm256_set1_epi16(1); // Do the two transform/transpose passes int pass; diff --git a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h index ac1246faa..bf350b6da 100644 --- a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h +++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h @@ -100,7 +100,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); const __m128i kOne = _mm_set1_epi16(1); // Do the two transform/transpose passes diff --git a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h index 78cf9111d..1d07391b0 100644 --- a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h +++ b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -249,7 +249,7 @@ static INLINE void highbd_idct16_4col_stage7(const __m128i *const in, static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1, const int bd) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); // Faster than _mm_set1_epi16((1 << bd) - 1). 
const __m128i one = _mm_set1_epi16(1); const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); diff --git a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c index d265fc1a9..9f45623de 100644 --- a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -18,7 +18,7 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { __m128i lbounded; __m128i retval; - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); __m128i t80, max, min; @@ -51,7 +51,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); __m128i blimit_v, limit_v, thresh_v; __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; @@ -492,7 +492,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch)); @@ -720,7 +720,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c new file mode 100644 index 000000000..8edddd637 --- /dev/null +++ 
b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" + +static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { + const __m128i sign = _mm_srai_epi16(*p, 15); + const __m128i dc = _mm_unpacklo_epi16(*p, sign); + const __m128i ac = _mm_unpackhi_epi16(*p, sign); + *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static VPX_FORCE_INLINE void update_qp(__m256i *qp) { + int i; + for (i = 0; i < 5; ++i) { + qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11); + } +} + +static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *dequant_ptr, + const int16_t *quant_shift_ptr, + __m256i *qp, int log_scale) { + const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); + const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); + const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr); + init_one_qp(&zbin, &qp[0]); + init_one_qp(&round, &qp[1]); + init_one_qp(&quant, &qp[2]); + init_one_qp(&dequant, &qp[3]); + init_one_qp(&quant_shift, &qp[4]); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1))); + qp[0] = _mm256_add_epi32(qp[0], rnd); + qp[0] = _mm256_srai_epi32(qp[0], log_scale); + + qp[1] = _mm256_add_epi32(qp[1], rnd); + qp[1] = _mm256_srai_epi32(qp[1], 
log_scale); + } + // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when + // calculating the zbin mask. + qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1)); +} + +// Note: +// *x is vector multiplied by *y which is 16 int32_t parallel multiplication +// and right shift 16. The output, 16 int32_t is save in *p. +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32(const __m256i *x, + const __m256i *y) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr, + __m256i eobmax, + __m256i nz_mask) { + const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); + const __m256i packed_nz_mask_perm = + _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); + const __m256i iscan = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); + const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm); + return _mm256_max_epi16(eobmax, nz_iscan); +} + +// Get the max eob from the lower 128 bits. 
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); +#if defined(_MSC_VER) && (_MSC_VER < 1910) + return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff; +#else + return (uint16_t)_mm256_extract_epi16(eob, 0); +#endif +} + +static VPX_FORCE_INLINE void quantize(const __m256i *qp, + const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); + + if (_mm256_movemask_epi8(zbin_mask) == 0) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + return; + } + { + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); + const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]); + const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); + const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]); + const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } +} + +void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t 
*dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int step = 8; + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + (void)scan; + + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0); + + quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} + +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, + const __m256i *y, + int log_scale) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE void quantize_b_32x32( + const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); + + if (_mm256_movemask_epi8(zbin_mask) == 0) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + return; + } + + { + const __m256i tmp_rnd = + 
_mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); + // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0); + const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); + // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], 1); + const __m256i abs_dq = + _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), 1); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } +} + +void vpx_highbd_quantize_b_32x32_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const unsigned int step = 8; + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + (void)scan; + + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); + + quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 4535a0f7a..ae1981a83 100644 --- 
a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -25,7 +25,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, j, non_zero_regs = (int)count / 4, eob_i = -1; + int i, j, non_zero_regs = (int)count / 4, eob_i = 0; __m128i zbins[2]; __m128i nzbins[2]; @@ -82,13 +82,14 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; const uint32_t abs_qcoeff = (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + qcoeff_ptr[k] = + (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j]; dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; } } } - *eob_ptr = eob_i + 1; + *eob_ptr = eob_i; } void vpx_highbd_quantize_b_32x32_sse2( @@ -101,7 +102,7 @@ void vpx_highbd_quantize_b_32x32_sse2( __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; - int i, eob = -1; + int i, eob = 0; const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; @@ -143,10 +144,10 @@ void vpx_highbd_quantize_b_32x32_sse2( const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; } - *eob_ptr = eob + 1; + *eob_ptr = eob; } #endif diff --git a/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c b/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c new file mode 100644 index 000000000..947b5e977 --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <immintrin.h> // AVX2 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/, + uint32_t sad_array[4]) { + const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); + const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); + const __m256i t2 = _mm256_hadd_epi32(t0, t1); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2), + _mm256_extractf128_si256(t2, 1)); + _mm_storeu_si128((__m128i *)sad_array, sum); +} + +static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + int x; + + for (x = 0; x < 4; ++x) { + __m256i r[4]; + r[0] = _mm256_loadu_si256((const __m256i *)refs[x]); + r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32)); + r[3] = _mm256_loadu_si256((const 
__m256i *)(refs[x] + 48)); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3)); + + // sum every abs diff + sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1])); + sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3])); + } + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +#define HIGHBD_SAD64XNX4D(n) \ + void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *refs[4]; \ + __m256i sums_16[4]; \ + __m256i sums_32[4]; \ + int i; \ + \ + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ + sums_32[0] = _mm256_setzero_si256(); \ + sums_32[1] = _mm256_setzero_si256(); \ + sums_32[2] = _mm256_setzero_si256(); \ + sums_32[3] = _mm256_setzero_si256(); \ + \ + for (i = 0; i < (n / 2); ++i) { \ + sums_16[0] = _mm256_setzero_si256(); \ + sums_16[1] = _mm256_setzero_si256(); \ + sums_16[2] = _mm256_setzero_si256(); \ + sums_16[3] = _mm256_setzero_si256(); \ + \ + highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \ + \ + /* sums_16 will outrange after 2 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32[0] = _mm256_add_epi32( \ + sums_32[0], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[0], 1)))); \ + sums_32[1] = _mm256_add_epi32( \ + sums_32[1], \ + _mm256_add_epi32( \ + 
_mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[1], 1)))); \ + sums_32[2] = _mm256_add_epi32( \ + sums_32[2], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[2], 1)))); \ + sums_32[3] = _mm256_add_epi32( \ + sums_32[3], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[3], 1)))); \ + \ + src += src_stride << 1; \ + } \ + calc_final_4(sums_32, sad_array); \ + } + +// 64x64 +HIGHBD_SAD64XNX4D(64) + +// 64x32 +HIGHBD_SAD64XNX4D(32) + +static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; i++) { + __m256i r[8]; + + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16)); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16)); + r[4] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16)); + r[6] = _mm256_loadu_si256((const __m256i *)refs[3]); + r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16)); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2)); + r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s)); + r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2)); + r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s)); 
+ r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2)); + + // sum every abs diff + sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1])); + sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3])); + sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5])); + sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7])); + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +#define HIGHBD_SAD32XNX4D(n) \ + void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *refs[4]; \ + __m256i sums_16[4]; \ + __m256i sums_32[4]; \ + int i; \ + \ + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ + sums_32[0] = _mm256_setzero_si256(); \ + sums_32[1] = _mm256_setzero_si256(); \ + sums_32[2] = _mm256_setzero_si256(); \ + sums_32[3] = _mm256_setzero_si256(); \ + \ + for (i = 0; i < (n / 8); ++i) { \ + sums_16[0] = _mm256_setzero_si256(); \ + sums_16[1] = _mm256_setzero_si256(); \ + sums_16[2] = _mm256_setzero_si256(); \ + sums_16[3] = _mm256_setzero_si256(); \ + \ + highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32[0] = _mm256_add_epi32( \ + sums_32[0], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[0], 1)))); \ + sums_32[1] = _mm256_add_epi32( \ + sums_32[1], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[1], 1)))); \ 
+ sums_32[2] = _mm256_add_epi32( \ + sums_32[2], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[2], 1)))); \ + sums_32[3] = _mm256_add_epi32( \ + sums_32[3], \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ + _mm256_cvtepu16_epi32( \ + _mm256_extractf128_si256(sums_16[3], 1)))); \ + \ + src += src_stride << 3; \ + } \ + calc_final_4(sums_32, sad_array); \ + } + +// 32x64 +HIGHBD_SAD32XNX4D(64) + +// 32x32 +HIGHBD_SAD32XNX4D(32) + +// 32x16 +HIGHBD_SAD32XNX4D(16) + +static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; i++) { + __m256i r[4]; + + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[2] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[3] = _mm256_loadu_si256((const __m256i *)refs[3]); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s)); + + // sum every abs diff + sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]); + sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]); + sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]); + sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]); + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t 
*refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < 2; ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 4; + } + calc_final_4(sums_32, sad_array); +} + +void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + 
refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + + { + __m256i sums_32[4]; + sums_32[0] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))); + sums_32[1] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))); + sums_32[2] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))); + sums_32[3] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))); + calc_final_4(sums_32, sad_array); + } +} + +void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + { + __m256i sums_32[4]; + sums_32[0] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))); + sums_32[1] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 
1))); + sums_32[2] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))); + sums_32[3] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))); + calc_final_4(sums_32, sad_array); + } +} diff --git a/libvpx/vpx_dsp/x86/highbd_sad_avx2.c b/libvpx/vpx_dsp/x86/highbd_sad_avx2.c new file mode 100644 index 000000000..231b67f80 --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_sad_avx2.c @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <immintrin.h> +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { + const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8)); + const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4)); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1), + _mm256_extractf128_si256(t1, 1)); + return (unsigned int)_mm_cvtsi128_si32(sum); +} + +static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + const __m256i r0 = _mm256_loadu_si256((const __m256i 
*)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2)); + const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3)); + // sum every abs diff + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); + + src += src_stride; + ref += ref_stride; + } +} + +#define HIGHBD_SAD64XN(n) \ + unsigned int vpx_highbd_sad64x##n##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 2); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \ + \ + /* sums_16 will outrange after 2 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 1; \ + ref += ref_stride << 1; \ + } \ + return calc_final(sums_32); \ + } + +// 64x64 +HIGHBD_SAD64XN(64) + +// 64x32 +HIGHBD_SAD64XN(32) + +static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = 
_mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride; + ref += ref_stride; + } +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 8); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 3; \ + ref += ref_stride << 3; \ + } \ + return calc_final(sums_32); \ + } + +// 32x64 +HIGHBD_SAD32XN(64) + +// 32x32 +HIGHBD_SAD32XN(32) + +// 32x16 +HIGHBD_SAD32XN(16) + +static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; i += 2) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); + const __m256i r0 = _mm256_loadu_si256((const 
__m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride << 1; + ref += ref_stride << 1; + } +} + +unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < 2; ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 4; + ref += ref_stride << 4; + } + return calc_final(sums_32); +} + +unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = 
CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +// AVG ------------------------------------------------------------------------- +static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32)); + const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + const __m256i avg2 = _mm256_avg_epu16(r2, x2); + const __m256i avg3 = _mm256_avg_epu16(r3, x3); + // absolute differences between every ref/pred avg to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2)); + const __m256i abs_diff3 = 
_mm256_abs_epi16(_mm256_sub_epi16(avg3, s3)); + // sum every abs diff + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); + + src += src_stride; + ref += ref_stride; + sec += 64; + } +} + +#define HIGHBD_SAD64XN_AVG(n) \ + unsigned int vpx_highbd_sad64x##n##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 2); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \ + \ + /* sums_16 will outrange after 2 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 1; \ + ref += ref_stride << 1; \ + sec += 64 << 1; \ + } \ + return calc_final(sums_32); \ + } + +// 64x64 +HIGHBD_SAD64XN_AVG(64) + +// 64x32 +HIGHBD_SAD64XN_AVG(32) + +static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 
16)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + // absolute differences between every ref/pred avg to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride; + ref += ref_stride; + sec += 32; + } +} + +#define HIGHBD_SAD32XN_AVG(n) \ + unsigned int vpx_highbd_sad32x##n##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 8); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 3; \ + ref += ref_stride << 3; \ + sec += 32 << 3; \ + } \ + return calc_final(sums_32); \ + } + +// 32x64 +HIGHBD_SAD32XN_AVG(64) + +// 32x32 +HIGHBD_SAD32XN_AVG(32) + +// 32x16 +HIGHBD_SAD32XN_AVG(16) + +static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; i += 2) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); + const 
__m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride << 1; + ref += ref_stride << 1; + sec += 32; + } +} + +unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < 2; ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 4; + ref += ref_stride << 4; + sec += 16 << 4; + } + return calc_final(sums_32); +} + +unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_16 = 
_mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} diff --git a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c index 7c8d79b09..381e0ad19 100644 --- a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <emmintrin.h> // SSE2 #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -559,3 +560,49 @@ FNS(sse2) #undef FNS #undef FN + +void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + if (width > 8) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]); + const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]); + const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]); + const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]); + _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0)); + _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1)); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + for (i = 0; i < height; i += 2) { + const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]); + const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]); + const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]); + const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]); + _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0)); + _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1)); + comp_pred += 8 << 1; + pred += 8 << 1; + ref += ref_stride << 1; + } + } else { + assert(width == 4); + for (i = 0; i < height; i += 2) { + const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]); + const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]); + const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]); + const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]); + _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0)); + _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1)); + comp_pred += 4 << 1; + pred += 4 << 1; + ref += ref_stride << 1; + } + } +} diff --git 
a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c index 4b02da966..f42b3df84 100644 --- a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -243,7 +243,7 @@ void iadst8_sse2(__m128i *const in) { const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); __m128i s[8], u[16], v[8], w[16]; // transpose @@ -546,7 +546,7 @@ void vpx_iadst16_8col_sse2(__m128i *const in) { const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); u[0] = _mm_unpacklo_epi16(in[15], in[0]); u[1] = _mm_unpackhi_epi16(in[15], in[0]); diff --git a/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/libvpx/vpx_dsp/x86/loopfilter_avx2.c index be391992a..a58fb6553 100644 --- a/libvpx/vpx_dsp/x86/loopfilter_avx2.c +++ b/libvpx/vpx_dsp/x86/loopfilter_avx2.c @@ -18,7 +18,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch, const unsigned char *limit, const unsigned char *thresh) { __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; @@ -372,7 +372,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch, const unsigned char *limit, const unsigned char *thresh) { __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, 
q3, q4; diff --git a/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/libvpx/vpx_dsp/x86/loopfilter_sse2.c index 347c9fdbe..6ea34cdd1 100644 --- a/libvpx/vpx_dsp/x86/loopfilter_sse2.c +++ b/libvpx/vpx_dsp/x86/loopfilter_sse2.c @@ -106,7 +106,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i limit_v = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), _mm_loadl_epi64((const __m128i *)limit)); @@ -140,7 +140,7 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i limit_v = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), _mm_loadl_epi64((const __m128i *)limit)); @@ -232,7 +232,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); const __m128i limit_v = _mm_load_si128((const __m128i *)limit); @@ -594,7 +594,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); const __m128i limit_v = _mm_load_si128((const __m128i *)limit); @@ -932,7 +932,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch, 
DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); const __m128i limit_v = _mm_load_si128((const __m128i *)limit); const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); @@ -1152,7 +1152,7 @@ void vpx_lpf_horizontal_8_dual_sse2( DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i blimit = _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0), _mm_load_si128((const __m128i *)blimit1)); @@ -1406,7 +1406,7 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch, const __m128i thresh = _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0), _mm_load_si128((const __m128i *)thresh1)); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i mask, hev, flat; diff --git a/libvpx/vpx_dsp/x86/mem_sse2.h b/libvpx/vpx_dsp/x86/mem_sse2.h index 8b6d4d1dd..031f361a4 100644 --- a/libvpx/vpx_dsp/x86/mem_sse2.h +++ b/libvpx/vpx_dsp/x86/mem_sse2.h @@ -27,13 +27,13 @@ static INLINE int32_t loadu_int32(const void *src) { } static INLINE __m128i load_unaligned_u32(const void *a) { - uint32_t val; + int val; memcpy(&val, a, sizeof(val)); return _mm_cvtsi32_si128(val); } static INLINE void store_unaligned_u32(void *const a, const __m128i v) { - const uint32_t val = _mm_cvtsi128_si32(v); + const int val = _mm_cvtsi128_si32(v); memcpy(a, &val, sizeof(val)); } diff --git a/libvpx/vpx_dsp/x86/post_proc_sse2.c b/libvpx/vpx_dsp/x86/post_proc_sse2.c index d1029afc4..119fa7cd1 100644 --- a/libvpx/vpx_dsp/x86/post_proc_sse2.c +++ 
b/libvpx/vpx_dsp/x86/post_proc_sse2.c @@ -36,7 +36,7 @@ void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, __m128i s = _mm_loadl_epi64((__m128i *)dst); __m128i sum, sumsq_0, sumsq_1; __m128i tmp_0, tmp_1; - __m128i below_context; + __m128i below_context = _mm_setzero_si128(); s = _mm_unpacklo_epi8(s, zero); diff --git a/libvpx/vpx_dsp/x86/quantize_avx.c b/libvpx/vpx_dsp/x86/quantize_avx.c index 706e4e641..7d8352721 100644 --- a/libvpx/vpx_dsp/x86/quantize_avx.c +++ b/libvpx/vpx_dsp/x86/quantize_avx.c @@ -93,8 +93,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. @@ -134,8 +133,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } @@ -229,8 +227,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. 
@@ -272,8 +269,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/libvpx/vpx_dsp/x86/quantize_avx2.c b/libvpx/vpx_dsp/x86/quantize_avx2.c new file mode 100644 index 000000000..28f7c9c7d --- /dev/null +++ b/libvpx/vpx_dsp/x86/quantize_avx2.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE void load_b_values_avx2( + const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, + __m256i *round, const int16_t *quant_ptr, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, + __m256i *shift, int log_scale) { + *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); + *zbin = _mm256_add_epi16(*zbin, rnd); + *zbin = _mm256_srai_epi16(*zbin, log_scale); + } + // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when + // calculating the zbin mask. 
(See quantize_b_logscale{0,1,2}_16) + *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); + + *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); + *round = _mm256_add_epi16(*round, rnd); + *round = _mm256_srai_epi16(*round, log_scale); + } + + *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); + *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); + *shift = _mm256_permute4x64_epi64(*shift, 0x54); +} + +static VPX_FORCE_INLINE __m256i +load_coefficients_avx2(const tran_low_t *coeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + // typedef int32_t tran_low_t; + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8)); + return _mm256_packs_epi32(coeff1, coeff2); +#else + // typedef int16_t tran_low_t; + return _mm256_loadu_si256((const __m256i *)coeff_ptr); +#endif +} + +static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals, + tran_low_t *coeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + // typedef int32_t tran_low_t; + __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); + __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); + __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); + _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo); + _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); +#else + // typedef int16_t tran_low_t; + _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals); +#endif +} + +static VPX_FORCE_INLINE __m256i +quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t 
*dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant, + __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) { + const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); + const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); + const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); + + if (_mm256_movemask_epi8(v_zbin_mask) == 0) { + _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); +#endif // CONFIG_VP9_HIGHBITDEPTH + return _mm256_setzero_si256(); + } + { + // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0 + const __m256i v_tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); + + const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); + const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); + const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift); + const __m256i v_nz_mask = + _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); + const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant); + const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant); + + const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high); + const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high); +#else + const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant); +#endif + + store_coefficients_avx2(v_qcoeff, qcoeff_ptr); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi); +#else + store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr); +#endif + return v_nz_mask; + } +} + +static VPX_FORCE_INLINE __m256i 
get_max_lane_eob(const int16_t *iscan, + __m256i v_eobmax, + __m256i v_mask) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i v_iscan = _mm256_permute4x64_epi64( + _mm256_loadu_si256((const __m256i *)iscan), 0xD8); +#else + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); +#endif + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) { + const __m128i eob_lo = _mm256_castsi256_si128(eob256); + const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); + __m128i eob = _mm_max_epi16(eob_lo, eob_hi); + __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask; + __m256i v_eobmax = _mm256_setzero_si256(); + intptr_t count; + (void)scan; + + load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, + &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, + &v_quant_shift, 0); + // Do DC and first 15 AC. 
+ v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, + &v_dequant, &v_round, &v_zbin, &v_quant_shift); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + + v_round = _mm256_unpackhi_epi64(v_round, v_round); + v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); + v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); + v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); + v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); + + for (count = n_coeffs - 16; count > 0; count -= 16) { + coeff_ptr += 16; + qcoeff_ptr += 16; + dqcoeff_ptr += 16; + iscan += 16; + v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, + &v_dequant, &v_round, &v_zbin, &v_quant_shift); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + } + + *eob_ptr = accumulate_eob256(v_eobmax); +} + +static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant, + __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin, + __m256i *v_quant_shift, __m256i *v_eobmax) { + const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); + const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); + const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); + + if (_mm256_movemask_epi8(v_zbin_mask) == 0) { + _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); +#endif + return *v_eobmax; + } + { + // tmp = v_zbin_mask ? 
(int64_t)abs_coeff + round : 0 + const __m256i v_tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); + // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + // quant_shift_ptr[rc != 0]) >> 15); + const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); + const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); + const __m256i v_tmp32_hi = + _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1); + const __m256i v_tmp32_lo = + _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15); + const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo); + const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); + const __m256i v_sign_lo = + _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff); + const __m256i v_sign_hi = + _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff); + const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant); + const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant); + const __m256i v_dqcoeff_lo = _mm256_sign_epi32( + _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo); + const __m256i v_dqcoeff_hi = _mm256_sign_epi32( + _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi); + const __m256i v_nz_mask = + _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); + + store_coefficients_avx2(v_qcoeff, qcoeff_ptr); + +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi); +#else + store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi), + dqcoeff_ptr); +#endif + + return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask); + } +} + +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t 
*eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; + __m256i v_eobmax = _mm256_setzero_si256(); + intptr_t count; + (void)n_coeffs; + (void)scan; + + load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, + &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, + &v_quant_shift, 1); + + // Do DC and first 15 AC. + v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, + &v_quant, &v_dequant, &v_round, &v_zbin, + &v_quant_shift, &v_eobmax); + + v_round = _mm256_unpackhi_epi64(v_round, v_round); + v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); + v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); + v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); + v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); + + for (count = (32 * 32) - 16; count > 0; count -= 16) { + coeff_ptr += 16; + qcoeff_ptr += 16; + dqcoeff_ptr += 16; + iscan += 16; + v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, + &v_quant, &v_dequant, &v_round, &v_zbin, + &v_quant_shift, &v_eobmax); + } + + *eob_ptr = accumulate_eob256(v_eobmax); +} diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.c b/libvpx/vpx_dsp/x86/quantize_sse2.c index 459d95f28..9533e7916 100644 --- a/libvpx/vpx_dsp/x86/quantize_sse2.c +++ b/libvpx/vpx_dsp/x86/quantize_sse2.c @@ -76,7 +76,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); // AC only loop. 
while (index < n_coeffs) { @@ -106,8 +106,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); index += 16; diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.h b/libvpx/vpx_dsp/x86/quantize_sse2.h index afe2f924b..27bfb4e41 100644 --- a/libvpx/vpx_dsp/x86/quantize_sse2.h +++ b/libvpx/vpx_dsp/x86/quantize_sse2.h @@ -29,6 +29,15 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, *shift = _mm_load_si128((const __m128i *)shift_ptr); } +static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, + const int16_t *quant_ptr, __m128i *quant, + const int16_t *dequant_ptr, + __m128i *dequant) { + *round = _mm_load_si128((const __m128i *)round_ptr); + *quant = _mm_load_si128((const __m128i *)quant_ptr); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); +} + // With ssse3 and later abs() and sign() are preferred. static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { a = _mm_xor_si128(a, sign); @@ -62,11 +71,8 @@ static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, #endif // CONFIG_VP9_HIGHBITDEPTH } -// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to -// zbin to add 1 to the index in 'scan'. +// Scan 16 values for eob reference in scan. 
static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, - const __m128i zbin_mask0, - const __m128i zbin_mask1, const int16_t *scan, const int index, const __m128i zero) { const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); @@ -74,9 +80,6 @@ static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index)); __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8)); __m128i eob0, eob1; - // Add one to convert from indices to counts - scan0 = _mm_sub_epi16(scan0, zbin_mask0); - scan1 = _mm_sub_epi16(scan1, zbin_mask1); eob0 = _mm_andnot_si128(zero_coeff0, scan0); eob1 = _mm_andnot_si128(zero_coeff1, scan1); return _mm_max_epi16(eob0, eob1); diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libvpx/vpx_dsp/x86/quantize_ssse3.c index 9d2a88b7b..476230286 100644 --- a/libvpx/vpx_dsp/x86/quantize_ssse3.c +++ b/libvpx/vpx_dsp/x86/quantize_ssse3.c @@ -70,7 +70,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); // AC only loop. 
while (index < n_coeffs) { @@ -98,8 +98,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); index += 16; @@ -202,8 +201,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. @@ -249,8 +247,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8 + index); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/libvpx/vpx_dsp/x86/sad_avx2.c b/libvpx/vpx_dsp/x86/sad_avx2.c index 3b48acd51..29bedb0e6 100644 --- a/libvpx/vpx_dsp/x86/sad_avx2.c +++ b/libvpx/vpx_dsp/x86/sad_avx2.c @@ -14,7 +14,7 @@ #define FSAD64_H(h) \ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ + int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ @@ -35,8 +35,7 @@ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ } 
#define FSAD32_H(h) \ @@ -92,7 +91,7 @@ FSAD32 unsigned int vpx_sad64x##h##_avg_avx2( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ - int i, res; \ + int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ @@ -118,15 +117,14 @@ FSAD32 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ } #define FSADAVG32_H(h) \ unsigned int vpx_sad32x##h##_avg_avx2( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ - int i, res; \ + int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ @@ -156,8 +154,7 @@ FSAD32 sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ } #define FSADAVG64 \ diff --git a/libvpx/vpx_dsp/x86/subtract_avx2.c b/libvpx/vpx_dsp/x86/subtract_avx2.c new file mode 100644 index 000000000..4849581ed --- /dev/null +++ b/libvpx/vpx_dsp/x86/subtract_avx2.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr, + const uint8_t *src_ptr, + const uint8_t *pred_ptr) { + const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s)); + const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1)); + const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p)); + const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1)); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + const __m256i d_1 = _mm256_sub_epi16(s_1, p_1); + _mm256_storeu_si256((__m256i *)diff_ptr, d_0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1); +} + +static VPX_FORCE_INLINE void subtract_block_16xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr); + const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr); + const __m256i s_0 = _mm256_cvtepu8_epi16(s); + const __m256i p_0 = _mm256_cvtepu8_epi16(p); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + _mm256_storeu_si256((__m256i *)diff_ptr, d_0); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static VPX_FORCE_INLINE void subtract_block_32xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static VPX_FORCE_INLINE void subtract_block_64xn_avx2( + int rows, int16_t *diff_ptr, 
ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + switch (cols) { + case 16: + subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + case 32: + subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + case 64: + subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + default: + vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, + const uint8_t *src8_ptr, + ptrdiff_t src_stride, + const uint8_t *pred8_ptr, + ptrdiff_t pred_stride, int bd) { + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr); + (void)bd; + if (cols == 64) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16)); + const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32)); + const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16)); + const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32)); + const 
__m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + const __m256i d2 = _mm256_sub_epi16(s2, p2); + const __m256i d3 = _mm256_sub_epi16(s3, p3); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1); + _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2); + _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } while (--j != 0); + } else if (cols == 32) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } while (--j != 0); + } else if (cols == 16) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = + _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = + _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } else if (cols == 8) { + int j = rows; + do { + const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr); 
+ const __m128i s1 = + _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride)); + const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr); + const __m128i p1 = + _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride)); + const __m128i d0 = _mm_sub_epi16(s0, p0); + const __m128i d1 = _mm_sub_epi16(s1, p1); + _mm_storeu_si128((__m128i *)diff_ptr, d0); + _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } else { + int j = rows; + assert(cols == 4); + do { + const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr); + const __m128i s1 = + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr); + const __m128i p1 = + _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride)); + const __m128i d0 = _mm_sub_epi16(s0, p0); + const __m128i d1 = _mm_sub_epi16(s1, p1); + _mm_storel_epi64((__m128i *)diff_ptr, d0); + _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/x86/sum_squares_sse2.c b/libvpx/vpx_dsp/x86/sum_squares_sse2.c index 14f3b35c0..df6514b2c 100644 --- a/libvpx/vpx_dsp/x86/sum_squares_sse2.c +++ b/libvpx/vpx_dsp/x86/sum_squares_sse2.c @@ -33,7 +33,7 @@ uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { } else { // Generic case int r = size; - const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + const __m128i v_zext_mask_q = _mm_set_epi32(0, -1, 0, -1); __m128i v_acc_q = _mm_setzero_si128(); assert(size % 8 == 0); diff --git a/libvpx/vpx_dsp/x86/variance_avx2.c b/libvpx/vpx_dsp/x86/variance_avx2.c index 9232acbfb..35925d590 100644 --- a/libvpx/vpx_dsp/x86/variance_avx2.c +++ 
b/libvpx/vpx_dsp/x86/variance_avx2.c @@ -590,17 +590,20 @@ static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, return sum; } -static unsigned int sub_pixel_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, int height, unsigned int *sse) { +static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse) { return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, NULL, 0, 0, height, sse); } -static unsigned int sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *second_pred, - int second_stride, int height, unsigned int *sse) { +static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, + int second_stride, int height, + unsigned int *sse) { return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, second_pred, second_stride, 1, height, sse); } diff --git a/libvpx/vpx_dsp/x86/variance_sse2.c b/libvpx/vpx_dsp/x86/variance_sse2.c index a67c92aad..d6eb12da1 100644 --- a/libvpx/vpx_dsp/x86/variance_sse2.c +++ b/libvpx/vpx_dsp/x86/variance_sse2.c @@ -19,7 +19,7 @@ static INLINE unsigned int add32x4_sse2(__m128i val) { val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); - return _mm_cvtsi128_si32(val); + return (unsigned int)_mm_cvtsi128_si32(val); } unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) { @@ -85,7 +85,7 @@ static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_unpacklo_epi16(vsum, vsum); vsum = _mm_srai_epi32(vsum, 16); - *sum = add32x4_sse2(vsum); + *sum = 
(int)add32x4_sse2(vsum); } static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { @@ -97,7 +97,7 @@ static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { // Can handle 1024 pixels' diff sum (such as 32x32) static INLINE int sum_final_sse2(const __m128i sum) { const __m128i t = sum_to_32bit_sse2(sum); - return add32x4_sse2(t); + return (int)add32x4_sse2(t); } static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride, @@ -349,7 +349,7 @@ unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); - sum = add32x4_sse2(vsum); + sum = (int)add32x4_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } @@ -369,7 +369,7 @@ unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); - sum = add32x4_sse2(vsum); + sum = (int)add32x4_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } @@ -389,7 +389,7 @@ unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); - sum = add32x4_sse2(vsum); + sum = (int)add32x4_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); } diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c index 0cbd151dc..21a35ae3c 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -485,7 +485,7 @@ static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, // Saturate and convert to 8-bit words dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); src_ptr += src_stride; dst_ptr += dst_stride; @@ -589,8 +589,8 @@ static void 
vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero); // Save only half of the register (8 words) - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012); - *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012); + *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123); // Update the source by two rows src_ptr += src_stride_unrolled; diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 6f2983a4b..c7d880860 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -227,6 +227,9 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]); } + // The output_height is always a multiple of two. + assert(!(output_height & 1)); + for (i = output_height; i > 1; i -= 2) { __m256i srcRegHead2, srcRegHead3; @@ -282,35 +285,6 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( s2[2] = s2[3]; srcRegHead1 = srcRegHead3; } - - // if the number of strides is odd. 
- // process only 16 bytes - if (i > 0) { - // load the last 16 bytes - const __m128i srcRegHead2 = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - - // merge the last 2 results together - s1[0] = _mm256_castsi128_si256( - _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); - s2[0] = _mm256_castsi128_si256( - _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); - - outReg1 = convolve8_8_avx2(s1, f); - outReg2 = convolve8_8_avx2(s2, f); - - // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane - // contain the first and second convolve result respectively - outReg1 = _mm_packus_epi16(outReg1, outReg2); - - // average if necessary - if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - } - - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, outReg1); - } } static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, @@ -798,7 +772,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, // Pack to 8-bits dst = _mm_packus_epi16(dst, _mm_setzero_si128()); - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst); } } diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index ed46d6245..4ea2752d3 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -580,7 +580,7 @@ static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr, // Pack to 8-bits dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); src_ptr += src_stride; dst_ptr += dst_stride; @@ -666,8 +666,8 @@ static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr, reg_1 = _mm_packus_epi16(reg_1, reg_1); // Save the result - *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0); - *((uint32_t 
*)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0); + *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1); // Update the source by two rows src_ptr += src_stride_unrolled; diff --git a/libvpx/vpx_ports/compiler_attributes.h b/libvpx/vpx_ports/compiler_attributes.h index 354352016..4b468749b 100644 --- a/libvpx/vpx_ports/compiler_attributes.h +++ b/libvpx/vpx_ports/compiler_attributes.h @@ -29,13 +29,23 @@ #endif // __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) #if defined(__clang__) && __has_attribute(no_sanitize) +// Both of these have defined behavior and are used in certain operations or +// optimizations thereof. There are cases where an overflow may be unintended, +// however, so use of these attributes should be done with care. #define VPX_NO_UNSIGNED_OVERFLOW_CHECK \ __attribute__((no_sanitize("unsigned-integer-overflow"))) -#endif +#if __clang_major__ >= 12 +#define VPX_NO_UNSIGNED_SHIFT_CHECK \ + __attribute__((no_sanitize("unsigned-shift-base"))) +#endif // __clang__ >= 12 +#endif // __clang__ #ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK #define VPX_NO_UNSIGNED_OVERFLOW_CHECK #endif +#ifndef VPX_NO_UNSIGNED_SHIFT_CHECK +#define VPX_NO_UNSIGNED_SHIFT_CHECK +#endif //------------------------------------------------------------------------------ // Variable attributes. 
diff --git a/libvpx/vpxenc.c b/libvpx/vpxenc.c index 7eff97b13..61672acad 100644 --- a/libvpx/vpxenc.c +++ b/libvpx/vpxenc.c @@ -524,9 +524,12 @@ static const arg_def_t row_mt = static const arg_def_t disable_loopfilter = ARG_DEF(NULL, "disable-loopfilter", 1, - "Control Loopfilter in VP9\n" + "Control Loopfilter in VP9:\n" + " " "0: Loopfilter on for all frames (default)\n" + " " "1: Loopfilter off for non reference frames\n" + " " "2: Loopfilter off for all frames"); #endif diff --git a/libvpx/webmdec.h b/libvpx/webmdec.h index d8618b07d..6ae7ee16d 100644 --- a/libvpx/webmdec.h +++ b/libvpx/webmdec.h @@ -27,7 +27,7 @@ struct WebmInputContext { const void *block; int block_frame_index; int video_track_index; - uint64_t timestamp_ns; + int64_t timestamp_ns; int is_key_frame; int reached_eos; }; diff --git a/libvpx/y4menc.c b/libvpx/y4menc.c index 02b729e5b..187798127 100644 --- a/libvpx/y4menc.c +++ b/libvpx/y4menc.c @@ -17,39 +17,34 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height, const char *color; switch (bit_depth) { case 8: - color = fmt == VPX_IMG_FMT_I444 - ? "C444\n" - : fmt == VPX_IMG_FMT_I422 ? "C422\n" : "C420jpeg\n"; + color = fmt == VPX_IMG_FMT_I444 ? "C444\n" + : fmt == VPX_IMG_FMT_I422 ? "C422\n" + : "C420jpeg\n"; break; case 9: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p9 XYSCSS=444P9\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" - : "C420p9 XYSCSS=420P9\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p9 XYSCSS=444P9\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" + : "C420p9 XYSCSS=420P9\n"; break; case 10: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p10 XYSCSS=444P10\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" - : "C420p10 XYSCSS=420P10\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p10 XYSCSS=444P10\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" + : "C420p10 XYSCSS=420P10\n"; break; case 12: - color = fmt == VPX_IMG_FMT_I44416 - ? 
"C444p12 XYSCSS=444P12\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" - : "C420p12 XYSCSS=420P12\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p12 XYSCSS=444P12\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" + : "C420p12 XYSCSS=420P12\n"; break; case 14: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p14 XYSCSS=444P14\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" - : "C420p14 XYSCSS=420P14\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p14 XYSCSS=444P14\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" + : "C420p14 XYSCSS=420P14\n"; break; case 16: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p16 XYSCSS=444P16\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n" - : "C420p16 XYSCSS=420P16\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p16 XYSCSS=444P16\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n" + : "C420p16 XYSCSS=420P16\n"; break; default: color = NULL; assert(0); } diff --git a/libvpx/y4minput.c b/libvpx/y4minput.c index 7d3c03a7f..745e2f1cd 100644 --- a/libvpx/y4minput.c +++ b/libvpx/y4minput.c @@ -21,12 +21,13 @@ // Reads 'size' bytes from 'file' into 'buf' with some fault tolerance. // Returns true on success. 
static int file_read(void *buf, size_t size, FILE *file) { - const int kMaxRetries = 5; - int retry_count = 0; - int file_error; + const int kMaxTries = 5; + int try_count = 0; + int file_error = 0; size_t len = 0; - do { + while (!feof(file) && len < size && try_count < kMaxTries) { const size_t n = fread((uint8_t *)buf + len, 1, size - len, file); + ++try_count; len += n; file_error = ferror(file); if (file_error) { @@ -39,13 +40,13 @@ static int file_read(void *buf, size_t size, FILE *file) { return 0; } } - } while (!feof(file) && len < size && ++retry_count < kMaxRetries); + } if (!feof(file) && len != size) { fprintf(stderr, "Error reading file: %u of %u bytes read," - " error: %d, retries: %d, %d: %s\n", - (uint32_t)len, (uint32_t)size, file_error, retry_count, errno, + " error: %d, tries: %d, %d: %s\n", + (uint32_t)len, (uint32_t)size, file_error, try_count, errno, strerror(errno)); } return len == size; diff --git a/libwebm/mkvparser/mkvparser.cc b/libwebm/mkvparser/mkvparser.cc index 412e6a52c..868afcb3e 100644 --- a/libwebm/mkvparser/mkvparser.cc +++ b/libwebm/mkvparser/mkvparser.cc @@ -54,9 +54,9 @@ Type* SafeArrayAlloc(unsigned long long num_elements, void GetVersion(int& major, int& minor, int& build, int& revision) { major = 1; - minor = 0; - build = 0; - revision = 30; + minor = 1; + build = 1; + revision = 0; } long long ReadUInt(IMkvReader* pReader, long long pos, long& len) { @@ -298,7 +298,7 @@ long UnserializeInt(IMkvReader* pReader, long long pos, long long size, if (status < 0) return status; - unsigned long long result = first_byte; + unsigned long long result = static_cast<unsigned long long>(first_byte); ++pos; for (long i = 1; i < size; ++i) { @@ -1502,8 +1502,8 @@ long SeekHead::Parse() { // first count the seek head entries - int entry_count = 0; - int void_element_count = 0; + long long entry_count = 0; + long long void_element_count = 0; while (pos < stop) { long long id, size; @@ -1513,10 +1513,15 @@ long SeekHead::Parse() { if 
(status < 0) // error return status; - if (id == libwebm::kMkvSeek) + if (id == libwebm::kMkvSeek) { ++entry_count; - else if (id == libwebm::kMkvVoid) + if (entry_count > INT_MAX) + return E_PARSE_FAILED; + } else if (id == libwebm::kMkvVoid) { ++void_element_count; + if (void_element_count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload @@ -1528,14 +1533,15 @@ long SeekHead::Parse() { return E_FILE_FORMAT_INVALID; if (entry_count > 0) { - m_entries = new (std::nothrow) Entry[entry_count]; + m_entries = new (std::nothrow) Entry[static_cast<size_t>(entry_count)]; if (m_entries == NULL) return -1; } if (void_element_count > 0) { - m_void_elements = new (std::nothrow) VoidElement[void_element_count]; + m_void_elements = + new (std::nothrow) VoidElement[static_cast<size_t>(void_element_count)]; if (m_void_elements == NULL) return -1; @@ -1582,13 +1588,13 @@ long SeekHead::Parse() { ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries); assert(count_ >= 0); - assert(count_ <= entry_count); + assert(static_cast<long long>(count_) <= entry_count); m_entry_count = static_cast<int>(count_); count_ = ptrdiff_t(pVoidElement - m_void_elements); assert(count_ >= 0); - assert(count_ <= void_element_count); + assert(static_cast<long long>(count_) <= void_element_count); m_void_element_count = static_cast<int>(count_); @@ -2299,7 +2305,7 @@ bool CuePoint::Load(IMkvReader* pReader) { long long pos = pos_; // First count number of track positions - + unsigned long long track_positions_count = 0; while (pos < stop) { long len; @@ -2323,12 +2329,17 @@ bool CuePoint::Load(IMkvReader* pReader) { if (id == libwebm::kMkvCueTime) m_timecode = UnserializeUInt(pReader, pos, size); - else if (id == libwebm::kMkvCueTrackPositions) - ++m_track_positions_count; + else if (id == libwebm::kMkvCueTrackPositions) { + ++track_positions_count; + if (track_positions_count > UINT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload } + m_track_positions_count = 
static_cast<size_t>(track_positions_count); + if (m_timecode < 0 || m_track_positions_count <= 0) { return false; } @@ -2421,7 +2432,7 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_, pos += size; // consume payload } - if ((m_pos < 0) || (m_track <= 0)) { + if ((m_pos < 0) || (m_track <= 0) || (m_block < 0) || (m_block > LONG_MAX)) { return false; } @@ -4194,8 +4205,8 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, const long long stop = start + size; // Count ContentCompression and ContentEncryption elements. - int compression_count = 0; - int encryption_count = 0; + long long compression_count = 0; + long long encryption_count = 0; while (pos < stop) { long long id, size; @@ -4203,11 +4214,17 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, if (status < 0) // error return status; - if (id == libwebm::kMkvContentCompression) + if (id == libwebm::kMkvContentCompression) { ++compression_count; + if (compression_count > INT_MAX) + return E_PARSE_FAILED; + } - if (id == libwebm::kMkvContentEncryption) + if (id == libwebm::kMkvContentEncryption) { ++encryption_count; + if (encryption_count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload if (pos > stop) @@ -4218,16 +4235,16 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, return -1; if (compression_count > 0) { - compression_entries_ = - new (std::nothrow) ContentCompression*[compression_count]; + compression_entries_ = new (std::nothrow) + ContentCompression*[static_cast<size_t>(compression_count)]; if (!compression_entries_) return -1; compression_entries_end_ = compression_entries_; } if (encryption_count > 0) { - encryption_entries_ = - new (std::nothrow) ContentEncryption*[encryption_count]; + encryption_entries_ = new (std::nothrow) + ContentEncryption*[static_cast<size_t>(encryption_count)]; if (!encryption_entries_) { delete[] compression_entries_; 
compression_entries_ = NULL; @@ -4918,7 +4935,7 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) { const long long stop = start + size; // Count ContentEncoding elements. - int count = 0; + long long count = 0; while (pos < stop) { long long id, size; const long status = ParseElementHeader(pReader, pos, stop, id, size); @@ -4926,8 +4943,11 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) { return status; // pos now designates start of element - if (id == libwebm::kMkvContentEncoding) + if (id == libwebm::kMkvContentEncoding) { ++count; + if (count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload if (pos > stop) @@ -4937,7 +4957,8 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) { if (count <= 0) return -1; - content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count]; + content_encoding_entries_ = + new (std::nothrow) ContentEncoding*[static_cast<size_t>(count)]; if (!content_encoding_entries_) return -1; @@ -5653,7 +5674,7 @@ long Tracks::Parse() { const long long stop = m_start + m_size; IMkvReader* const pReader = m_pSegment->m_pReader; - int count = 0; + long long count = 0; long long pos = m_start; while (pos < stop) { @@ -5667,8 +5688,11 @@ long Tracks::Parse() { if (size == 0) // weird continue; - if (id == libwebm::kMkvTrackEntry) + if (id == libwebm::kMkvTrackEntry) { ++count; + if (count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload if (pos > stop) @@ -5681,7 +5705,7 @@ long Tracks::Parse() { if (count <= 0) return 0; // success - m_trackEntries = new (std::nothrow) Track*[count]; + m_trackEntries = new (std::nothrow) Track*[static_cast<size_t>(count)]; if (m_trackEntries == NULL) return -1; |