improve handling at end of buffer

A prior change reduced the number of iterations through the input buffer to prevent the NEON operations from overrunning the end of the locally allocated buffer. Although that avoided the overrun, it generated bad results. Here we instead extend the locally allocated buffers enough that the original iteration count won't overrun. Some pre-existing bit-exact issues remain. Bug: 136616344 Test: CTS + bit-exact cross-checks.
author: Ray Essick <essick@google.com> 2019-09-04 09:09:52 -0700
committer: Ray Essick <essick@google.com> 2019-09-04 09:14:11 -0700
commit: aae866aed579da4e1c3299a1e9b94a1713a0decb (patch)
tree: d63135df0e611e34319dc4f50f04fa715a1a17bf
parent: c474f1ce1e71d6e05797832eb70e959067435edb (diff)
download: libopus-aae866aed579da4e1c3299a1e9b94a1713a0decb.tar.gz
1 files changed, 8 insertions, 3 deletions
diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
index ee06f986..6f3be025 100644
--- a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
+++ b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
@@ -84,7 +84,9 @@ void silk_warped_autocorrelation_FIX_neon(
         silk_assert( ( order & 1 ) == 0 );
         silk_assert( 2 * QS - QC >= 0 );
 
-        ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 );
+        /* The additional +4 is to ensure a later vld1q_s32 call does not overflow.               */
+        /* Strictly, only +3 is needed but +4 simplifies initialization using the 4x32 neon load. */
+        ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER + 4, opus_int32 );
 
         input_QS = input_QST;
         /* input_QS has zero paddings in the beginning and end. */
@@ -121,6 +123,8 @@ void silk_warped_autocorrelation_FIX_neon(
         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
         input_QS += 4;
         vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
+        input_QS += 4;
+        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
         input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT;
 
         /* The following loop runs ( length + order ) times, with ( order ) extra epilogues.                  */
@@ -153,7 +157,8 @@ void silk_warped_autocorrelation_FIX_neon(
             opus_int o = orderT;
             int32x4_t state_QS_s32x4[ 3 ][ 2 ];
 
-            ALLOC( state, length + orderT, opus_int32 );
+            /* The additional +4 is to ensure a later vld1q_s32 call does not overflow. */
+            ALLOC( state, length + order + 4, opus_int32 );
             state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 );
 
             /* Calculate 8 taps of all inputs in each loop. */
@@ -172,7 +177,7 @@ void silk_warped_autocorrelation_FIX_neon(
                     state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 );
                     state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                     state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ];
-                } while( ++n < ( length + order - 3) );
+                } while( ++n < ( length + order ) );
                 in = state;
                 o -= 8;
             } while( o > 4 );
author	Ray Essick <essick@google.com>	2019-09-04 09:09:52 -0700
committer	Ray Essick <essick@google.com>	2019-09-04 09:14:11 -0700
commit	aae866aed579da4e1c3299a1e9b94a1713a0decb (patch)
tree	d63135df0e611e34319dc4f50f04fa715a1a17bf
parent	c474f1ce1e71d6e05797832eb70e959067435edb (diff)
download	libopus-aae866aed579da4e1c3299a1e9b94a1713a0decb.tar.gz