diff options
-rw-r--r-- | bench/s16-vlshift.cc | 2 | ||||
-rw-r--r-- | src/s16-vlshift/gen/neon-x16.c | 10 | ||||
-rw-r--r-- | src/s16-vlshift/gen/neon-x24.c | 10 | ||||
-rw-r--r-- | src/s16-vlshift/gen/neon-x32.c | 10 | ||||
-rw-r--r-- | src/s16-vlshift/gen/neon-x8.c | 10 | ||||
-rw-r--r-- | src/s16-vlshift/gen/scalar-x1.c | 9 | ||||
-rw-r--r-- | src/s16-vlshift/gen/scalar-x2.c | 9 | ||||
-rw-r--r-- | src/s16-vlshift/gen/scalar-x3.c | 9 | ||||
-rw-r--r-- | src/s16-vlshift/gen/scalar-x4.c | 9 | ||||
-rw-r--r-- | src/s16-vlshift/neon.c.in | 10 | ||||
-rw-r--r-- | src/s16-vlshift/scalar.c.in | 9 | ||||
-rw-r--r-- | src/xnnpack/microfnptr.h | 6 | ||||
-rw-r--r-- | src/xnnpack/vlshift.h | 6 | ||||
-rw-r--r-- | test/vlshift-microkernel-tester.h | 2 |
14 files changed, 53 insertions, 58 deletions
diff --git a/bench/s16-vlshift.cc b/bench/s16-vlshift.cc index 3d3f23f37..749b2ee30 100644 --- a/bench/s16-vlshift.cc +++ b/bench/s16-vlshift.cc @@ -36,7 +36,7 @@ void vlshift( std::iota(output.begin(), output.end(), 1); for (auto _ : state) { - vlshift(batch, input.data(), uint32_t(4), output.data()); + vlshift(batch, input.data(), output.data(), 4 /* shift */); } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); diff --git a/src/s16-vlshift/gen/neon-x16.c b/src/s16-vlshift/gen/neon-x16.c index a2b3d9c83..2e2ca6a16 100644 --- a/src/s16-vlshift/gen/neon-x16.c +++ b/src/s16-vlshift/gen/neon-x16.c @@ -20,15 +20,15 @@ void xnn_s16_vlshift_ukernel__neon_x16( size_t batch, const int16_t* input, - uint32_t shift, - int16_t* output) { - + int16_t* output, + uint32_t shift) +{ assert(batch > 0); assert(input != NULL); - assert(shift < 16); assert(output != NULL); + assert(shift < 16); - const int16x8_t vshift = vdupq_n_s16(shift); + const int16x8_t vshift = vdupq_n_s16((int16_t) shift); for (; batch >= 16; batch -= 16) { const int16x8_t vi0 = vld1q_s16(input); input += 8; diff --git a/src/s16-vlshift/gen/neon-x24.c b/src/s16-vlshift/gen/neon-x24.c index 91f8cb690..1e66c8f61 100644 --- a/src/s16-vlshift/gen/neon-x24.c +++ b/src/s16-vlshift/gen/neon-x24.c @@ -20,15 +20,15 @@ void xnn_s16_vlshift_ukernel__neon_x24( size_t batch, const int16_t* input, - uint32_t shift, - int16_t* output) { - + int16_t* output, + uint32_t shift) +{ assert(batch > 0); assert(input != NULL); - assert(shift < 16); assert(output != NULL); + assert(shift < 16); - const int16x8_t vshift = vdupq_n_s16(shift); + const int16x8_t vshift = vdupq_n_s16((int16_t) shift); for (; batch >= 24; batch -= 24) { const int16x8_t vi0 = vld1q_s16(input); input += 8; diff --git a/src/s16-vlshift/gen/neon-x32.c b/src/s16-vlshift/gen/neon-x32.c index 2c50a9067..0cd66636a 100644 --- a/src/s16-vlshift/gen/neon-x32.c +++ b/src/s16-vlshift/gen/neon-x32.c @@ -20,15 +20,15 @@ void xnn_s16_vlshift_ukernel__neon_x32( size_t batch, const int16_t* input, - uint32_t shift, - int16_t* output) { - + int16_t* output, + uint32_t shift) +{ assert(batch > 0); assert(input != NULL); - assert(shift < 16); assert(output != NULL); + assert(shift < 16); - const int16x8_t vshift = vdupq_n_s16(shift); + const int16x8_t vshift = vdupq_n_s16((int16_t) shift); for (; batch >= 32; batch -= 32) { const int16x8_t vi0 = vld1q_s16(input); input += 8; diff --git a/src/s16-vlshift/gen/neon-x8.c b/src/s16-vlshift/gen/neon-x8.c index d930b9866..62290c910 100644 --- a/src/s16-vlshift/gen/neon-x8.c +++ b/src/s16-vlshift/gen/neon-x8.c @@ -20,15 +20,15 @@ void xnn_s16_vlshift_ukernel__neon_x8( size_t batch, const int16_t* input, - uint32_t shift, - int16_t* output) { - + int16_t* output, + uint32_t shift) +{ assert(batch > 0); assert(input != NULL); - assert(shift < 16); assert(output != NULL); + assert(shift < 16); - const int16x8_t vshift = vdupq_n_s16(shift); + const int16x8_t vshift = vdupq_n_s16((int16_t) shift); // Remainder of full vectors diff --git a/src/s16-vlshift/gen/scalar-x1.c b/src/s16-vlshift/gen/scalar-x1.c index bb40fe22f..fbc008be0 100644 --- a/src/s16-vlshift/gen/scalar-x1.c +++ b/src/s16-vlshift/gen/scalar-x1.c @@ -18,14 +18,13 @@ void xnn_s16_vlshift_ukernel__scalar_x1( size_t batch, const int16_t* input, - uint32_t shift, - int16_t* output) { - + int16_t* output, + uint32_t shift) +{ assert(batch != 0); assert(input != NULL); - assert(shift < 16); assert(output != NULL); - + assert(shift < 16); if XNN_UNLIKELY(batch != 0) { do { diff --git a/src/s16-vlshift/gen/scalar-x2.c b/src/s16-vlshift/gen/scalar-x2.c index 13ffac72f..d9052d7bb 100644 --- a/src/s16-vlshift/gen/scalar-x2.c +++ b/src/s16-vlshift/gen/scalar-x2.c @@ -18,13 +18,13 @@ void xnn_s16_vlshift_ukernel__scalar_x2( size_t batch, const int16_t* input, - uint32_t shift, - int16_t* output) { - + int16_t* output, + uint32_t shift) +{ assert(batch != 0); assert(input != NULL); - assert(shift < 16); assert(output != NULL); + assert(shift < 16); for (; batch >= 2; batch -= 2) { const uint16_t vi0 = (uint16_t) input[0]; @@ -38,7 +38,6 @@ void xnn_s16_vlshift_ukernel__scalar_x2( output[1] = (int16_t) vout1; output += 2; } - if XNN_UNLIKELY(batch != 0) { do { const uint16_t vi = (uint16_t) *input++; diff --git a/src/s16-vlshift/gen/scalar-x3.c b/src/s16-vlshift/gen/scalar-x3.c index 68b729304..55ccc626c 100644 --- a/src/s16-vlshift/gen/scalar-x3.c +++ b/src/s16-vlshift/gen/scalar-x3.c @@ -18,13 +18,13 @@ void xnn_s16_vlshift_ukernel__scalar_x3( size_t batch, const int16_t* input, - uint32_t shift, - int16_t* output) { - + int16_t* output, + uint32_t shift) +{ assert(batch != 0); assert(input != NULL); - assert(shift < 16); assert(output != NULL); + assert(shift < 16); for (; batch >= 3; batch -= 3) { const uint16_t vi0 = (uint16_t) input[0]; @@ -41,7 +41,6 @@ void xnn_s16_vlshift_ukernel__scalar_x3( output[2] = (int16_t) vout2; output += 3; } - if XNN_UNLIKELY(batch != 0) { do { const uint16_t vi = (uint16_t) *input++; diff --git a/src/s16-vlshift/gen/scalar-x4.c b/src/s16-vlshift/gen/scalar-x4.c index 3e725b8e0..326c7b01c 100644 --- a/src/s16-vlshift/gen/scalar-x4.c +++ b/src/s16-vlshift/gen/scalar-x4.c @@ -18,13 +18,13 @@ void xnn_s16_vlshift_ukernel__scalar_x4( size_t batch, const int16_t* input, - uint32_t shift, - int16_t* output) { - + int16_t* output, + uint32_t shift) +{ assert(batch != 0); assert(input != NULL); - assert(shift < 16); assert(output != NULL); + assert(shift < 16); for (; batch >= 4; batch -= 4) { const uint16_t vi0 = (uint16_t) input[0]; @@ -44,7 +44,6 @@ void xnn_s16_vlshift_ukernel__scalar_x4( output[3] = (int16_t) vout3; output += 4; } - if XNN_UNLIKELY(batch != 0) { do { const uint16_t vi = (uint16_t) *input++; diff --git a/src/s16-vlshift/neon.c.in b/src/s16-vlshift/neon.c.in index 46513cd07..cec559653 100644 --- a/src/s16-vlshift/neon.c.in +++ b/src/s16-vlshift/neon.c.in @@ -19,15 +19,15 @@ $SIMD_TILE = BATCH_TILE // 8 void xnn_s16_vlshift_ukernel__neon_x${BATCH_TILE}( size_t batch, const int16_t* input, - uint32_t shift, - int16_t* output) { - + int16_t* output, + uint32_t shift) +{ assert(batch > 0); assert(input != NULL); - assert(shift < 16); assert(output != NULL); + assert(shift < 16); - const int16x8_t vshift = vdupq_n_s16(shift); + const int16x8_t vshift = vdupq_n_s16((int16_t) shift); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE}; batch -= ${BATCH_TILE}) { diff --git a/src/s16-vlshift/scalar.c.in b/src/s16-vlshift/scalar.c.in index 1f519380c..e59918d20 100644 --- a/src/s16-vlshift/scalar.c.in +++ b/src/s16-vlshift/scalar.c.in @@ -15,13 +15,13 @@ $assert BATCH_TILE >= 1 void xnn_s16_vlshift_ukernel__scalar_x${BATCH_TILE}( size_t batch, const int16_t* input, - uint32_t shift, - int16_t* output) { - + int16_t* output, + uint32_t shift) +{ assert(batch != 0); assert(input != NULL); - assert(shift < 16); assert(output != NULL); + assert(shift < 16); $if BATCH_TILE > 1: for (; batch >= ${BATCH_TILE}; batch -= ${BATCH_TILE}) { @@ -36,7 +36,6 @@ void xnn_s16_vlshift_ukernel__scalar_x${BATCH_TILE}( output[${C}] = (int16_t) vout${C}; output += ${BATCH_TILE}; } - if XNN_UNLIKELY(batch != 0) { do { const uint16_t vi = (uint16_t) *input++; diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index 6499eba53..feeb633d8 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -1656,10 +1656,10 @@ typedef void (*xnn_u32_filterbank_subtract_ukernel_function)( uint32_t* output); typedef void (*xnn_s16_vlshift_ukernel_function)( - size_t batch_size, + size_t batch, const int16_t* input, - uint32_t shift, - int16_t* output); + int16_t* output, + uint32_t shift); typedef void (*xnn_cs16_vsquareabs_ukernel_function)( size_t batch_size, diff --git a/src/xnnpack/vlshift.h b/src/xnnpack/vlshift.h index 31f3be488..cba428f67 100644 --- a/src/xnnpack/vlshift.h +++ b/src/xnnpack/vlshift.h @@ -17,10 +17,10 @@ extern "C" { #define DECLARE_S16_VLSHIFT_UKERNEL_FUNCTION(fn_name) \ XNN_INTERNAL void fn_name( \ - size_t batch_size, \ + size_t batch, \ const int16_t* input, \ - uint32_t shift, \ - int16_t* output); + int16_t* output, \ + uint32_t shift); DECLARE_S16_VLSHIFT_UKERNEL_FUNCTION(xnn_s16_vlshift_ukernel__neon_x8) diff --git a/test/vlshift-microkernel-tester.h b/test/vlshift-microkernel-tester.h index 6a65d0d26..7772281b1 100644 --- a/test/vlshift-microkernel-tester.h +++ b/test/vlshift-microkernel-tester.h @@ -82,7 +82,7 @@ class VLShiftMicrokernelTester { } // Call optimized micro-kernel. - vlshift(batch(), x_data, shift(), y.data()); + vlshift(batch(), x_data, y.data(), shift()); // Verify results. for (size_t n = 0; n < batch(); n++) { |