14 files changed, 53 insertions, 58 deletions
diff --git a/bench/s16-vlshift.cc b/bench/s16-vlshift.cc
index 3d3f23f37..749b2ee30 100644
--- a/bench/s16-vlshift.cc
+++ b/bench/s16-vlshift.cc
@@ -36,7 +36,7 @@ void vlshift(
   std::iota(output.begin(), output.end(), 1);
 
   for (auto _ : state) {
-    vlshift(batch, input.data(), uint32_t(4), output.data());
+    vlshift(batch, input.data(), output.data(), 4 /* shift */);
   }
 
   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
diff --git a/src/s16-vlshift/gen/neon-x16.c b/src/s16-vlshift/gen/neon-x16.c
index a2b3d9c83..2e2ca6a16 100644
--- a/src/s16-vlshift/gen/neon-x16.c
+++ b/src/s16-vlshift/gen/neon-x16.c
@@ -20,15 +20,15 @@
 void xnn_s16_vlshift_ukernel__neon_x16(
     size_t batch,
     const int16_t* input,
-    uint32_t shift,
-    int16_t* output) {
-
+    int16_t* output,
+    uint32_t shift)
+{
   assert(batch > 0);
   assert(input != NULL);
-  assert(shift < 16);
   assert(output != NULL);
+  assert(shift < 16);
 
-  const int16x8_t vshift = vdupq_n_s16(shift);
+  const int16x8_t vshift = vdupq_n_s16((int16_t) shift);
 
   for (; batch >= 16; batch -= 16) {
     const int16x8_t vi0 = vld1q_s16(input); input += 8;
diff --git a/src/s16-vlshift/gen/neon-x24.c b/src/s16-vlshift/gen/neon-x24.c
index 91f8cb690..1e66c8f61 100644
--- a/src/s16-vlshift/gen/neon-x24.c
+++ b/src/s16-vlshift/gen/neon-x24.c
@@ -20,15 +20,15 @@
 void xnn_s16_vlshift_ukernel__neon_x24(
     size_t batch,
     const int16_t* input,
-    uint32_t shift,
-    int16_t* output) {
-
+    int16_t* output,
+    uint32_t shift)
+{
   assert(batch > 0);
   assert(input != NULL);
-  assert(shift < 16);
   assert(output != NULL);
+  assert(shift < 16);
 
-  const int16x8_t vshift = vdupq_n_s16(shift);
+  const int16x8_t vshift = vdupq_n_s16((int16_t) shift);
 
   for (; batch >= 24; batch -= 24) {
     const int16x8_t vi0 = vld1q_s16(input); input += 8;
diff --git a/src/s16-vlshift/gen/neon-x32.c b/src/s16-vlshift/gen/neon-x32.c
index 2c50a9067..0cd66636a 100644
--- a/src/s16-vlshift/gen/neon-x32.c
+++ b/src/s16-vlshift/gen/neon-x32.c
@@ -20,15 +20,15 @@
 void xnn_s16_vlshift_ukernel__neon_x32(
     size_t batch,
     const int16_t* input,
-    uint32_t shift,
-    int16_t* output) {
-
+    int16_t* output,
+    uint32_t shift)
+{
   assert(batch > 0);
   assert(input != NULL);
-  assert(shift < 16);
   assert(output != NULL);
+  assert(shift < 16);
 
-  const int16x8_t vshift = vdupq_n_s16(shift);
+  const int16x8_t vshift = vdupq_n_s16((int16_t) shift);
 
   for (; batch >= 32; batch -= 32) {
     const int16x8_t vi0 = vld1q_s16(input); input += 8;
diff --git a/src/s16-vlshift/gen/neon-x8.c b/src/s16-vlshift/gen/neon-x8.c
index d930b9866..62290c910 100644
--- a/src/s16-vlshift/gen/neon-x8.c
+++ b/src/s16-vlshift/gen/neon-x8.c
@@ -20,15 +20,15 @@
 void xnn_s16_vlshift_ukernel__neon_x8(
     size_t batch,
     const int16_t* input,
-    uint32_t shift,
-    int16_t* output) {
-
+    int16_t* output,
+    uint32_t shift)
+{
   assert(batch > 0);
   assert(input != NULL);
-  assert(shift < 16);
   assert(output != NULL);
+  assert(shift < 16);
 
-  const int16x8_t vshift = vdupq_n_s16(shift);
+  const int16x8_t vshift = vdupq_n_s16((int16_t) shift);
 
 
   // Remainder of full vectors
diff --git a/src/s16-vlshift/gen/scalar-x1.c b/src/s16-vlshift/gen/scalar-x1.c
index bb40fe22f..fbc008be0 100644
--- a/src/s16-vlshift/gen/scalar-x1.c
+++ b/src/s16-vlshift/gen/scalar-x1.c
@@ -18,14 +18,13 @@
 void xnn_s16_vlshift_ukernel__scalar_x1(
     size_t batch,
     const int16_t* input,
-    uint32_t shift,
-    int16_t* output) {
-
+    int16_t* output,
+    uint32_t shift)
+{
   assert(batch != 0);
   assert(input != NULL);
-  assert(shift < 16);
   assert(output != NULL);
-
+  assert(shift < 16);
 
  if XNN_UNLIKELY(batch != 0) {
    do {
diff --git a/src/s16-vlshift/gen/scalar-x2.c b/src/s16-vlshift/gen/scalar-x2.c
index 13ffac72f..d9052d7bb 100644
--- a/src/s16-vlshift/gen/scalar-x2.c
+++ b/src/s16-vlshift/gen/scalar-x2.c
@@ -18,13 +18,13 @@
 void xnn_s16_vlshift_ukernel__scalar_x2(
     size_t batch,
     const int16_t* input,
-    uint32_t shift,
-    int16_t* output) {
-
+    int16_t* output,
+    uint32_t shift)
+{
   assert(batch != 0);
   assert(input != NULL);
-  assert(shift < 16);
   assert(output != NULL);
+  assert(shift < 16);
 
   for (; batch >= 2; batch -= 2) {
     const uint16_t vi0 = (uint16_t) input[0];
@@ -38,7 +38,6 @@ void xnn_s16_vlshift_ukernel__scalar_x2(
     output[1] = (int16_t) vout1;
     output += 2;
   }
-
  if XNN_UNLIKELY(batch != 0) {
    do {
      const uint16_t vi = (uint16_t) *input++;
diff --git a/src/s16-vlshift/gen/scalar-x3.c b/src/s16-vlshift/gen/scalar-x3.c
index 68b729304..55ccc626c 100644
--- a/src/s16-vlshift/gen/scalar-x3.c
+++ b/src/s16-vlshift/gen/scalar-x3.c
@@ -18,13 +18,13 @@
 void xnn_s16_vlshift_ukernel__scalar_x3(
     size_t batch,
     const int16_t* input,
-    uint32_t shift,
-    int16_t* output) {
-
+    int16_t* output,
+    uint32_t shift)
+{
   assert(batch != 0);
   assert(input != NULL);
-  assert(shift < 16);
   assert(output != NULL);
+  assert(shift < 16);
 
   for (; batch >= 3; batch -= 3) {
     const uint16_t vi0 = (uint16_t) input[0];
@@ -41,7 +41,6 @@ void xnn_s16_vlshift_ukernel__scalar_x3(
     output[2] = (int16_t) vout2;
     output += 3;
   }
-
  if XNN_UNLIKELY(batch != 0) {
    do {
      const uint16_t vi = (uint16_t) *input++;
diff --git a/src/s16-vlshift/gen/scalar-x4.c b/src/s16-vlshift/gen/scalar-x4.c
index 3e725b8e0..326c7b01c 100644
--- a/src/s16-vlshift/gen/scalar-x4.c
+++ b/src/s16-vlshift/gen/scalar-x4.c
@@ -18,13 +18,13 @@
 void xnn_s16_vlshift_ukernel__scalar_x4(
     size_t batch,
     const int16_t* input,
-    uint32_t shift,
-    int16_t* output) {
-
+    int16_t* output,
+    uint32_t shift)
+{
   assert(batch != 0);
   assert(input != NULL);
-  assert(shift < 16);
   assert(output != NULL);
+  assert(shift < 16);
 
   for (; batch >= 4; batch -= 4) {
     const uint16_t vi0 = (uint16_t) input[0];
@@ -44,7 +44,6 @@ void xnn_s16_vlshift_ukernel__scalar_x4(
     output[3] = (int16_t) vout3;
     output += 4;
   }
-
  if XNN_UNLIKELY(batch != 0) {
    do {
      const uint16_t vi = (uint16_t) *input++;
diff --git a/src/s16-vlshift/neon.c.in b/src/s16-vlshift/neon.c.in
index 46513cd07..cec559653 100644
--- a/src/s16-vlshift/neon.c.in
+++ b/src/s16-vlshift/neon.c.in
@@ -19,15 +19,15 @@ $SIMD_TILE = BATCH_TILE // 8
 void xnn_s16_vlshift_ukernel__neon_x${BATCH_TILE}(
     size_t batch,
     const int16_t* input,
-    uint32_t shift,
-    int16_t* output) {
-
+    int16_t* output,
+    uint32_t shift)
+{
   assert(batch > 0);
   assert(input != NULL);
-  assert(shift < 16);
   assert(output != NULL);
+  assert(shift < 16);
 
-  const int16x8_t vshift = vdupq_n_s16(shift);
+  const int16x8_t vshift = vdupq_n_s16((int16_t) shift);
 
   $if BATCH_TILE > 8:
     for (; batch >= ${BATCH_TILE}; batch -= ${BATCH_TILE}) {
diff --git a/src/s16-vlshift/scalar.c.in b/src/s16-vlshift/scalar.c.in
index 1f519380c..e59918d20 100644
--- a/src/s16-vlshift/scalar.c.in
+++ b/src/s16-vlshift/scalar.c.in
@@ -15,13 +15,13 @@ $assert BATCH_TILE >= 1
 void xnn_s16_vlshift_ukernel__scalar_x${BATCH_TILE}(
     size_t batch,
     const int16_t* input,
-    uint32_t shift,
-    int16_t* output) {
-
+    int16_t* output,
+    uint32_t shift)
+{
   assert(batch != 0);
   assert(input != NULL);
-  assert(shift < 16);
   assert(output != NULL);
+  assert(shift < 16);
 
   $if BATCH_TILE > 1:
     for (; batch >= ${BATCH_TILE}; batch -= ${BATCH_TILE}) {
@@ -36,7 +36,6 @@ void xnn_s16_vlshift_ukernel__scalar_x${BATCH_TILE}(
         output[${C}] = (int16_t) vout${C};
       output += ${BATCH_TILE};
     }
-
  if XNN_UNLIKELY(batch != 0) {
    do {
      const uint16_t vi = (uint16_t) *input++;
diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h
index 6499eba53..feeb633d8 100644
--- a/src/xnnpack/microfnptr.h
+++ b/src/xnnpack/microfnptr.h
@@ -1656,10 +1656,10 @@ typedef void (*xnn_u32_filterbank_subtract_ukernel_function)(
     uint32_t* output);
 
 typedef void (*xnn_s16_vlshift_ukernel_function)(
-    size_t batch_size,
+    size_t batch,
     const int16_t* input,
-    uint32_t shift,
-    int16_t* output);
+    int16_t* output,
+    uint32_t shift);
 
 typedef void (*xnn_cs16_vsquareabs_ukernel_function)(
     size_t batch_size,
diff --git a/src/xnnpack/vlshift.h b/src/xnnpack/vlshift.h
index 31f3be488..cba428f67 100644
--- a/src/xnnpack/vlshift.h
+++ b/src/xnnpack/vlshift.h
@@ -17,10 +17,10 @@ extern "C" {
 
 #define DECLARE_S16_VLSHIFT_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                          \
-    size_t batch_size,                                \
+    size_t batch,                                     \
     const int16_t* input,                             \
-    uint32_t shift,                                   \
-    int16_t* output);
+    int16_t* output,                                  \
+    uint32_t shift);
 
 
 DECLARE_S16_VLSHIFT_UKERNEL_FUNCTION(xnn_s16_vlshift_ukernel__neon_x8)
diff --git a/test/vlshift-microkernel-tester.h b/test/vlshift-microkernel-tester.h
index 6a65d0d26..7772281b1 100644
--- a/test/vlshift-microkernel-tester.h
+++ b/test/vlshift-microkernel-tester.h
@@ -82,7 +82,7 @@ class VLShiftMicrokernelTester {
       }
 
       // Call optimized micro-kernel.
-      vlshift(batch(), x_data, shift(), y.data());
+      vlshift(batch(), x_data, y.data(), shift());
 
       // Verify results.
       for (size_t n = 0; n < batch(); n++) {