aboutsummaryrefslogtreecommitdiff
path: root/src/cs16-fftr/scalar.c.in
blob: 1b592288d0ae93d0daadea6bf85510230ad86f5d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert SAMPLE_TILE >= 1
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <xnnpack/math.h>
#include <xnnpack/fft.h>


// Scalar cs16 "fftr" micro-kernel template, unrolled by SAMPLE_TILE.
//
// Walks `input` from both ends towards the middle, two int16_t values at a
// time, combines each left/right pair with one twiddle pair and writes the
// results to the mirrored positions of `output`.
// NOTE(review): the r/i suffixes suggest every pair is an interleaved
// (real, imaginary) value - confirm against the declaration in xnnpack/fft.h.
//
// Parameters:
//   samples - number of int16_t pairs in `input`; asserted to be even and
//             at least 2.
//   input   - source buffer; elements [0, samples * 2) are read.
//   output  - destination buffer; elements [0, samples * 2 + 2) are written
//             (one more pair than is read from `input`).
//   twiddle - coefficient buffer; samples / 2 pairs are consumed, in order.
//
// The left and right cursors meet on pair index samples / 2: that input pair
// is read through both cursors, and the matching output pair is stored
// through both, with the store through the right cursor happening last.
void xnn_cs16_fftr_ukernel__scalar_x${SAMPLE_TILE}(
    size_t samples,
    const int16_t* input,
    int16_t* output,
    const int16_t* twiddle) {

  assert(samples >= 2);
  assert(samples % 2 == 0);
  assert(input != NULL);
  assert(output != NULL);
  assert(twiddle != NULL);

  // il reads forward from the first pair; ir starts one past the last pair
  // and is decremented before every read.
  const int16_t* il = input;
  const int16_t* ir = input + samples * 2;
  // The first pair is handled separately and does not use a twiddle.
  int32_t vdcr = (int32_t) il[0];
  int32_t vdci = (int32_t) il[1];
  il += 2;
  // Scale by 16383 / 32768 with a rounding bias of 16384 (about one half).
  // NOTE(review): assumes math_asr_s32(x, n) is an arithmetic right shift of
  // x by n bits - confirm in xnnpack/math.h.
  vdcr = math_asr_s32(vdcr * 16383 + 16384, 15);
  vdci = math_asr_s32(vdci * 16383 + 16384, 15);

  // ol writes forward from the first output pair; or initially addresses the
  // extra pair at index `samples` and then moves backwards.
  // NOTE(review): `or` is an ordinary identifier in C, but it is an operator
  // token in C++ and a macro once <iso646.h> is included.
  int16_t* ol  = output;
  int16_t* or = output + samples * 2;
  // First output pair gets the sum, the extra last pair gets the difference;
  // the second element of both pairs is set to zero.
  ol[0] = vdcr + vdci;
  ol[1] = 0;
  ol += 2;
  or[0] = vdcr - vdci;
  or[1] = 0;

  // Every iteration below consumes one pair from each end, so only
  // samples / 2 iterations are needed.
  samples >>= 1;

  // Unrolled main loop, generated only when SAMPLE_TILE > 1.
  $if SAMPLE_TILE > 1:
    for (; samples >= ${SAMPLE_TILE}; samples -= ${SAMPLE_TILE}) {
      // Load a tile of pairs from the left cursor, in forward order.
      $for C in range(SAMPLE_TILE):
        int32_t vilr${C} = il[${C * 2 + 0}];
        int32_t vili${C} = il[${C * 2 + 1}];
      il += ${SAMPLE_TILE} * 2;
      ir -= ${SAMPLE_TILE} * 2;
      // Load the tile from the right cursor back to front, so that lane C
      // pairs the C-th element from the left with the C-th from the right.
      // The second element of every right-hand pair is negated.
      $for C in range(SAMPLE_TILE):
        int32_t virr${C} =  (int32_t) ir[${(SAMPLE_TILE - 1 - C) * 2 + 0}];
        int32_t viri${C} = -(int32_t) ir[${(SAMPLE_TILE - 1 - C) * 2 + 1}];
      // One twiddle pair per lane, consumed in forward order.
      $for C in range(SAMPLE_TILE):
        const int32_t vtwr${C} = twiddle[${C * 2 + 0}];
        const int32_t vtwi${C} = twiddle[${C * 2 + 1}];
      twiddle += ${SAMPLE_TILE} * 2;

      // Same 16383 / 32768 rounding scale as applied to the first pair.
      $for C in range(SAMPLE_TILE):
        vilr${C} = math_asr_s32(vilr${C} * 16383 + 16384, 15);
        virr${C} = math_asr_s32(virr${C} * 16383 + 16384, 15);
      $for C in range(SAMPLE_TILE):
        vili${C} = math_asr_s32(vili${C} * 16383 + 16384, 15);
        viri${C} = math_asr_s32(viri${C} * 16383 + 16384, 15);
      // vacc1 = left + right, vacc2 = left - right, element-wise.
      $for C in range(SAMPLE_TILE):
        const int32_t vacc1r${C} = vilr${C} + virr${C};
        const int32_t vacc2r${C} = vilr${C} - virr${C};
      $for C in range(SAMPLE_TILE):
        const int32_t vacc1i${C} = vili${C} + viri${C};
        const int32_t vacc2i${C} = vili${C} - viri${C};

      // Multiply vacc2 by the twiddle pair: (a*c - b*d, a*d + b*c), rounded
      // with a bias of 16384 and shifted right by 15.
      // NOTE(review): this has the form of a complex product with twiddles
      // presumably stored in Q15 - confirm against the twiddle table.
      $for C in range(SAMPLE_TILE):
        const int32_t twr${C} = math_asr_s32(vacc2r${C} * vtwr${C} - vacc2i${C} * vtwi${C} + 16384, 15);
      $for C in range(SAMPLE_TILE):
        const int32_t twi${C} = math_asr_s32(vacc2r${C} * vtwi${C} + vacc2i${C} * vtwr${C} + 16384, 15);

      // Left-hand outputs: (vacc1 + tw) shifted right by one, stored in
      // forward order.
      $for C in range(SAMPLE_TILE):
        ol[${C * 2 + 0}] = math_asr_s32(vacc1r${C} + twr${C}, 1);
        ol[${C * 2 + 1}] = math_asr_s32(vacc1i${C} + twi${C}, 1);
      ol += ${SAMPLE_TILE} * 2;
      or -= ${SAMPLE_TILE} * 2;
      // Right-hand outputs are stored back to front; note the swapped
      // operand order (twi - vacc1i) for the second element.
      $for C in range(SAMPLE_TILE):
        or[${(SAMPLE_TILE - 1 - C) * 2 + 0}] = math_asr_s32(vacc1r${C} - twr${C}, 1);
        or[${(SAMPLE_TILE - 1 - C) * 2 + 1}] = math_asr_s32(twi${C} - vacc1i${C}, 1);
    }

  // Remainder loop: one left/right pair per iteration, using the same
  // arithmetic as one lane of the unrolled loop. When SAMPLE_TILE is 1 the
  // unrolled loop is not generated and this loop does all of the work.
  // NOTE(review): XNN_UNLIKELY is presumably a branch-prediction hint macro
  // that supplies the parentheses around the condition - defined elsewhere.
  if XNN_UNLIKELY(samples != 0) {
    do {
      int32_t vilr = il[0];
      int32_t vili = il[1];
      il += 2;
      ir -= 2;
      // The second element of the right-hand pair is negated on load.
      int32_t virr =  (int32_t) ir[0];
      int32_t viri = -(int32_t) ir[1];
      const int32_t vtwr = twiddle[0];
      const int32_t vtwi = twiddle[1];
      twiddle += 2;

      // Scale all four inputs by 16383 / 32768 with rounding.
      vilr =  math_asr_s32(vilr * 16383 + 16384, 15);
      vili =  math_asr_s32(vili * 16383 + 16384, 15);
      virr = math_asr_s32(virr * 16383 + 16384, 15);
      viri = math_asr_s32(viri * 16383 + 16384, 15);
      // vacc1 = left + right, vacc2 = left - right.
      const int32_t vacc1r = vilr + virr;
      const int32_t vacc1i = vili + viri;
      const int32_t vacc2r = vilr - virr;
      const int32_t vacc2i = vili - viri;

      // vacc2 multiplied by the twiddle pair, rounded and shifted right by 15.
      const int32_t twr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15);
      const int32_t twi = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15);

      // Store the left-hand result, then step the right cursor back and
      // store the mirrored result.
      ol[0] = math_asr_s32(vacc1r + twr, 1);
      ol[1] = math_asr_s32(vacc1i + twi, 1);
      ol += 2;
      or -= 2;
      or[0] = math_asr_s32(vacc1r - twr, 1);
      or[1] = math_asr_s32(twi - vacc1i, 1);

    } while (--samples != 0);
  }
}