aboutsummaryrefslogtreecommitdiff
path: root/src/cs16-fftr/gen/scalar-x2.c
blob: 83dc7c20c66027f76936489294ba095fc8d21b66 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
// Auto-generated file. Do not edit!
//   Template: src/cs16-fftr/scalar.c.in
//   Generator: tools/xngen
//
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <xnnpack/math.h>
#include <xnnpack/fft.h>


// Pre-scales one value: multiplies by 16383, adds 16384, and shifts right by
// 15 bits through math_asr_s32 (declared in <xnnpack/math.h>).
static inline int32_t cs16_fftr_scalar_x2_prescale(int32_t value) {
  return math_asr_s32(value * 16383 + 16384, 15);
}

// In-place butterfly pass over interleaved (real, imaginary) int16 pairs,
// handling two pairs-of-pairs per main-loop iteration.
//
//   samples - even count >= 2. The kernel touches data[0] through
//             data[2 * samples + 1], i.e. samples + 1 interleaved pairs.
//   data    - buffer that is both read and overwritten.
//   twiddle - interleaved (real, imaginary) int16 factors; two values are
//             consumed per butterfly, samples values in total.
//
// NOTE(review): the name suggests this is the post-processing step of a
// real-input FFT in 16-bit fixed point -- confirm against the caller.
//
// A low cursor walks forward from pair 1 while a high cursor walks backward
// from pair `samples`. On the final butterfly both cursors address the same
// pair, so every load of an iteration is issued before any store, and the
// low-side stores come before the high-side stores.
void xnn_cs16_fftr_ukernel__scalar_x2(
    size_t samples,
    int16_t* data,
    const int16_t* twiddle) {

  assert(samples >= 2);
  assert(samples % 2 == 0);
  assert(data != NULL);
  assert(twiddle != NULL);

  int16_t* lo = data;
  int16_t* hi = data + samples * 2;

  // Pair 0 is handled on its own: the low end receives re + im, the high end
  // receives re - im, and both imaginary slots are cleared.
  const int32_t first_re = cs16_fftr_scalar_x2_prescale((int32_t) lo[0]);
  const int32_t first_im = cs16_fftr_scalar_x2_prescale((int32_t) lo[1]);

  lo[0] = first_re + first_im;
  lo[1] = 0;
  lo += 2;
  hi[0] = first_re - first_im;
  hi[1] = 0;

  // Each butterfly consumes one pair from each end of the buffer.
  size_t butterflies = samples >> 1;

  while (butterflies >= 2) {
    hi -= 2 * 2;

    // Low side, in ascending order.
    const int32_t lo_re0 = cs16_fftr_scalar_x2_prescale((int32_t) lo[0]);
    const int32_t lo_im0 = cs16_fftr_scalar_x2_prescale((int32_t) lo[1]);
    const int32_t lo_re1 = cs16_fftr_scalar_x2_prescale((int32_t) lo[2]);
    const int32_t lo_im1 = cs16_fftr_scalar_x2_prescale((int32_t) lo[3]);

    // High side, in descending order: butterfly 0 pairs with hi[2..3] and
    // butterfly 1 with hi[0..1]. The imaginary part is negated before scaling.
    const int32_t hi_re0 = cs16_fftr_scalar_x2_prescale((int32_t) hi[2]);
    const int32_t hi_im0 = cs16_fftr_scalar_x2_prescale(-(int32_t) hi[3]);
    const int32_t hi_re1 = cs16_fftr_scalar_x2_prescale((int32_t) hi[0]);
    const int32_t hi_im1 = cs16_fftr_scalar_x2_prescale(-(int32_t) hi[1]);

    const int32_t tw_re0 = twiddle[0];
    const int32_t tw_im0 = twiddle[1];
    const int32_t tw_re1 = twiddle[2];
    const int32_t tw_im1 = twiddle[3];
    twiddle += 2 * 2;

    const int32_t sum_re0 = lo_re0 + hi_re0;
    const int32_t sum_im0 = lo_im0 + hi_im0;
    const int32_t sum_re1 = lo_re1 + hi_re1;
    const int32_t sum_im1 = lo_im1 + hi_im1;
    const int32_t diff_re0 = lo_re0 - hi_re0;
    const int32_t diff_im0 = lo_im0 - hi_im0;
    const int32_t diff_re1 = lo_re1 - hi_re1;
    const int32_t diff_im1 = lo_im1 - hi_im1;

    // Complex product of the difference term with the twiddle factor, with
    // 16384 added before the 15-bit right shift.
    const int32_t rot_re0 = math_asr_s32(diff_re0 * tw_re0 - diff_im0 * tw_im0 + 16384, 15);
    const int32_t rot_im0 = math_asr_s32(diff_re0 * tw_im0 + diff_im0 * tw_re0 + 16384, 15);
    const int32_t rot_re1 = math_asr_s32(diff_re1 * tw_re1 - diff_im1 * tw_im1 + 16384, 15);
    const int32_t rot_im1 = math_asr_s32(diff_re1 * tw_im1 + diff_im1 * tw_re1 + 16384, 15);

    // Low-side stores first, then high-side stores (see the aliasing note in
    // the function header).
    lo[0] = math_asr_s32(sum_re0 + rot_re0, 1);
    lo[1] = math_asr_s32(sum_im0 + rot_im0, 1);
    lo[2] = math_asr_s32(sum_re1 + rot_re1, 1);
    lo[3] = math_asr_s32(sum_im1 + rot_im1, 1);
    hi[2] = math_asr_s32(sum_re0 - rot_re0, 1);
    hi[3] = math_asr_s32(rot_im0 - sum_im0, 1);
    hi[0] = math_asr_s32(sum_re1 - rot_re1, 1);
    hi[1] = math_asr_s32(rot_im1 - sum_im1, 1);

    lo += 2 * 2;
    butterflies -= 2;
  }

  // The loop above leaves either 0 or 1 butterfly, so a single pass is enough
  // to finish the remainder.
  if XNN_UNLIKELY(butterflies != 0) {
    hi -= 2;

    const int32_t lo_re = cs16_fftr_scalar_x2_prescale((int32_t) lo[0]);
    const int32_t lo_im = cs16_fftr_scalar_x2_prescale((int32_t) lo[1]);
    const int32_t hi_re = cs16_fftr_scalar_x2_prescale((int32_t) hi[0]);
    const int32_t hi_im = cs16_fftr_scalar_x2_prescale(-(int32_t) hi[1]);

    const int32_t tw_re = twiddle[0];
    const int32_t tw_im = twiddle[1];

    const int32_t sum_re = lo_re + hi_re;
    const int32_t sum_im = lo_im + hi_im;
    const int32_t diff_re = lo_re - hi_re;
    const int32_t diff_im = lo_im - hi_im;

    const int32_t rot_re = math_asr_s32(diff_re * tw_re - diff_im * tw_im + 16384, 15);
    const int32_t rot_im = math_asr_s32(diff_re * tw_im + diff_im * tw_re + 16384, 15);

    lo[0] = math_asr_s32(sum_re + rot_re, 1);
    lo[1] = math_asr_s32(sum_im + rot_im, 1);
    hi[0] = math_asr_s32(sum_re - rot_re, 1);
    hi[1] = math_asr_s32(rot_im - sum_im, 1);
  }
}