path: root/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h
/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
#define VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_

#include "vpx_util/loongson_intrinsics.h"

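/* Computes the loop-filter decision masks, per byte lane:
 *   hev_out:  0xff where max(|p1 - p0|, |q1 - q0|) > thresh, i.e. a
 *             high-edge-variance pixel that takes the sharper filter path.
 *   mask_out: 0xff where the edge should be filtered at all, i.e.
 *             |p0 - q0| * 2 + |p1 - q1| / 2 <= b_limit and every
 *             adjacent-pixel difference is <= limit.
 *   flat_out: max(|p1 - p0|, |q1 - q0|), reused as the seed value for
 *             VP9_FLAT4 below.
 */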
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
                     limit_in, b_limit_in, thresh_in, hev_out, mask_out,     \
                     flat_out)                                               \
  do {                                                                       \
    __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;          \
    __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;          \
                                                                             \
    /* absolute subtraction of pixel values */                               \
    p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in);                             \
    p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in);                             \
    p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in);                             \
    q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in);                             \
    q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in);                             \
    q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in);                             \
    p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in);                             \
    p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in);                             \
                                                                             \
    /* calculation of hev */                                                 \
    flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m);                    \
    hev_out = __lsx_vslt_bu(thresh_in, flat_out);                            \
                                                                             \
    /* calculation of mask */                                                \
    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m);               \
    p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1);                           \
    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m);               \
    mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m);                      \
    mask_out = __lsx_vmax_bu(flat_out, mask_out);                            \
    p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m);                \
    mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out);                        \
    q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m);                \
    mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out);                        \
                                                                             \
    mask_out = __lsx_vslt_bu(limit_in, mask_out);                            \
    mask_out = __lsx_vxori_b(mask_out, 0xff);                                \
  } while (0)

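/* Computes the "flat" mask that selects the smooth filter8 path: flat_out
 * must hold max(|p1 - p0|, |q1 - q0|) on entry (the flat_out produced by
 * LPF_MASK_HEV) and becomes 0xff where |pX - p0| <= 1 and |qX - q0| <= 1
 * for X = 1..3.  Note the result is ANDed with a variable named 'mask'
 * captured from the caller's scope, not a macro parameter.
 */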
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)          \
  do {                                                                         \
    __m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0;                    \
    __m128i flat4_tmp = __lsx_vldi(1);                                         \
                                                                               \
    DUP4_ARG2(__lsx_vabsd_bu, p2_in, p0_in, q2_in, q0_in, p3_in, p0_in, q3_in, \
              q0_in, p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0);          \
    p2_asub_p0 = __lsx_vmax_bu(p2_asub_p0, q2_asub_q0);                        \
    flat_out = __lsx_vmax_bu(p2_asub_p0, flat_out);                            \
    p3_asub_p0 = __lsx_vmax_bu(p3_asub_p0, q3_asub_q0);                        \
    flat_out = __lsx_vmax_bu(p3_asub_p0, flat_out);                            \
                                                                               \
    flat_out = __lsx_vslt_bu(flat4_tmp, flat_out);                             \
    flat_out = __lsx_vxori_b(flat_out, 0xff);                                  \
    flat_out = flat_out & (mask);                                              \
  } while (0)

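/* Extends the flatness test to the outer taps for the 16-wide filter:
 * flat2_out becomes 0xff where |pX - p0| <= 1 and |qX - q0| <= 1 for
 * X = 4..7, ANDed with the flat_in mask produced by VP9_FLAT4.
 */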
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in,      \
                  q6_in, q7_in, flat_in, flat2_out)                            \
  do {                                                                         \
    __m128i flat5_tmp = __lsx_vldi(1);                                         \
    __m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0;                    \
    __m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0;                    \
    DUP4_ARG2(__lsx_vabsd_bu, p4_in, p0_in, q4_in, q0_in, p5_in, p0_in, q5_in, \
              q0_in, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0);          \
    DUP4_ARG2(__lsx_vabsd_bu, p6_in, p0_in, q6_in, q0_in, p7_in, p0_in, q7_in, \
              q0_in, p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0);          \
                                                                               \
    DUP2_ARG2(__lsx_vmax_bu, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0,   \
              p4_asub_p0, flat2_out);                                          \
    flat2_out = __lsx_vmax_bu(p4_asub_p0, flat2_out);                          \
    p6_asub_p0 = __lsx_vmax_bu(p6_asub_p0, q6_asub_q0);                        \
    flat2_out = __lsx_vmax_bu(p6_asub_p0, flat2_out);                          \
    p7_asub_p0 = __lsx_vmax_bu(p7_asub_p0, q7_asub_q0);                        \
    flat2_out = __lsx_vmax_bu(p7_asub_p0, flat2_out);                          \
    flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out);                           \
    flat2_out = __lsx_vxori_b(flat2_out, 0xff);                                \
    flat2_out = flat2_out & flat_in;                                           \
  } while (0)

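/* Applies the standard 4-tap loop filter to p1/p0/q0/q1.  Pixels are
 * biased into signed range (xor 0x80), then the saturating delta
 *   filt = clip((clip(p1 - q1) & hev) + 3 * (q0 - p0)) & mask
 * is split into t1 = clip(filt + 4) >> 3 (subtracted from q0) and
 * t2 = clip(filt + 3) >> 3 (added to p0); where hev is not set,
 * a rounded (t1 + 1) >> 1 additionally adjusts p1 and q1.
 */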
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out,  \
                           p0_out, q0_out, q1_out)                         \
  do {                                                                     \
    __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2;               \
    const __m128i cnst4b = __lsx_vldi(4);                                  \
    const __m128i cnst3b = __lsx_vldi(3);                                  \
    DUP4_ARG2(__lsx_vxori_b, p1_in, 0x80, p0_in, 0x80, q0_in, 0x80, q1_in, \
              0x80, p1_m, p0_m, q0_m, q1_m);                               \
    filt = __lsx_vssub_b(p1_m, q1_m);                                      \
    filt &= hev;                                                           \
                                                                           \
    q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m);                                 \
    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                 \
    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                 \
    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                 \
    filt &= mask;                                                          \
    DUP2_ARG2(__lsx_vsadd_b, filt, cnst4b, filt, cnst3b, t1, t2);          \
    DUP2_ARG2(__lsx_vsrai_b, t1, 3, t2, 3, t1, t2);                        \
                                                                           \
    q0_m = __lsx_vssub_b(q0_m, t1);                                        \
    p0_m = __lsx_vsadd_b(p0_m, t2);                                        \
    DUP2_ARG2(__lsx_vxori_b, q0_m, 0x80, p0_m, 0x80, q0_out, p0_out);      \
                                                                           \
    filt = __lsx_vsrari_b(t1, 1);                                          \
    hev = __lsx_vxori_b(hev, 0xff);                                        \
    filt &= hev;                                                           \
    q1_m = __lsx_vssub_b(q1_m, filt);                                      \
    p1_m = __lsx_vsadd_b(p1_m, filt);                                      \
    DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out);      \
  } while (0)

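/* The smooth filter8 path over taps p3..q3; inputs must already be
 * widened to 16-bit halfwords.  Each output is a rounded weighted
 * average whose weights sum to 8, e.g.
 *   p2_out = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
 *   q1_out = (p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4) >> 3
 * with intermediate sums shared across the six outputs.
 */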
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
                    q1_filt8_out, q2_filt8_out)                             \
  do {                                                                      \
    __m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2;                          \
                                                                            \
    tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in);                               \
    tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, p0_in);                         \
    tmp_filt8_0 = __lsx_vslli_h(p3_in, 1);                                  \
                                                                            \
    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_2);                   \
    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, q0_in);                         \
    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p3_in);                         \
    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, p2_in);                         \
    p2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
                                                                            \
    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p1_in);                         \
    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q1_in);                         \
    p1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
                                                                            \
    tmp_filt8_1 = __lsx_vadd_h(q2_in, q1_in);                               \
    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q0_in);                         \
    tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, tmp_filt8_1);                   \
    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, p0_in);                         \
    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, p3_in);                         \
    p0_filt8_out = __lsx_vsrari_h(tmp_filt8_0, 3);                          \
                                                                            \
    tmp_filt8_0 = __lsx_vadd_h(q2_in, q3_in);                               \
    tmp_filt8_0 = __lsx_vadd_h(p0_in, tmp_filt8_0);                         \
    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1);                   \
    tmp_filt8_1 = __lsx_vadd_h(q3_in, q3_in);                               \
    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, tmp_filt8_0);                   \
    q2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
                                                                            \
    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, q3_in);                         \
    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, q0_in);                         \
    q0_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
                                                                            \
    tmp_filt8_1 = __lsx_vsub_h(tmp_filt8_0, p2_in);                         \
    tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in);                               \
    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1);                   \
    q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
  } while (0)
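
/* Illustrative call sequence (a sketch of how a caller such as a 4-wide
 * horizontal loop filter might combine these macros; the variable names
 * below are assumptions for illustration, not part of this header):
 *
 *   __m128i mask, hev, flat;
 *   LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
 *                hev, mask, flat);
 *   VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
 *                      q1_out);
 */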

#endif  // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_