src/common_audio/vad/main/source/vad_core.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132

/*
 *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


/*
 * This header file includes the descriptions of the core VAD calls.
 */

#ifndef WEBRTC_VAD_CORE_H_
#define WEBRTC_VAD_CORE_H_

#include "typedefs.h"
#include "vad_defines.h"

typedef struct VadInstT_
{

    WebRtc_Word16 vad;
    WebRtc_Word32 downsampling_filter_states[4];
    WebRtc_Word16 noise_means[NUM_TABLE_VALUES];
    WebRtc_Word16 speech_means[NUM_TABLE_VALUES];
    WebRtc_Word16 noise_stds[NUM_TABLE_VALUES];
    WebRtc_Word16 speech_stds[NUM_TABLE_VALUES];
    WebRtc_Word32 frame_counter;
    WebRtc_Word16 over_hang; // Over Hang
    WebRtc_Word16 num_of_speech;
    WebRtc_Word16 index_vector[16 * NUM_CHANNELS];
    WebRtc_Word16 low_value_vector[16 * NUM_CHANNELS];
    WebRtc_Word16 mean_value[NUM_CHANNELS];
    WebRtc_Word16 upper_state[5];
    WebRtc_Word16 lower_state[5];
    WebRtc_Word16 hp_filter_state[4];
    WebRtc_Word16 over_hang_max_1[3];
    WebRtc_Word16 over_hang_max_2[3];
    WebRtc_Word16 individual[3];
    WebRtc_Word16 total[3];

    short init_flag;

} VadInstT;

/****************************************************************************
 * WebRtcVad_InitCore(...)
 *
 * This function initializes a VAD instance
 *
 * Input:
 *      - inst      : Instance that should be initialized
 *      - mode      : Aggressiveness degree
 *                    0 (High quality) - 3 (Highly aggressive)
 *
 * Output:
 *      - inst      : Initialized instance
 *
 * Return value     :  0 - Ok
 *                    -1 - Error
 */
int WebRtcVad_InitCore(VadInstT* inst, short mode);

/****************************************************************************
 * WebRtcVad_set_mode_core(...)
 *
 * This function changes the VAD settings
 *
 * Input:
 *      - inst      : VAD instance
 *      - mode      : Aggressiveness degree
 *                    0 (High quality) - 3 (Highly aggressive)
 *
 * Output:
 *      - inst      : Changed  instance
 *
 * Return value     :  0 - Ok
 *                    -1 - Error
 */

int WebRtcVad_set_mode_core(VadInstT* inst, short mode);

/****************************************************************************
 * WebRtcVad_CalcVad32khz(...) 
 * WebRtcVad_CalcVad16khz(...) 
 * WebRtcVad_CalcVad8khz(...) 
 *
 * Calculate probability for active speech and make VAD decision.
 *
 * Input:
 *      - inst          : Instance that should be initialized
 *      - speech_frame  : Input speech frame
 *      - frame_length  : Number of input samples
 *
 * Output:
 *      - inst          : Updated filter states etc.
 *
 * Return value         : VAD decision
 *                        0 - No active speech
 *                        1-6 - Active speech
 */
WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT* inst, WebRtc_Word16* speech_frame,
                                     int frame_length);
WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT* inst, WebRtc_Word16* speech_frame,
                                     int frame_length);
WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT* inst, WebRtc_Word16* speech_frame,
                                    int frame_length);

/****************************************************************************
 * WebRtcVad_GmmProbability(...)
 *
 * This function calculates the probabilities for background noise and
 * speech using Gaussian Mixture Models. A hypothesis-test is performed to decide
 * which type of signal is most probable.
 *
 * Input:
 *      - inst              : Pointer to VAD instance
 *      - feature_vector    : Feature vector = log10(energy in frequency band)
 *      - total_power       : Total power in frame.
 *      - frame_length      : Number of input samples
 *
 * Output:
 *      VAD decision        : 0 - noise, 1 - speech
 *    
 */
WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT* inst, WebRtc_Word16* feature_vector,
                                       WebRtc_Word16 total_power, int frame_length);

#endif // WEBRTC_VAD_CORE_H_