aboutsummaryrefslogtreecommitdiff
path: root/webrtc/modules/audio_processing/vad/pitch_based_vad.cc
blob: fce144de6b0a3cc2c57f68509316687c77f24940 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"

#include <assert.h>
#include <math.h>
#include <string.h>

#include "webrtc/modules/audio_processing/vad/vad_circular_buffer.h"
#include "webrtc/modules/audio_processing/vad/common.h"
#include "webrtc/modules/audio_processing/vad/noise_gmm_tables.h"
#include "webrtc/modules/audio_processing/vad/voice_gmm_tables.h"
#include "webrtc/modules/include/module_common_types.h"

namespace webrtc {

static_assert(kNoiseGmmDim == kVoiceGmmDim,
              "noise and voice gmm dimension not equal");

// These values should match MATLAB counterparts for unit-tests to pass.
static const int kPosteriorHistorySize = 500;  // 5 sec of 10 ms frames.
static const double kInitialPriorProbability = 0.3;
static const int kTransientWidthThreshold = 7;
static const double kLowProbabilityThreshold = 0.2;

static double LimitProbability(double p) {
  const double kLimHigh = 0.99;
  const double kLimLow = 0.01;

  if (p > kLimHigh)
    p = kLimHigh;
  else if (p < kLimLow)
    p = kLimLow;
  return p;
}

PitchBasedVad::PitchBasedVad()
    : p_prior_(kInitialPriorProbability),
      circular_buffer_(VadCircularBuffer::Create(kPosteriorHistorySize)) {
  // Setup noise GMM.
  noise_gmm_.dimension = kNoiseGmmDim;
  noise_gmm_.num_mixtures = kNoiseGmmNumMixtures;
  noise_gmm_.weight = kNoiseGmmWeights;
  noise_gmm_.mean = &kNoiseGmmMean[0][0];
  noise_gmm_.covar_inverse = &kNoiseGmmCovarInverse[0][0][0];

  // Setup voice GMM.
  voice_gmm_.dimension = kVoiceGmmDim;
  voice_gmm_.num_mixtures = kVoiceGmmNumMixtures;
  voice_gmm_.weight = kVoiceGmmWeights;
  voice_gmm_.mean = &kVoiceGmmMean[0][0];
  voice_gmm_.covar_inverse = &kVoiceGmmCovarInverse[0][0][0];
}

PitchBasedVad::~PitchBasedVad() {
}

int PitchBasedVad::VoicingProbability(const AudioFeatures& features,
                                      double* p_combined) {
  double p;
  double gmm_features[3];
  double pdf_features_given_voice;
  double pdf_features_given_noise;
  // These limits are the same in matlab implementation 'VoicingProbGMM().'
  const double kLimLowLogPitchGain = -2.0;
  const double kLimHighLogPitchGain = -0.9;
  const double kLimLowSpectralPeak = 200;
  const double kLimHighSpectralPeak = 2000;
  const double kEps = 1e-12;
  for (size_t n = 0; n < features.num_frames; n++) {
    gmm_features[0] = features.log_pitch_gain[n];
    gmm_features[1] = features.spectral_peak[n];
    gmm_features[2] = features.pitch_lag_hz[n];

    pdf_features_given_voice = EvaluateGmm(gmm_features, voice_gmm_);
    pdf_features_given_noise = EvaluateGmm(gmm_features, noise_gmm_);

    if (features.spectral_peak[n] < kLimLowSpectralPeak ||
        features.spectral_peak[n] > kLimHighSpectralPeak ||
        features.log_pitch_gain[n] < kLimLowLogPitchGain) {
      pdf_features_given_voice = kEps * pdf_features_given_noise;
    } else if (features.log_pitch_gain[n] > kLimHighLogPitchGain) {
      pdf_features_given_noise = kEps * pdf_features_given_voice;
    }

    p = p_prior_ * pdf_features_given_voice /
        (pdf_features_given_voice * p_prior_ +
         pdf_features_given_noise * (1 - p_prior_));

    p = LimitProbability(p);

    // Combine pitch-based probability with standalone probability, before
    // updating prior probabilities.
    double prod_active = p * p_combined[n];
    double prod_inactive = (1 - p) * (1 - p_combined[n]);
    p_combined[n] = prod_active / (prod_active + prod_inactive);

    if (UpdatePrior(p_combined[n]) < 0)
      return -1;
    // Limit prior probability. With a zero prior probability the posterior
    // probability is always zero.
    p_prior_ = LimitProbability(p_prior_);
  }
  return 0;
}

int PitchBasedVad::UpdatePrior(double p) {
  circular_buffer_->Insert(p);
  if (circular_buffer_->RemoveTransient(kTransientWidthThreshold,
                                        kLowProbabilityThreshold) < 0)
    return -1;
  p_prior_ = circular_buffer_->Mean();
  return 0;
}

}  // namespace webrtc