aboutsummaryrefslogtreecommitdiff
path: root/webrtc/modules/audio_processing/vad
diff options
context:
space:
mode:
Diffstat (limited to 'webrtc/modules/audio_processing/vad')
-rw-r--r--webrtc/modules/audio_processing/vad/Android.mk52
-rw-r--r--webrtc/modules/audio_processing/vad/common.h27
-rw-r--r--webrtc/modules/audio_processing/vad/gmm.cc64
-rw-r--r--webrtc/modules/audio_processing/vad/gmm.h45
-rw-r--r--webrtc/modules/audio_processing/vad/gmm_unittest.cc65
-rw-r--r--webrtc/modules/audio_processing/vad/noise_gmm_tables.h85
-rw-r--r--webrtc/modules/audio_processing/vad/pitch_based_vad.cc124
-rw-r--r--webrtc/modules/audio_processing/vad/pitch_based_vad.h57
-rw-r--r--webrtc/modules/audio_processing/vad/pitch_based_vad_unittest.cc75
-rw-r--r--webrtc/modules/audio_processing/vad/pitch_internal.cc51
-rw-r--r--webrtc/modules/audio_processing/vad/pitch_internal.h26
-rw-r--r--webrtc/modules/audio_processing/vad/pitch_internal_unittest.cc50
-rw-r--r--webrtc/modules/audio_processing/vad/pole_zero_filter.cc106
-rw-r--r--webrtc/modules/audio_processing/vad/pole_zero_filter.h52
-rw-r--r--webrtc/modules/audio_processing/vad/pole_zero_filter_unittest.cc102
-rw-r--r--webrtc/modules/audio_processing/vad/standalone_vad.cc93
-rw-r--r--webrtc/modules/audio_processing/vad/standalone_vad.h70
-rw-r--r--webrtc/modules/audio_processing/vad/standalone_vad_unittest.cc104
-rw-r--r--webrtc/modules/audio_processing/vad/vad_audio_proc.cc275
-rw-r--r--webrtc/modules/audio_processing/vad/vad_audio_proc.h89
-rw-r--r--webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h94
-rw-r--r--webrtc/modules/audio_processing/vad/vad_audio_proc_unittest.cc63
-rw-r--r--webrtc/modules/audio_processing/vad/vad_circular_buffer.cc138
-rw-r--r--webrtc/modules/audio_processing/vad/vad_circular_buffer.h69
-rw-r--r--webrtc/modules/audio_processing/vad/vad_circular_buffer_unittest.cc133
-rw-r--r--webrtc/modules/audio_processing/vad/voice_activity_detector.cc85
-rw-r--r--webrtc/modules/audio_processing/vad/voice_activity_detector.h70
-rw-r--r--webrtc/modules/audio_processing/vad/voice_activity_detector_unittest.cc168
-rw-r--r--webrtc/modules/audio_processing/vad/voice_gmm_tables.h85
29 files changed, 2517 insertions, 0 deletions
diff --git a/webrtc/modules/audio_processing/vad/Android.mk b/webrtc/modules/audio_processing/vad/Android.mk
new file mode 100644
index 0000000000..98909b0dda
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/Android.mk
@@ -0,0 +1,52 @@
+# Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+
+include $(LOCAL_PATH)/../../../../android-webrtc.mk
+
+LOCAL_ARM_MODE := arm
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+LOCAL_MODULE := libwebrtc_apvad
+LOCAL_MODULE_TAGS := optional
+LOCAL_CPP_EXTENSION := .cc
+LOCAL_SRC_FILES := \
+ gmm.cc \
+ pitch_based_vad.cc \
+ pitch_internal.cc \
+ pole_zero_filter.cc \
+ standalone_vad.cc \
+ vad_audio_proc.cc \
+ vad_circular_buffer.cc \
+ voice_activity_detector.cc \
+
+# Flags passed to both C and C++ files.
+LOCAL_CFLAGS := \
+ $(MY_WEBRTC_COMMON_DEFS)
+
+LOCAL_CFLAGS_arm := $(MY_WEBRTC_COMMON_DEFS_arm)
+LOCAL_CFLAGS_x86 := $(MY_WEBRTC_COMMON_DEFS_x86)
+LOCAL_CFLAGS_mips := $(MY_WEBRTC_COMMON_DEFS_mips)
+LOCAL_CFLAGS_arm64 := $(MY_WEBRTC_COMMON_DEFS_arm64)
+LOCAL_CFLAGS_x86_64 := $(MY_WEBRTC_COMMON_DEFS_x86_64)
+LOCAL_CFLAGS_mips64 := $(MY_WEBRTC_COMMON_DEFS_mips64)
+
+# Include paths placed before CFLAGS/CPPFLAGS
+LOCAL_C_INCLUDES := \
+ $(LOCAL_PATH) \
+ $(LOCAL_PATH)/../../../.. \
+
+ifdef WEBRTC_STL
+LOCAL_NDK_STL_VARIANT := $(WEBRTC_STL)
+LOCAL_SDK_VERSION := 14
+LOCAL_MODULE := $(LOCAL_MODULE)_$(WEBRTC_STL)
+endif
+
+include $(BUILD_STATIC_LIBRARY)
diff --git a/webrtc/modules/audio_processing/vad/common.h b/webrtc/modules/audio_processing/vad/common.h
new file mode 100644
index 0000000000..be99c1c59d
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/common.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_COMMON_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_COMMON_H_
+
+static const int kSampleRateHz = 16000;
+static const size_t kLength10Ms = kSampleRateHz / 100;
+static const size_t kMaxNumFrames = 4;
+
+struct AudioFeatures {
+ double log_pitch_gain[kMaxNumFrames];
+ double pitch_lag_hz[kMaxNumFrames];
+ double spectral_peak[kMaxNumFrames];
+ double rms[kMaxNumFrames];
+ size_t num_frames;
+ bool silence;
+};
+
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_COMMON_H_
diff --git a/webrtc/modules/audio_processing/vad/gmm.cc b/webrtc/modules/audio_processing/vad/gmm.cc
new file mode 100644
index 0000000000..9651975913
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/gmm.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/gmm.h"
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "webrtc/typedefs.h"
+
+namespace webrtc {
+
+static const int kMaxDimension = 10;
+
+static void RemoveMean(const double* in,
+ const double* mean_vec,
+ int dimension,
+ double* out) {
+ for (int n = 0; n < dimension; ++n)
+ out[n] = in[n] - mean_vec[n];
+}
+
+static double ComputeExponent(const double* in,
+ const double* covar_inv,
+ int dimension) {
+ double q = 0;
+ for (int i = 0; i < dimension; ++i) {
+ double v = 0;
+ for (int j = 0; j < dimension; j++)
+ v += (*covar_inv++) * in[j];
+ q += v * in[i];
+ }
+ q *= -0.5;
+ return q;
+}
+
+double EvaluateGmm(const double* x, const GmmParameters& gmm_parameters) {
+ if (gmm_parameters.dimension > kMaxDimension) {
+ return -1; // This is invalid pdf so the caller can check this.
+ }
+ double f = 0;
+ double v[kMaxDimension];
+ const double* mean_vec = gmm_parameters.mean;
+ const double* covar_inv = gmm_parameters.covar_inverse;
+
+ for (int n = 0; n < gmm_parameters.num_mixtures; n++) {
+ RemoveMean(x, mean_vec, gmm_parameters.dimension, v);
+ double q = ComputeExponent(v, covar_inv, gmm_parameters.dimension) +
+ gmm_parameters.weight[n];
+ f += exp(q);
+ mean_vec += gmm_parameters.dimension;
+ covar_inv += gmm_parameters.dimension * gmm_parameters.dimension;
+ }
+ return f;
+}
+
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/vad/gmm.h b/webrtc/modules/audio_processing/vad/gmm.h
new file mode 100644
index 0000000000..9f3e578fef
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/gmm.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_GMM_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_GMM_H_
+
+namespace webrtc {
+
+// A structure that specifies a GMM.
+// A GMM is formulated as
+// f(x) = w[0] * mixture[0] + w[1] * mixture[1] + ... +
+// w[num_mixtures - 1] * mixture[num_mixtures - 1];
+// Where a 'mixture' is a Gaussian density.
+
+struct GmmParameters {
+ // weight[n] = log(w[n]) - |dimension|/2 * log(2*pi) - 1/2 * log(det(cov[n]));
+ // where cov[n] is the covariance matrix of mixture n;
+ const double* weight;
+ // pointer to the first element of a |num_mixtures|x|dimension| matrix
+ // where kth row is the mean of the kth mixture.
+ const double* mean;
+ // pointer to the first element of a |num_mixtures|x|dimension|x|dimension|
+ // 3D-matrix, where the kth 2D-matrix is the inverse of the covariance
+ // matrix of the kth mixture.
+ const double* covar_inverse;
+ // Dimensionality of the mixtures.
+ int dimension;
+ // number of the mixtures.
+ int num_mixtures;
+};
+
+// Evaluate the given GMM, according to |gmm_parameters|, at the given point
+// |x|. If the dimensionality of the given GMM is larger that the maximum
+// acceptable dimension by the following function -1 is returned.
+double EvaluateGmm(const double* x, const GmmParameters& gmm_parameters);
+
+} // namespace webrtc
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_GMM_H_
diff --git a/webrtc/modules/audio_processing/vad/gmm_unittest.cc b/webrtc/modules/audio_processing/vad/gmm_unittest.cc
new file mode 100644
index 0000000000..f8e1bde776
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/gmm_unittest.cc
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/gmm.h"
+
+#include <math.h>
+
+#include "testing/gtest/include/gtest/gtest.h"
+#include "webrtc/modules/audio_processing/vad/noise_gmm_tables.h"
+#include "webrtc/modules/audio_processing/vad/voice_gmm_tables.h"
+
+namespace webrtc {
+
+TEST(GmmTest, EvaluateGmm) {
+ GmmParameters noise_gmm;
+ GmmParameters voice_gmm;
+
+ // Setup noise GMM.
+ noise_gmm.dimension = kNoiseGmmDim;
+ noise_gmm.num_mixtures = kNoiseGmmNumMixtures;
+ noise_gmm.weight = kNoiseGmmWeights;
+ noise_gmm.mean = &kNoiseGmmMean[0][0];
+ noise_gmm.covar_inverse = &kNoiseGmmCovarInverse[0][0][0];
+
+ // Setup voice GMM.
+ voice_gmm.dimension = kVoiceGmmDim;
+ voice_gmm.num_mixtures = kVoiceGmmNumMixtures;
+ voice_gmm.weight = kVoiceGmmWeights;
+ voice_gmm.mean = &kVoiceGmmMean[0][0];
+ voice_gmm.covar_inverse = &kVoiceGmmCovarInverse[0][0][0];
+
+ // Test vectors. These are the mean of the GMM means.
+ const double kXVoice[kVoiceGmmDim] = {
+ -1.35893162459863, 602.862491970368, 178.022069191324};
+ const double kXNoise[kNoiseGmmDim] = {
+ -2.33443722724409, 2827.97828765184, 141.114178166812};
+
+ // Expected pdf values. These values are computed in MATLAB using EvalGmm.m
+ const double kPdfNoise = 1.88904409403101e-07;
+ const double kPdfVoice = 1.30453996982266e-06;
+
+ // Relative error should be smaller that the following value.
+ const double kAcceptedRelativeErr = 1e-10;
+
+ // Test Voice.
+ double pdf = EvaluateGmm(kXVoice, voice_gmm);
+ EXPECT_GT(pdf, 0);
+ double relative_error = fabs(pdf - kPdfVoice) / kPdfVoice;
+ EXPECT_LE(relative_error, kAcceptedRelativeErr);
+
+ // Test Noise.
+ pdf = EvaluateGmm(kXNoise, noise_gmm);
+ EXPECT_GT(pdf, 0);
+ relative_error = fabs(pdf - kPdfNoise) / kPdfNoise;
+ EXPECT_LE(relative_error, kAcceptedRelativeErr);
+}
+
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/vad/noise_gmm_tables.h b/webrtc/modules/audio_processing/vad/noise_gmm_tables.h
new file mode 100644
index 0000000000..293af57a2a
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/noise_gmm_tables.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// GMM tables for inactive segments. Generated by MakeGmmTables.m.
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_NOISE_GMM_TABLES_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_NOISE_GMM_TABLES_H_
+
+static const int kNoiseGmmNumMixtures = 12;
+static const int kNoiseGmmDim = 3;
+
+static const double
+ kNoiseGmmCovarInverse[kNoiseGmmNumMixtures][kNoiseGmmDim][kNoiseGmmDim] = {
+ {{7.36219567592941e+00, 4.83060785179861e-03, 1.23335151497610e-02},
+ {4.83060785179861e-03, 1.65289507047817e-04, -2.41490588169997e-04},
+ {1.23335151497610e-02, -2.41490588169997e-04, 6.59472060689382e-03}},
+ {{8.70265239309140e+00, -5.30636201431086e-04, 5.44014966585347e-03},
+ {-5.30636201431086e-04, 3.11095453521008e-04, -1.86287206836035e-04},
+ {5.44014966585347e-03, -1.86287206836035e-04, 6.29493388790744e-04}},
+ {{4.53467851955055e+00, -3.92977536695197e-03, -2.46521420693317e-03},
+ {-3.92977536695197e-03, 4.94650752632750e-05, -1.08587438501826e-05},
+ {-2.46521420693317e-03, -1.08587438501826e-05, 9.28793975422261e-05}},
+ {{9.26817997114275e-01, -4.03976069276753e-04, -3.56441427392165e-03},
+ {-4.03976069276753e-04, 2.51976251631430e-06, 1.46914206734572e-07},
+ {-3.56441427392165e-03, 1.46914206734572e-07, 8.19914567685373e-05}},
+ {{7.61715986787441e+00, -1.54889041216888e-04, 2.41756280071656e-02},
+ {-1.54889041216888e-04, 3.50282550461672e-07, -6.27251196972490e-06},
+ {2.41756280071656e-02, -6.27251196972490e-06, 1.45061847649872e-02}},
+ {{8.31193642663158e+00, -3.84070508164323e-04, -3.09750630821876e-02},
+ {-3.84070508164323e-04, 3.80433432277336e-07, -1.14321142836636e-06},
+ {-3.09750630821876e-02, -1.14321142836636e-06, 8.35091486289997e-04}},
+ {{9.67283151270894e-01, 5.82465812445039e-05, -3.18350798617053e-03},
+ {5.82465812445039e-05, 2.23762672000318e-07, -7.74196587408623e-07},
+ {-3.18350798617053e-03, -7.74196587408623e-07, 3.85120938338325e-04}},
+ {{8.28066236985388e+00, 5.87634508319763e-05, 6.99303090891743e-03},
+ {5.87634508319763e-05, 2.93746018618058e-07, 3.40843332882272e-07},
+ {6.99303090891743e-03, 3.40843332882272e-07, 1.99379171190344e-04}},
+ {{6.07488998675646e+00, -1.11494526618473e-02, 5.10013111123381e-03},
+ {-1.11494526618473e-02, 6.99238879921751e-04, 5.36718550370870e-05},
+ {5.10013111123381e-03, 5.36718550370870e-05, 5.26909853276753e-04}},
+ {{6.90492021419175e+00, 4.20639355257863e-04, -2.38612752336481e-03},
+ {4.20639355257863e-04, 3.31246767338153e-06, -2.42052288150859e-08},
+ {-2.38612752336481e-03, -2.42052288150859e-08, 4.46608368363412e-04}},
+ {{1.31069150869715e+01, -1.73718583865670e-04, -1.97591814508578e-02},
+ {-1.73718583865670e-04, 2.80451716300124e-07, 9.96570755379865e-07},
+ {-1.97591814508578e-02, 9.96570755379865e-07, 2.41361900868847e-03}},
+ {{4.69566344239814e+00, -2.61077567563690e-04, 5.26359000761433e-03},
+ {-2.61077567563690e-04, 1.82420859823767e-06, -7.83645887541601e-07},
+ {5.26359000761433e-03, -7.83645887541601e-07, 1.33586288288802e-02}}};
+
+static const double kNoiseGmmMean[kNoiseGmmNumMixtures][kNoiseGmmDim] = {
+ {-2.01386094766163e+00, 1.69702162045397e+02, 7.41715804872181e+01},
+ {-1.94684591777290e+00, 1.42398396732668e+02, 1.64186321157831e+02},
+ {-2.29319297562437e+00, 3.86415425589868e+02, 2.13452215267125e+02},
+ {-3.25487177070268e+00, 1.08668712553616e+03, 2.33119949467419e+02},
+ {-2.13159632447467e+00, 4.83821702557717e+03, 6.86786166673740e+01},
+ {-2.26171410780526e+00, 4.79420193982422e+03, 1.53222513286450e+02},
+ {-3.32166740703185e+00, 4.35161135834358e+03, 1.33206448431316e+02},
+ {-2.19290322814343e+00, 3.98325506609408e+03, 2.13249167359934e+02},
+ {-2.02898459255404e+00, 7.37039893155007e+03, 1.12518527491926e+02},
+ {-2.26150236399500e+00, 1.54896745196145e+03, 1.49717357868579e+02},
+ {-2.00417668301790e+00, 3.82434760310304e+03, 1.07438913004312e+02},
+ {-2.30193040814533e+00, 1.43953696546439e+03, 7.04085275122649e+01}};
+
+static const double kNoiseGmmWeights[kNoiseGmmNumMixtures] = {
+ -1.09422832086193e+01,
+ -1.10847897513425e+01,
+ -1.36767587732187e+01,
+ -1.79789356118641e+01,
+ -1.42830169160894e+01,
+ -1.56500228061379e+01,
+ -1.83124990950113e+01,
+ -1.69979436177477e+01,
+ -1.12329424387828e+01,
+ -1.41311785780639e+01,
+ -1.47171861448585e+01,
+ -1.35963362781839e+01};
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_NOISE_GMM_TABLES_H_
diff --git a/webrtc/modules/audio_processing/vad/pitch_based_vad.cc b/webrtc/modules/audio_processing/vad/pitch_based_vad.cc
new file mode 100644
index 0000000000..39ec37e6ec
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/pitch_based_vad.cc
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "webrtc/modules/audio_processing/vad/vad_circular_buffer.h"
+#include "webrtc/modules/audio_processing/vad/common.h"
+#include "webrtc/modules/audio_processing/vad/noise_gmm_tables.h"
+#include "webrtc/modules/audio_processing/vad/voice_gmm_tables.h"
+#include "webrtc/modules/interface/module_common_types.h"
+
+namespace webrtc {
+
+static_assert(kNoiseGmmDim == kVoiceGmmDim,
+ "noise and voice gmm dimension not equal");
+
+// These values should match MATLAB counterparts for unit-tests to pass.
+static const int kPosteriorHistorySize = 500; // 5 sec of 10 ms frames.
+static const double kInitialPriorProbability = 0.3;
+static const int kTransientWidthThreshold = 7;
+static const double kLowProbabilityThreshold = 0.2;
+
+static double LimitProbability(double p) {
+ const double kLimHigh = 0.99;
+ const double kLimLow = 0.01;
+
+ if (p > kLimHigh)
+ p = kLimHigh;
+ else if (p < kLimLow)
+ p = kLimLow;
+ return p;
+}
+
+PitchBasedVad::PitchBasedVad()
+ : p_prior_(kInitialPriorProbability),
+ circular_buffer_(VadCircularBuffer::Create(kPosteriorHistorySize)) {
+ // Setup noise GMM.
+ noise_gmm_.dimension = kNoiseGmmDim;
+ noise_gmm_.num_mixtures = kNoiseGmmNumMixtures;
+ noise_gmm_.weight = kNoiseGmmWeights;
+ noise_gmm_.mean = &kNoiseGmmMean[0][0];
+ noise_gmm_.covar_inverse = &kNoiseGmmCovarInverse[0][0][0];
+
+ // Setup voice GMM.
+ voice_gmm_.dimension = kVoiceGmmDim;
+ voice_gmm_.num_mixtures = kVoiceGmmNumMixtures;
+ voice_gmm_.weight = kVoiceGmmWeights;
+ voice_gmm_.mean = &kVoiceGmmMean[0][0];
+ voice_gmm_.covar_inverse = &kVoiceGmmCovarInverse[0][0][0];
+}
+
+PitchBasedVad::~PitchBasedVad() {
+}
+
+int PitchBasedVad::VoicingProbability(const AudioFeatures& features,
+ double* p_combined) {
+ double p;
+ double gmm_features[3];
+ double pdf_features_given_voice;
+ double pdf_features_given_noise;
+ // These limits are the same in matlab implementation 'VoicingProbGMM().'
+ const double kLimLowLogPitchGain = -2.0;
+ const double kLimHighLogPitchGain = -0.9;
+ const double kLimLowSpectralPeak = 200;
+ const double kLimHighSpectralPeak = 2000;
+ const double kEps = 1e-12;
+ for (size_t n = 0; n < features.num_frames; n++) {
+ gmm_features[0] = features.log_pitch_gain[n];
+ gmm_features[1] = features.spectral_peak[n];
+ gmm_features[2] = features.pitch_lag_hz[n];
+
+ pdf_features_given_voice = EvaluateGmm(gmm_features, voice_gmm_);
+ pdf_features_given_noise = EvaluateGmm(gmm_features, noise_gmm_);
+
+ if (features.spectral_peak[n] < kLimLowSpectralPeak ||
+ features.spectral_peak[n] > kLimHighSpectralPeak ||
+ features.log_pitch_gain[n] < kLimLowLogPitchGain) {
+ pdf_features_given_voice = kEps * pdf_features_given_noise;
+ } else if (features.log_pitch_gain[n] > kLimHighLogPitchGain) {
+ pdf_features_given_noise = kEps * pdf_features_given_voice;
+ }
+
+ p = p_prior_ * pdf_features_given_voice /
+ (pdf_features_given_voice * p_prior_ +
+ pdf_features_given_noise * (1 - p_prior_));
+
+ p = LimitProbability(p);
+
+ // Combine pitch-based probability with standalone probability, before
+ // updating prior probabilities.
+ double prod_active = p * p_combined[n];
+ double prod_inactive = (1 - p) * (1 - p_combined[n]);
+ p_combined[n] = prod_active / (prod_active + prod_inactive);
+
+ if (UpdatePrior(p_combined[n]) < 0)
+ return -1;
+ // Limit prior probability. With a zero prior probability the posterior
+ // probability is always zero.
+ p_prior_ = LimitProbability(p_prior_);
+ }
+ return 0;
+}
+
+int PitchBasedVad::UpdatePrior(double p) {
+ circular_buffer_->Insert(p);
+ if (circular_buffer_->RemoveTransient(kTransientWidthThreshold,
+ kLowProbabilityThreshold) < 0)
+ return -1;
+ p_prior_ = circular_buffer_->Mean();
+ return 0;
+}
+
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/vad/pitch_based_vad.h b/webrtc/modules/audio_processing/vad/pitch_based_vad.h
new file mode 100644
index 0000000000..c502184aea
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/pitch_based_vad.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_BASED_VAD_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_BASED_VAD_H_
+
+#include "webrtc/base/scoped_ptr.h"
+#include "webrtc/modules/audio_processing/vad/common.h"
+#include "webrtc/modules/audio_processing/vad/gmm.h"
+#include "webrtc/typedefs.h"
+
+namespace webrtc {
+
+class AudioFrame;
+class VadCircularBuffer;
+
+// Computes the probability of the input audio frame to be active given
+// the corresponding pitch-gain and lag of the frame.
+class PitchBasedVad {
+ public:
+ PitchBasedVad();
+ ~PitchBasedVad();
+
+ // Compute pitch-based voicing probability, given the features.
+ // features: a structure containing features required for computing voicing
+ // probabilities.
+ //
+ // p_combined: an array which contains the combined activity probabilities
+ // computed prior to the call of this function. The method,
+ // then, computes the voicing probabilities and combine them
+ // with the given values. The result are returned in |p|.
+ int VoicingProbability(const AudioFeatures& features, double* p_combined);
+
+ private:
+ int UpdatePrior(double p);
+
+ // TODO(turajs): maybe defining this at a higher level (maybe enum) so that
+ // all the code recognize it as "no-error."
+ static const int kNoError = 0;
+
+ GmmParameters noise_gmm_;
+ GmmParameters voice_gmm_;
+
+ double p_prior_;
+
+ rtc::scoped_ptr<VadCircularBuffer> circular_buffer_;
+};
+
+} // namespace webrtc
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_BASED_VAD_H_
diff --git a/webrtc/modules/audio_processing/vad/pitch_based_vad_unittest.cc b/webrtc/modules/audio_processing/vad/pitch_based_vad_unittest.cc
new file mode 100644
index 0000000000..04ddcab5cb
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/pitch_based_vad_unittest.cc
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#include <string>
+
+#include "testing/gtest/include/gtest/gtest.h"
+#include "webrtc/test/testsupport/fileutils.h"
+
+namespace webrtc {
+
+TEST(PitchBasedVadTest, VoicingProbabilityTest) {
+ std::string spectral_peak_file_name =
+ test::ResourcePath("audio_processing/agc/agc_spectral_peak", "dat");
+ FILE* spectral_peak_file = fopen(spectral_peak_file_name.c_str(), "rb");
+ ASSERT_TRUE(spectral_peak_file != NULL);
+
+ std::string pitch_gain_file_name =
+ test::ResourcePath("audio_processing/agc/agc_pitch_gain", "dat");
+ FILE* pitch_gain_file = fopen(pitch_gain_file_name.c_str(), "rb");
+ ASSERT_TRUE(pitch_gain_file != NULL);
+
+ std::string pitch_lag_file_name =
+ test::ResourcePath("audio_processing/agc/agc_pitch_lag", "dat");
+ FILE* pitch_lag_file = fopen(pitch_lag_file_name.c_str(), "rb");
+ ASSERT_TRUE(pitch_lag_file != NULL);
+
+ std::string voicing_prob_file_name =
+ test::ResourcePath("audio_processing/agc/agc_voicing_prob", "dat");
+ FILE* voicing_prob_file = fopen(voicing_prob_file_name.c_str(), "rb");
+ ASSERT_TRUE(voicing_prob_file != NULL);
+
+ PitchBasedVad vad_;
+
+ double reference_activity_probability;
+
+ AudioFeatures audio_features;
+ memset(&audio_features, 0, sizeof(audio_features));
+ audio_features.num_frames = 1;
+ while (fread(audio_features.spectral_peak,
+ sizeof(audio_features.spectral_peak[0]), 1,
+ spectral_peak_file) == 1u) {
+ double p;
+ ASSERT_EQ(1u, fread(audio_features.log_pitch_gain,
+ sizeof(audio_features.log_pitch_gain[0]), 1,
+ pitch_gain_file));
+ ASSERT_EQ(1u,
+ fread(audio_features.pitch_lag_hz,
+ sizeof(audio_features.pitch_lag_hz[0]), 1, pitch_lag_file));
+ ASSERT_EQ(1u, fread(&reference_activity_probability,
+ sizeof(reference_activity_probability), 1,
+ voicing_prob_file));
+
+ p = 0.5; // Initialize to the neutral value for combining probabilities.
+ EXPECT_EQ(0, vad_.VoicingProbability(audio_features, &p));
+ EXPECT_NEAR(p, reference_activity_probability, 0.01);
+ }
+
+ fclose(spectral_peak_file);
+ fclose(pitch_gain_file);
+ fclose(pitch_lag_file);
+}
+
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/vad/pitch_internal.cc b/webrtc/modules/audio_processing/vad/pitch_internal.cc
new file mode 100644
index 0000000000..309b45acf5
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/pitch_internal.cc
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/pitch_internal.h"
+
+#include <cmath>
+
+// A 4-to-3 linear interpolation.
+// The interpolation constants are derived as following:
+// Input pitch parameters are updated every 7.5 ms. Within a 30-ms interval
+// we are interested in pitch parameters of 0-5 ms, 10-15ms and 20-25ms. This is
+// like interpolating 4-to-6 and keep the odd samples.
+// The reason behind this is that LPC coefficients are computed for the first
+// half of each 10ms interval.
+static void PitchInterpolation(double old_val, const double* in, double* out) {
+ out[0] = 1. / 6. * old_val + 5. / 6. * in[0];
+ out[1] = 5. / 6. * in[1] + 1. / 6. * in[2];
+ out[2] = 0.5 * in[2] + 0.5 * in[3];
+}
+
+void GetSubframesPitchParameters(int sampling_rate_hz,
+ double* gains,
+ double* lags,
+ int num_in_frames,
+ int num_out_frames,
+ double* log_old_gain,
+ double* old_lag,
+ double* log_pitch_gain,
+ double* pitch_lag_hz) {
+ // Gain interpolation is in log-domain, also returned in log-domain.
+ for (int n = 0; n < num_in_frames; n++)
+ gains[n] = log(gains[n] + 1e-12);
+
+ // Interpolate lags and gains.
+ PitchInterpolation(*log_old_gain, gains, log_pitch_gain);
+ *log_old_gain = gains[num_in_frames - 1];
+ PitchInterpolation(*old_lag, lags, pitch_lag_hz);
+ *old_lag = lags[num_in_frames - 1];
+
+ // Convert pitch-lags to Hertz.
+ for (int n = 0; n < num_out_frames; n++) {
+ pitch_lag_hz[n] = (sampling_rate_hz) / (pitch_lag_hz[n]);
+ }
+}
diff --git a/webrtc/modules/audio_processing/vad/pitch_internal.h b/webrtc/modules/audio_processing/vad/pitch_internal.h
new file mode 100644
index 0000000000..b25b1a82a2
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/pitch_internal.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_INTERNAL_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_INTERNAL_H_
+
+// TODO(turajs): Write a description of this function. Also be consistent with
+// usage of |sampling_rate_hz| vs |kSamplingFreqHz|.
+void GetSubframesPitchParameters(int sampling_rate_hz,
+ double* gains,
+ double* lags,
+ int num_in_frames,
+ int num_out_frames,
+ double* log_old_gain,
+ double* old_lag,
+ double* log_pitch_gain,
+ double* pitch_lag_hz);
+
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_INTERNAL_H_
diff --git a/webrtc/modules/audio_processing/vad/pitch_internal_unittest.cc b/webrtc/modules/audio_processing/vad/pitch_internal_unittest.cc
new file mode 100644
index 0000000000..8b5959d03e
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/pitch_internal_unittest.cc
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/pitch_internal.h"
+
+#include <math.h>
+
+#include "testing/gtest/include/gtest/gtest.h"
+
+TEST(PitchInternalTest, test) {
+ const int kSamplingRateHz = 8000;
+ const int kNumInputParameters = 4;
+ const int kNumOutputParameters = 3;
+ // Inputs
+ double log_old_gain = log(0.5);
+ double gains[] = {0.6, 0.2, 0.5, 0.4};
+
+ double old_lag = 70;
+ double lags[] = {90, 111, 122, 50};
+
+ // Expected outputs
+ double expected_log_pitch_gain[] = {
+ -0.541212549898316, -1.45672279045507, -0.80471895621705};
+ double expected_log_old_gain = log(gains[kNumInputParameters - 1]);
+
+ double expected_pitch_lag_hz[] = {
+ 92.3076923076923, 70.9010339734121, 93.0232558139535};
+ double expected_old_lag = lags[kNumInputParameters - 1];
+
+ double log_pitch_gain[kNumOutputParameters];
+ double pitch_lag_hz[kNumInputParameters];
+
+ GetSubframesPitchParameters(kSamplingRateHz, gains, lags, kNumInputParameters,
+ kNumOutputParameters, &log_old_gain, &old_lag,
+ log_pitch_gain, pitch_lag_hz);
+
+ for (int n = 0; n < 3; n++) {
+ EXPECT_NEAR(pitch_lag_hz[n], expected_pitch_lag_hz[n], 1e-6);
+ EXPECT_NEAR(log_pitch_gain[n], expected_log_pitch_gain[n], 1e-8);
+ }
+ EXPECT_NEAR(old_lag, expected_old_lag, 1e-6);
+ EXPECT_NEAR(log_old_gain, expected_log_old_gain, 1e-8);
+}
diff --git a/webrtc/modules/audio_processing/vad/pole_zero_filter.cc b/webrtc/modules/audio_processing/vad/pole_zero_filter.cc
new file mode 100644
index 0000000000..9769515c57
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/pole_zero_filter.cc
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+
+namespace webrtc {
+
+PoleZeroFilter* PoleZeroFilter::Create(const float* numerator_coefficients,
+ size_t order_numerator,
+ const float* denominator_coefficients,
+ size_t order_denominator) {
+ if (order_numerator > kMaxFilterOrder ||
+ order_denominator > kMaxFilterOrder || denominator_coefficients[0] == 0 ||
+ numerator_coefficients == NULL || denominator_coefficients == NULL)
+ return NULL;
+ return new PoleZeroFilter(numerator_coefficients, order_numerator,
+ denominator_coefficients, order_denominator);
+}
+
+PoleZeroFilter::PoleZeroFilter(const float* numerator_coefficients,
+ size_t order_numerator,
+ const float* denominator_coefficients,
+ size_t order_denominator)
+ : past_input_(),
+ past_output_(),
+ numerator_coefficients_(),
+ denominator_coefficients_(),
+ order_numerator_(order_numerator),
+ order_denominator_(order_denominator),
+ highest_order_(std::max(order_denominator, order_numerator)) {
+ memcpy(numerator_coefficients_, numerator_coefficients,
+ sizeof(numerator_coefficients_[0]) * (order_numerator_ + 1));
+ memcpy(denominator_coefficients_, denominator_coefficients,
+ sizeof(denominator_coefficients_[0]) * (order_denominator_ + 1));
+
+ if (denominator_coefficients_[0] != 1) {
+ for (size_t n = 0; n <= order_numerator_; n++)
+ numerator_coefficients_[n] /= denominator_coefficients_[0];
+ for (size_t n = 0; n <= order_denominator_; n++)
+ denominator_coefficients_[n] /= denominator_coefficients_[0];
+ }
+}
+
+template <typename T>
+static float FilterArPast(const T* past, size_t order,
+ const float* coefficients) {
+ float sum = 0.0f;
+ size_t past_index = order - 1;
+ for (size_t k = 1; k <= order; k++, past_index--)
+ sum += coefficients[k] * past[past_index];
+ return sum;
+}
+
+int PoleZeroFilter::Filter(const int16_t* in,
+ size_t num_input_samples,
+ float* output) {
+ if (in == NULL || output == NULL)
+ return -1;
+ // This is the typical case, just a memcpy.
+ const size_t k = std::min(num_input_samples, highest_order_);
+ size_t n;
+ for (n = 0; n < k; n++) {
+ output[n] = in[n] * numerator_coefficients_[0];
+ output[n] += FilterArPast(&past_input_[n], order_numerator_,
+ numerator_coefficients_);
+ output[n] -= FilterArPast(&past_output_[n], order_denominator_,
+ denominator_coefficients_);
+
+ past_input_[n + order_numerator_] = in[n];
+ past_output_[n + order_denominator_] = output[n];
+ }
+ if (highest_order_ < num_input_samples) {
+ for (size_t m = 0; n < num_input_samples; n++, m++) {
+ output[n] = in[n] * numerator_coefficients_[0];
+ output[n] +=
+ FilterArPast(&in[m], order_numerator_, numerator_coefficients_);
+ output[n] -= FilterArPast(&output[m], order_denominator_,
+ denominator_coefficients_);
+ }
+ // Record into the past signal.
+ memcpy(past_input_, &in[num_input_samples - order_numerator_],
+ sizeof(in[0]) * order_numerator_);
+ memcpy(past_output_, &output[num_input_samples - order_denominator_],
+ sizeof(output[0]) * order_denominator_);
+ } else {
+ // Odd case that the length of the input is shorter that filter order.
+ memmove(past_input_, &past_input_[num_input_samples],
+ order_numerator_ * sizeof(past_input_[0]));
+ memmove(past_output_, &past_output_[num_input_samples],
+ order_denominator_ * sizeof(past_output_[0]));
+ }
+ return 0;
+}
+
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/vad/pole_zero_filter.h b/webrtc/modules/audio_processing/vad/pole_zero_filter.h
new file mode 100644
index 0000000000..bd13050a5c
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/pole_zero_filter.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_POLE_ZERO_FILTER_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_POLE_ZERO_FILTER_H_
+
+#include <cstddef>
+
+#include "webrtc/typedefs.h"
+
+namespace webrtc {
+
+class PoleZeroFilter {
+ public:
+ ~PoleZeroFilter() {}
+
+ static PoleZeroFilter* Create(const float* numerator_coefficients,
+ size_t order_numerator,
+ const float* denominator_coefficients,
+ size_t order_denominator);
+
+ int Filter(const int16_t* in, size_t num_input_samples, float* output);
+
+ private:
+ PoleZeroFilter(const float* numerator_coefficients,
+ size_t order_numerator,
+ const float* denominator_coefficients,
+ size_t order_denominator);
+
+ static const int kMaxFilterOrder = 24;
+
+ int16_t past_input_[kMaxFilterOrder * 2];
+ float past_output_[kMaxFilterOrder * 2];
+
+ float numerator_coefficients_[kMaxFilterOrder + 1];
+ float denominator_coefficients_[kMaxFilterOrder + 1];
+
+ size_t order_numerator_;
+ size_t order_denominator_;
+ size_t highest_order_;
+};
+
+} // namespace webrtc
+
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_POLE_ZERO_FILTER_H_
diff --git a/webrtc/modules/audio_processing/vad/pole_zero_filter_unittest.cc b/webrtc/modules/audio_processing/vad/pole_zero_filter_unittest.cc
new file mode 100644
index 0000000000..492c3f0c94
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/pole_zero_filter_unittest.cc
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#include "testing/gtest/include/gtest/gtest.h"
+#include "webrtc/base/scoped_ptr.h"
+#include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h"
+#include "webrtc/test/testsupport/fileutils.h"
+
+namespace webrtc {
+
+static const int kInputSamples = 50;
+
+static const int16_t kInput[kInputSamples] = {
+ -2136, -7116, 10715, 2464, 3164, 8139, 11393, 24013, -32117, -5544,
+ -27740, 10181, 14190, -24055, -15912, 17393, 6359, -9950, -13894, 32432,
+ -23944, 3437, -8381, 19768, 3087, -19795, -5920, 13310, 1407, 3876,
+ 4059, 3524, -23130, 19121, -27900, -24840, 4089, 21422, -3625, 3015,
+ -11236, 28856, 13424, 6571, -19761, -6361, 15821, -9469, 29727, 32229};
+
+static const float kReferenceOutput[kInputSamples] = {
+ -2082.230472f, -6878.572941f, 10697.090871f, 2358.373952f,
+ 2973.936512f, 7738.580650f, 10690.803213f, 22687.091576f,
+ -32676.684717f, -5879.621684f, -27359.297432f, 10368.735888f,
+ 13994.584604f, -23676.126249f, -15078.250390f, 17818.253338f,
+ 6577.743123f, -9498.369315f, -13073.651079f, 32460.026588f,
+ -23391.849347f, 3953.805667f, -7667.761363f, 19995.153447f,
+ 3185.575477f, -19207.365160f, -5143.103201f, 13756.317237f,
+ 1779.654794f, 4142.269755f, 4209.475034f, 3572.991789f,
+ -22509.089546f, 19307.878964f, -27060.439759f, -23319.042810f,
+ 5547.685267f, 22312.718676f, -2707.309027f, 3852.358490f,
+ -10135.510093f, 29241.509970f, 13394.397233f, 6340.721417f,
+ -19510.207905f, -5908.442086f, 15882.301634f, -9211.335255f,
+ 29253.056735f, 30874.443046f};
+
+class PoleZeroFilterTest : public ::testing::Test {
+ protected:
+ PoleZeroFilterTest()
+ : my_filter_(PoleZeroFilter::Create(kCoeffNumerator,
+ kFilterOrder,
+ kCoeffDenominator,
+ kFilterOrder)) {}
+
+ ~PoleZeroFilterTest() {}
+
+ void FilterSubframes(int num_subframes);
+
+ private:
+ void TestClean();
+ rtc::scoped_ptr<PoleZeroFilter> my_filter_;
+};
+
+void PoleZeroFilterTest::FilterSubframes(int num_subframes) {
+ float output[kInputSamples];
+ const int num_subframe_samples = kInputSamples / num_subframes;
+ EXPECT_EQ(num_subframe_samples * num_subframes, kInputSamples);
+
+ for (int n = 0; n < num_subframes; n++) {
+ my_filter_->Filter(&kInput[n * num_subframe_samples], num_subframe_samples,
+ &output[n * num_subframe_samples]);
+ }
+ for (int n = 0; n < kInputSamples; n++) {
+ EXPECT_NEAR(output[n], kReferenceOutput[n], 1);
+ }
+}
+
+TEST_F(PoleZeroFilterTest, OneSubframe) {
+ FilterSubframes(1);
+}
+
+TEST_F(PoleZeroFilterTest, TwoSubframes) {
+ FilterSubframes(2);
+}
+
+TEST_F(PoleZeroFilterTest, FiveSubframes) {
+ FilterSubframes(5);
+}
+
+TEST_F(PoleZeroFilterTest, TenSubframes) {
+ FilterSubframes(10);
+}
+
+TEST_F(PoleZeroFilterTest, TwentyFiveSubframes) {
+ FilterSubframes(25);
+}
+
+TEST_F(PoleZeroFilterTest, FiftySubframes) {
+ FilterSubframes(50);
+}
+
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/vad/standalone_vad.cc b/webrtc/modules/audio_processing/vad/standalone_vad.cc
new file mode 100644
index 0000000000..468b8ff3f0
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/standalone_vad.cc
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/standalone_vad.h"
+
+#include <assert.h>
+
+#include "webrtc/modules/interface/module_common_types.h"
+#include "webrtc/modules/utility/interface/audio_frame_operations.h"
+#include "webrtc/typedefs.h"
+
+namespace webrtc {
+
+static const int kDefaultStandaloneVadMode = 3;
+
+StandaloneVad::StandaloneVad(VadInst* vad)
+ : vad_(vad), buffer_(), index_(0), mode_(kDefaultStandaloneVadMode) {
+}
+
+StandaloneVad::~StandaloneVad() {
+ WebRtcVad_Free(vad_);
+}
+
+StandaloneVad* StandaloneVad::Create() {
+ VadInst* vad = WebRtcVad_Create();
+ if (!vad)
+ return nullptr;
+
+ int err = WebRtcVad_Init(vad);
+ err |= WebRtcVad_set_mode(vad, kDefaultStandaloneVadMode);
+ if (err != 0) {
+ WebRtcVad_Free(vad);
+ return nullptr;
+ }
+ return new StandaloneVad(vad);
+}
+
+int StandaloneVad::AddAudio(const int16_t* data, size_t length) {
+ if (length != kLength10Ms)
+ return -1;
+
+ if (index_ + length > kLength10Ms * kMaxNum10msFrames)
+ // Reset the buffer if it's full.
+ // TODO(ajm): Instead, consider just processing every 10 ms frame. Then we
+ // can forgo the buffering.
+ index_ = 0;
+
+ memcpy(&buffer_[index_], data, sizeof(int16_t) * length);
+ index_ += length;
+ return 0;
+}
+
+int StandaloneVad::GetActivity(double* p, size_t length_p) {
+ if (index_ == 0)
+ return -1;
+
+ const size_t num_frames = index_ / kLength10Ms;
+ if (num_frames > length_p)
+ return -1;
+ assert(WebRtcVad_ValidRateAndFrameLength(kSampleRateHz, index_) == 0);
+
+ int activity = WebRtcVad_Process(vad_, kSampleRateHz, buffer_, index_);
+ if (activity < 0)
+ return -1;
+ else if (activity == 0)
+ p[0] = 0.01; // Arbitrary but small and non-zero.
+ else
+ p[0] = 0.5; // 0.5 is neutral values when combinned by other probabilities.
+ for (size_t n = 1; n < num_frames; n++)
+ p[n] = p[0];
+ // Reset the buffer to start from the beginning.
+ index_ = 0;
+ return activity;
+}
+
+int StandaloneVad::set_mode(int mode) {
+ if (mode < 0 || mode > 3)
+ return -1;
+ if (WebRtcVad_set_mode(vad_, mode) != 0)
+ return -1;
+
+ mode_ = mode;
+ return 0;
+}
+
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/vad/standalone_vad.h b/webrtc/modules/audio_processing/vad/standalone_vad.h
new file mode 100644
index 0000000000..6a25424dab
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/standalone_vad.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AGC_STANDALONE_VAD_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AGC_STANDALONE_VAD_H_
+
+#include "webrtc/base/scoped_ptr.h"
+#include "webrtc/modules/audio_processing/vad/common.h"
+#include "webrtc/common_audio/vad/include/webrtc_vad.h"
+#include "webrtc/typedefs.h"
+
+namespace webrtc {
+
+class AudioFrame;
+
+class StandaloneVad {
+ public:
+ static StandaloneVad* Create();
+ ~StandaloneVad();
+
+ // Outputs
+ // p: a buffer where probabilities are written to.
+ // length_p: number of elements of |p|.
+ //
+ // return value:
+ // -1: if no audio is stored or VAD returns error.
+ // 0: in success.
+ // In case of error the content of |activity| is unchanged.
+ //
+ // Note that due to a high false-positive (VAD decision is active while the
+ // processed audio is just background noise) rate, stand-alone VAD is used as
+ // a one-sided indicator. The activity probability is 0.5 if the frame is
+ // classified as active, and the probability is 0.01 if the audio is
+ // classified as passive. In this way, when probabilities are combined, the
+ // effect of the stand-alone VAD is neutral if the input is classified as
+ // active.
+ int GetActivity(double* p, size_t length_p);
+
+ // Expecting 10 ms of 16 kHz audio to be pushed in.
+ int AddAudio(const int16_t* data, size_t length);
+
+ // Set aggressiveness of VAD, 0 is the least aggressive and 3 is the most
+ // aggressive mode. Returns -1 if the input is less than 0 or larger than 3,
+ // otherwise 0 is returned.
+ int set_mode(int mode);
+ // Get the agressiveness of the current VAD.
+ int mode() const { return mode_; }
+
+ private:
+ explicit StandaloneVad(VadInst* vad);
+
+ static const size_t kMaxNum10msFrames = 3;
+
+ // TODO(turajs): Is there a way to use scoped-pointer here?
+ VadInst* vad_;
+ int16_t buffer_[kMaxNum10msFrames * kLength10Ms];
+ size_t index_;
+ int mode_;
+};
+
+} // namespace webrtc
+
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AGC_STANDALONE_VAD_H_
diff --git a/webrtc/modules/audio_processing/vad/standalone_vad_unittest.cc b/webrtc/modules/audio_processing/vad/standalone_vad_unittest.cc
new file mode 100644
index 0000000000..942008e733
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/standalone_vad_unittest.cc
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/standalone_vad.h"
+
+#include <string.h>
+
+#include "testing/gtest/include/gtest/gtest.h"
+#include "webrtc/base/scoped_ptr.h"
+#include "webrtc/modules/interface/module_common_types.h"
+#include "webrtc/test/testsupport/fileutils.h"
+#include "webrtc/test/testsupport/gtest_disable.h"
+
+namespace webrtc {
+
+TEST(StandaloneVadTest, Api) {
+ rtc::scoped_ptr<StandaloneVad> vad(StandaloneVad::Create());
+ int16_t data[kLength10Ms] = {0};
+
+ // Valid frame length (for 32 kHz rate), but not what the VAD is expecting.
+ EXPECT_EQ(-1, vad->AddAudio(data, 320));
+
+ const size_t kMaxNumFrames = 3;
+ double p[kMaxNumFrames];
+ for (size_t n = 0; n < kMaxNumFrames; n++)
+ EXPECT_EQ(0, vad->AddAudio(data, kLength10Ms));
+
+ // Pretend |p| is shorter that it should be.
+ EXPECT_EQ(-1, vad->GetActivity(p, kMaxNumFrames - 1));
+
+ EXPECT_EQ(0, vad->GetActivity(p, kMaxNumFrames));
+
+ // Ask for activity when buffer is empty.
+ EXPECT_EQ(-1, vad->GetActivity(p, kMaxNumFrames));
+
+ // Should reset and result in one buffer.
+ for (size_t n = 0; n < kMaxNumFrames + 1; n++)
+ EXPECT_EQ(0, vad->AddAudio(data, kLength10Ms));
+ EXPECT_EQ(0, vad->GetActivity(p, 1));
+
+ // Wrong modes
+ EXPECT_EQ(-1, vad->set_mode(-1));
+ EXPECT_EQ(-1, vad->set_mode(4));
+
+ // Valid mode.
+ const int kMode = 2;
+ EXPECT_EQ(0, vad->set_mode(kMode));
+ EXPECT_EQ(kMode, vad->mode());
+}
+
+TEST(StandaloneVadTest, DISABLED_ON_IOS(ActivityDetection)) {
+ rtc::scoped_ptr<StandaloneVad> vad(StandaloneVad::Create());
+ const size_t kDataLength = kLength10Ms;
+ int16_t data[kDataLength] = {0};
+
+ FILE* pcm_file =
+ fopen(test::ResourcePath("audio_processing/agc/agc_audio", "pcm").c_str(),
+ "rb");
+ ASSERT_TRUE(pcm_file != NULL);
+
+ FILE* reference_file = fopen(
+ test::ResourcePath("audio_processing/agc/agc_vad", "dat").c_str(), "rb");
+ ASSERT_TRUE(reference_file != NULL);
+
+ // Reference activities are prepared with 0 aggressiveness.
+ ASSERT_EQ(0, vad->set_mode(0));
+
+ // Stand-alone VAD can operate on 1, 2 or 3 frames of length 10 ms. The
+ // reference file is created for 30 ms frame.
+ const int kNumVadFramesToProcess = 3;
+ int num_frames = 0;
+ while (fread(data, sizeof(int16_t), kDataLength, pcm_file) == kDataLength) {
+ vad->AddAudio(data, kDataLength);
+ num_frames++;
+ if (num_frames == kNumVadFramesToProcess) {
+ num_frames = 0;
+ int referece_activity;
+ double p[kNumVadFramesToProcess];
+ EXPECT_EQ(1u, fread(&referece_activity, sizeof(referece_activity), 1,
+ reference_file));
+ int activity = vad->GetActivity(p, kNumVadFramesToProcess);
+ EXPECT_EQ(referece_activity, activity);
+ if (activity != 0) {
+ // When active, probabilities are set to 0.5.
+ for (int n = 0; n < kNumVadFramesToProcess; n++)
+ EXPECT_EQ(0.5, p[n]);
+ } else {
+ // When inactive, probabilities are set to 0.01.
+ for (int n = 0; n < kNumVadFramesToProcess; n++)
+ EXPECT_EQ(0.01, p[n]);
+ }
+ }
+ }
+ fclose(reference_file);
+ fclose(pcm_file);
+}
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/vad/vad_audio_proc.cc b/webrtc/modules/audio_processing/vad/vad_audio_proc.cc
new file mode 100644
index 0000000000..8535d1ff57
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/vad_audio_proc.cc
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#include "webrtc/common_audio/fft4g.h"
+#include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h"
+#include "webrtc/modules/audio_processing/vad/pitch_internal.h"
+#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h"
+extern "C" {
+#include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h"
+#include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h"
+#include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
+#include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h"
+}
+#include "webrtc/modules/interface/module_common_types.h"
+
+namespace webrtc {
+
+// The following structures are declared anonymous in iSAC's structs.h. To
+// forward declare them, we use this derived class trick.
+struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
+struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};
+
+static const float kFrequencyResolution =
+ kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);
+static const int kSilenceRms = 5;
+
+// TODO(turajs): Make a Create or Init for VadAudioProc.
+VadAudioProc::VadAudioProc()
+ : audio_buffer_(),
+ num_buffer_samples_(kNumPastSignalSamples),
+ log_old_gain_(-2),
+ old_lag_(50), // Arbitrary but valid as pitch-lag (in samples).
+ pitch_analysis_handle_(new PitchAnalysisStruct),
+ pre_filter_handle_(new PreFiltBankstr),
+ high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,
+ kFilterOrder,
+ kCoeffDenominator,
+ kFilterOrder)) {
+ static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
+ sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
+ "lpc analysis window incorrect size");
+ static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
+ "correlation weight incorrect size");
+
+ // TODO(turajs): Are we doing too much in the constructor?
+ float data[kDftSize];
+ // Make FFT to initialize.
+ ip_[0] = 0;
+ WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
+ // TODO(turajs): Need to initialize high-pass filter.
+
+ // Initialize iSAC components.
+ WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
+ WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
+}
+
+VadAudioProc::~VadAudioProc() {
+}
+
+void VadAudioProc::ResetBuffer() {
+ memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
+ sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
+ num_buffer_samples_ = kNumPastSignalSamples;
+}
+
+int VadAudioProc::ExtractFeatures(const int16_t* frame,
+ size_t length,
+ AudioFeatures* features) {
+ features->num_frames = 0;
+ if (length != kNumSubframeSamples) {
+ return -1;
+ }
+
+ // High-pass filter to remove the DC component and very low frequency content.
+ // We have experienced that this high-pass filtering improves voice/non-voiced
+ // classification.
+ if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
+ &audio_buffer_[num_buffer_samples_]) != 0) {
+ return -1;
+ }
+
+ num_buffer_samples_ += kNumSubframeSamples;
+ if (num_buffer_samples_ < kBufferLength) {
+ return 0;
+ }
+ assert(num_buffer_samples_ == kBufferLength);
+ features->num_frames = kNum10msSubframes;
+ features->silence = false;
+
+ Rms(features->rms, kMaxNumFrames);
+ for (size_t i = 0; i < kNum10msSubframes; ++i) {
+ if (features->rms[i] < kSilenceRms) {
+ // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
+ // Bail out here instead.
+ features->silence = true;
+ ResetBuffer();
+ return 0;
+ }
+ }
+
+ PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
+ kMaxNumFrames);
+ FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
+ ResetBuffer();
+ return 0;
+}
+
+// Computes |kLpcOrder + 1| correlation coefficients.
+void VadAudioProc::SubframeCorrelation(double* corr,
+ size_t length_corr,
+ size_t subframe_index) {
+ assert(length_corr >= kLpcOrder + 1);
+ double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];
+ size_t buffer_index = subframe_index * kNumSubframeSamples;
+
+ for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)
+ windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];
+
+ WebRtcIsac_AutoCorr(corr, windowed_audio,
+ kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);
+}
+
+// Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input.
+// The analysis window is 15 ms long and it is centered on the first half of
+// each 10ms sub-frame. This is equivalent to computing LPC coefficients for the
+// first half of each 10 ms subframe.
+void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) {
+ assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1));
+ double corr[kLpcOrder + 1];
+ double reflec_coeff[kLpcOrder];
+ for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes;
+ i++, offset_lpc += kLpcOrder + 1) {
+ SubframeCorrelation(corr, kLpcOrder + 1, i);
+ corr[0] *= 1.0001;
+ // This makes Lev-Durb a bit more stable.
+ for (size_t k = 0; k < kLpcOrder + 1; k++) {
+ corr[k] *= kCorrWeight[k];
+ }
+ WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
+ }
+}
+
+// Fit a second order curve to these 3 points and find the location of the
+// extremum. The points are inverted before curve fitting.
+static float QuadraticInterpolation(float prev_val,
+ float curr_val,
+ float next_val) {
+ // Doing the interpolation in |1 / A(z)|^2.
+ float fractional_index = 0;
+ next_val = 1.0f / next_val;
+ prev_val = 1.0f / prev_val;
+ curr_val = 1.0f / curr_val;
+
+ fractional_index =
+ -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);
+ assert(fabs(fractional_index) < 1);
+ return fractional_index;
+}
+
+// 1 / A(z), where A(z) is defined by |lpc| is a model of the spectral envelope
+// of the input signal. The local maximum of the spectral envelope corresponds
+// with the local minimum of A(z). It saves complexity, as we save one
+// inversion. Furthermore, we find the first local maximum of magnitude squared,
+// to save on one square root.
+void VadAudioProc::FindFirstSpectralPeaks(double* f_peak,
+ size_t length_f_peak) {
+ assert(length_f_peak >= kNum10msSubframes);
+ double lpc[kNum10msSubframes * (kLpcOrder + 1)];
+ // For all sub-frames.
+ GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));
+
+ const size_t kNumDftCoefficients = kDftSize / 2 + 1;
+ float data[kDftSize];
+
+ for (size_t i = 0; i < kNum10msSubframes; i++) {
+ // Convert to float with zero pad.
+ memset(data, 0, sizeof(data));
+ for (size_t n = 0; n < kLpcOrder + 1; n++) {
+ data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
+ }
+ // Transform to frequency domain.
+ WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
+
+ size_t index_peak = 0;
+ float prev_magn_sqr = data[0] * data[0];
+ float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
+ float next_magn_sqr;
+ bool found_peak = false;
+ for (size_t n = 2; n < kNumDftCoefficients - 1; n++) {
+ next_magn_sqr =
+ data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];
+ if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
+ found_peak = true;
+ index_peak = n - 1;
+ break;
+ }
+ prev_magn_sqr = curr_magn_sqr;
+ curr_magn_sqr = next_magn_sqr;
+ }
+ float fractional_index = 0;
+ if (!found_peak) {
+ // Checking if |kNumDftCoefficients - 1| is the local minimum.
+ next_magn_sqr = data[1] * data[1];
+ if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
+ index_peak = kNumDftCoefficients - 1;
+ }
+ } else {
+ // A peak is found, do a simple quadratic interpolation to get a more
+ // accurate estimate of the peak location.
+ fractional_index =
+ QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);
+ }
+ f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
+ }
+}
+
+// Using iSAC functions to estimate pitch gains & lags.
+void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
+ double* pitch_lags_hz,
+ size_t length) {
+ // TODO(turajs): This can be "imported" from iSAC & and the next two
+ // constants.
+ assert(length >= kNum10msSubframes);
+ const int kNumPitchSubframes = 4;
+ double gains[kNumPitchSubframes];
+ double lags[kNumPitchSubframes];
+
+ const int kNumSubbandFrameSamples = 240;
+ const int kNumLookaheadSamples = 24;
+
+ float lower[kNumSubbandFrameSamples];
+ float upper[kNumSubbandFrameSamples];
+ double lower_lookahead[kNumSubbandFrameSamples];
+ double upper_lookahead[kNumSubbandFrameSamples];
+ double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
+ kNumLookaheadSamples];
+
+ // Split signal to lower and upper bands
+ WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
+ upper, lower_lookahead, upper_lookahead,
+ pre_filter_handle_.get());
+ WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
+ pitch_analysis_handle_.get(), lags, gains);
+
+ // Lags are computed on lower-band signal with sampling rate half of the
+ // input signal.
+ GetSubframesPitchParameters(
+ kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
+ &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
+}
+
+void VadAudioProc::Rms(double* rms, size_t length_rms) {
+ assert(length_rms >= kNum10msSubframes);
+ size_t offset = kNumPastSignalSamples;
+ for (size_t i = 0; i < kNum10msSubframes; i++) {
+ rms[i] = 0;
+ for (size_t n = 0; n < kNumSubframeSamples; n++, offset++)
+ rms[i] += audio_buffer_[offset] * audio_buffer_[offset];
+ rms[i] = sqrt(rms[i] / kNumSubframeSamples);
+ }
+}
+
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/vad/vad_audio_proc.h b/webrtc/modules/audio_processing/vad/vad_audio_proc.h
new file mode 100644
index 0000000000..85500aed84
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/vad_audio_proc.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_H_
+
+#include "webrtc/base/scoped_ptr.h"
+#include "webrtc/modules/audio_processing/vad/common.h"
+#include "webrtc/typedefs.h"
+
+namespace webrtc {
+
+class AudioFrame;
+class PoleZeroFilter;
+
+class VadAudioProc {
+ public:
+ // Forward declare iSAC structs.
+ struct PitchAnalysisStruct;
+ struct PreFiltBankstr;
+
+ VadAudioProc();
+ ~VadAudioProc();
+
+ int ExtractFeatures(const int16_t* audio_frame,
+ size_t length,
+ AudioFeatures* audio_features);
+
+ static const size_t kDftSize = 512;
+
+ private:
+ void PitchAnalysis(double* pitch_gains, double* pitch_lags_hz, size_t length);
+ void SubframeCorrelation(double* corr,
+ size_t length_corr,
+ size_t subframe_index);
+ void GetLpcPolynomials(double* lpc, size_t length_lpc);
+ void FindFirstSpectralPeaks(double* f_peak, size_t length_f_peak);
+ void Rms(double* rms, size_t length_rms);
+ void ResetBuffer();
+
+ // To compute spectral peak we perform LPC analysis to get spectral envelope.
+ // For every 30 ms we compute 3 spectral peak there for 3 LPC analysis.
+ // LPC is computed over 15 ms of windowed audio. For every 10 ms sub-frame
+ // we need 5 ms of past signal to create the input of LPC analysis.
+ static const size_t kNumPastSignalSamples =
+ static_cast<size_t>(kSampleRateHz / 200);
+
+ // TODO(turajs): maybe defining this at a higher level (maybe enum) so that
+ // all the code recognize it as "no-error."
+ static const int kNoError = 0;
+
+ static const size_t kNum10msSubframes = 3;
+ static const size_t kNumSubframeSamples =
+ static_cast<size_t>(kSampleRateHz / 100);
+ static const size_t kNumSamplesToProcess =
+ kNum10msSubframes *
+ kNumSubframeSamples; // Samples in 30 ms @ given sampling rate.
+ static const size_t kBufferLength =
+ kNumPastSignalSamples + kNumSamplesToProcess;
+ static const size_t kIpLength = kDftSize >> 1;
+ static const size_t kWLength = kDftSize >> 1;
+
+ static const size_t kLpcOrder = 16;
+
+ size_t ip_[kIpLength];
+ float w_fft_[kWLength];
+
+ // A buffer of 5 ms (past audio) + 30 ms (one iSAC frame ).
+ float audio_buffer_[kBufferLength];
+ size_t num_buffer_samples_;
+
+ double log_old_gain_;
+ double old_lag_;
+
+ rtc::scoped_ptr<PitchAnalysisStruct> pitch_analysis_handle_;
+ rtc::scoped_ptr<PreFiltBankstr> pre_filter_handle_;
+ rtc::scoped_ptr<PoleZeroFilter> high_pass_filter_;
+};
+
+} // namespace webrtc
+
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_H_
diff --git a/webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h b/webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h
new file mode 100644
index 0000000000..45586b9be6
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_INTERNAL_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_INTERNAL_H_
+
+namespace webrtc {
+
+// These values should match MATLAB counterparts for unit-tests to pass.
+static const double kCorrWeight[] = {1.000000,
+ 0.985000,
+ 0.970225,
+ 0.955672,
+ 0.941337,
+ 0.927217,
+ 0.913308,
+ 0.899609,
+ 0.886115,
+ 0.872823,
+ 0.859730,
+ 0.846834,
+ 0.834132,
+ 0.821620,
+ 0.809296,
+ 0.797156,
+ 0.785199};
+
+static const double kLpcAnalWin[] = {
+ 0.00000000, 0.01314436, 0.02628645, 0.03942400, 0.05255473, 0.06567639,
+ 0.07878670, 0.09188339, 0.10496421, 0.11802689, 0.13106918, 0.14408883,
+ 0.15708358, 0.17005118, 0.18298941, 0.19589602, 0.20876878, 0.22160547,
+ 0.23440387, 0.24716177, 0.25987696, 0.27254725, 0.28517045, 0.29774438,
+ 0.31026687, 0.32273574, 0.33514885, 0.34750406, 0.35979922, 0.37203222,
+ 0.38420093, 0.39630327, 0.40833713, 0.42030043, 0.43219112, 0.44400713,
+ 0.45574642, 0.46740697, 0.47898676, 0.49048379, 0.50189608, 0.51322164,
+ 0.52445853, 0.53560481, 0.54665854, 0.55761782, 0.56848075, 0.57924546,
+ 0.58991008, 0.60047278, 0.61093173, 0.62128512, 0.63153117, 0.64166810,
+ 0.65169416, 0.66160761, 0.67140676, 0.68108990, 0.69065536, 0.70010148,
+ 0.70942664, 0.71862923, 0.72770765, 0.73666033, 0.74548573, 0.75418233,
+ 0.76274862, 0.77118312, 0.77948437, 0.78765094, 0.79568142, 0.80357442,
+ 0.81132858, 0.81894256, 0.82641504, 0.83374472, 0.84093036, 0.84797069,
+ 0.85486451, 0.86161063, 0.86820787, 0.87465511, 0.88095122, 0.88709512,
+ 0.89308574, 0.89892206, 0.90460306, 0.91012776, 0.91549520, 0.92070447,
+ 0.92575465, 0.93064488, 0.93537432, 0.93994213, 0.94434755, 0.94858979,
+ 0.95266814, 0.95658189, 0.96033035, 0.96391289, 0.96732888, 0.97057773,
+ 0.97365889, 0.97657181, 0.97931600, 0.98189099, 0.98429632, 0.98653158,
+ 0.98859639, 0.99049038, 0.99221324, 0.99376466, 0.99514438, 0.99635215,
+ 0.99738778, 0.99825107, 0.99894188, 0.99946010, 0.99980562, 0.99997840,
+ 0.99997840, 0.99980562, 0.99946010, 0.99894188, 0.99825107, 0.99738778,
+ 0.99635215, 0.99514438, 0.99376466, 0.99221324, 0.99049038, 0.98859639,
+ 0.98653158, 0.98429632, 0.98189099, 0.97931600, 0.97657181, 0.97365889,
+ 0.97057773, 0.96732888, 0.96391289, 0.96033035, 0.95658189, 0.95266814,
+ 0.94858979, 0.94434755, 0.93994213, 0.93537432, 0.93064488, 0.92575465,
+ 0.92070447, 0.91549520, 0.91012776, 0.90460306, 0.89892206, 0.89308574,
+ 0.88709512, 0.88095122, 0.87465511, 0.86820787, 0.86161063, 0.85486451,
+ 0.84797069, 0.84093036, 0.83374472, 0.82641504, 0.81894256, 0.81132858,
+ 0.80357442, 0.79568142, 0.78765094, 0.77948437, 0.77118312, 0.76274862,
+ 0.75418233, 0.74548573, 0.73666033, 0.72770765, 0.71862923, 0.70942664,
+ 0.70010148, 0.69065536, 0.68108990, 0.67140676, 0.66160761, 0.65169416,
+ 0.64166810, 0.63153117, 0.62128512, 0.61093173, 0.60047278, 0.58991008,
+ 0.57924546, 0.56848075, 0.55761782, 0.54665854, 0.53560481, 0.52445853,
+ 0.51322164, 0.50189608, 0.49048379, 0.47898676, 0.46740697, 0.45574642,
+ 0.44400713, 0.43219112, 0.42030043, 0.40833713, 0.39630327, 0.38420093,
+ 0.37203222, 0.35979922, 0.34750406, 0.33514885, 0.32273574, 0.31026687,
+ 0.29774438, 0.28517045, 0.27254725, 0.25987696, 0.24716177, 0.23440387,
+ 0.22160547, 0.20876878, 0.19589602, 0.18298941, 0.17005118, 0.15708358,
+ 0.14408883, 0.13106918, 0.11802689, 0.10496421, 0.09188339, 0.07878670,
+ 0.06567639, 0.05255473, 0.03942400, 0.02628645, 0.01314436, 0.00000000};
+
+static const size_t kFilterOrder = 2;
+static const float kCoeffNumerator[kFilterOrder + 1] = {0.974827f,
+ -1.949650f,
+ 0.974827f};
+static const float kCoeffDenominator[kFilterOrder + 1] = {1.0f,
+ -1.971999f,
+ 0.972457f};
+
+static_assert(kFilterOrder + 1 ==
+ sizeof(kCoeffNumerator) / sizeof(kCoeffNumerator[0]),
+ "numerator coefficients incorrect size");
+static_assert(kFilterOrder + 1 ==
+ sizeof(kCoeffDenominator) / sizeof(kCoeffDenominator[0]),
+ "denominator coefficients incorrect size");
+
+} // namespace webrtc
+
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROCESSING_H_
diff --git a/webrtc/modules/audio_processing/vad/vad_audio_proc_unittest.cc b/webrtc/modules/audio_processing/vad/vad_audio_proc_unittest.cc
new file mode 100644
index 0000000000..f509af476f
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/vad_audio_proc_unittest.cc
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// We don't test the value of pitch gain and lags as they are created by iSAC
+// routines. However, interpolation of pitch-gain and lags is in a separate
+// class and has its own unit-test.
+
+#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#include <string>
+
+#include "testing/gtest/include/gtest/gtest.h"
+#include "webrtc/modules/audio_processing/vad/common.h"
+#include "webrtc/modules/interface/module_common_types.h"
+#include "webrtc/test/testsupport/fileutils.h"
+
+namespace webrtc {
+
+TEST(AudioProcessingTest, DISABLED_ComputingFirstSpectralPeak) {
+ VadAudioProc audioproc;
+
+ std::string peak_file_name =
+ test::ResourcePath("audio_processing/agc/agc_spectral_peak", "dat");
+ FILE* peak_file = fopen(peak_file_name.c_str(), "rb");
+ ASSERT_TRUE(peak_file != NULL);
+
+ std::string pcm_file_name =
+ test::ResourcePath("audio_processing/agc/agc_audio", "pcm");
+ FILE* pcm_file = fopen(pcm_file_name.c_str(), "rb");
+ ASSERT_TRUE(pcm_file != NULL);
+
+ // Read 10 ms audio in each iteration.
+ const size_t kDataLength = kLength10Ms;
+ int16_t data[kDataLength] = {0};
+ AudioFeatures features;
+ double sp[kMaxNumFrames];
+ while (fread(data, sizeof(int16_t), kDataLength, pcm_file) == kDataLength) {
+ audioproc.ExtractFeatures(data, kDataLength, &features);
+ if (features.num_frames > 0) {
+ ASSERT_LT(features.num_frames, kMaxNumFrames);
+ // Read reference values.
+ const size_t num_frames = features.num_frames;
+ ASSERT_EQ(num_frames, fread(sp, sizeof(sp[0]), num_frames, peak_file));
+ for (size_t n = 0; n < features.num_frames; n++)
+ EXPECT_NEAR(features.spectral_peak[n], sp[n], 3);
+ }
+ }
+
+ fclose(peak_file);
+ fclose(pcm_file);
+}
+
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/vad/vad_circular_buffer.cc b/webrtc/modules/audio_processing/vad/vad_circular_buffer.cc
new file mode 100644
index 0000000000..d337893c45
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/vad_circular_buffer.cc
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/vad_circular_buffer.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+namespace webrtc {
+
+VadCircularBuffer::VadCircularBuffer(int buffer_size)
+ : buffer_(new double[buffer_size]),
+ is_full_(false),
+ index_(0),
+ buffer_size_(buffer_size),
+ sum_(0) {
+}
+
+VadCircularBuffer::~VadCircularBuffer() {
+}
+
+void VadCircularBuffer::Reset() {
+ is_full_ = false;
+ index_ = 0;
+ sum_ = 0;
+}
+
+VadCircularBuffer* VadCircularBuffer::Create(int buffer_size) {
+ if (buffer_size <= 0)
+ return NULL;
+ return new VadCircularBuffer(buffer_size);
+}
+
+double VadCircularBuffer::Oldest() const {
+ if (!is_full_)
+ return buffer_[0];
+ else
+ return buffer_[index_];
+}
+
+double VadCircularBuffer::Mean() {
+ double m;
+ if (is_full_) {
+ m = sum_ / buffer_size_;
+ } else {
+ if (index_ > 0)
+ m = sum_ / index_;
+ else
+ m = 0;
+ }
+ return m;
+}
+
+void VadCircularBuffer::Insert(double value) {
+ if (is_full_) {
+ sum_ -= buffer_[index_];
+ }
+ sum_ += value;
+ buffer_[index_] = value;
+ index_++;
+ if (index_ >= buffer_size_) {
+ is_full_ = true;
+ index_ = 0;
+ }
+}
+int VadCircularBuffer::BufferLevel() {
+ if (is_full_)
+ return buffer_size_;
+ return index_;
+}
+
+int VadCircularBuffer::Get(int index, double* value) const {
+ int err = ConvertToLinearIndex(&index);
+ if (err < 0)
+ return -1;
+ *value = buffer_[index];
+ return 0;
+}
+
+int VadCircularBuffer::Set(int index, double value) {
+ int err = ConvertToLinearIndex(&index);
+ if (err < 0)
+ return -1;
+
+ sum_ -= buffer_[index];
+ buffer_[index] = value;
+ sum_ += value;
+ return 0;
+}
+
+int VadCircularBuffer::ConvertToLinearIndex(int* index) const {
+ if (*index < 0 || *index >= buffer_size_)
+ return -1;
+
+ if (!is_full_ && *index >= index_)
+ return -1;
+
+ *index = index_ - 1 - *index;
+ if (*index < 0)
+ *index += buffer_size_;
+ return 0;
+}
+
+int VadCircularBuffer::RemoveTransient(int width_threshold,
+ double val_threshold) {
+ if (!is_full_ && index_ < width_threshold + 2)
+ return 0;
+
+ int index_1 = 0;
+ int index_2 = width_threshold + 1;
+ double v = 0;
+ if (Get(index_1, &v) < 0)
+ return -1;
+ if (v < val_threshold) {
+ Set(index_1, 0);
+ int index;
+ for (index = index_2; index > index_1; index--) {
+ if (Get(index, &v) < 0)
+ return -1;
+ if (v < val_threshold)
+ break;
+ }
+ for (; index > index_1; index--) {
+ if (Set(index, 0.0) < 0)
+ return -1;
+ }
+ }
+ return 0;
+}
+
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/vad/vad_circular_buffer.h b/webrtc/modules/audio_processing/vad/vad_circular_buffer.h
new file mode 100644
index 0000000000..5238f77257
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/vad_circular_buffer.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_CIRCULAR_BUFFER_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_CIRCULAR_BUFFER_H_
+
+#include "webrtc/base/scoped_ptr.h"
+
+namespace webrtc {
+
+// A circular buffer tailored to the need of this project. It stores last
+// K samples of the input, and keeps track of the mean of the last samples.
+//
+// It is used in class "PitchBasedActivity" to keep track of posterior
+// probabilities in the past few seconds. The posterior probabilities are used
+// to recursively update prior probabilities.
+class VadCircularBuffer {
+ public:
+ static VadCircularBuffer* Create(int buffer_size);
+ ~VadCircularBuffer();
+
+ // If buffer is wrapped around.
+ bool is_full() const { return is_full_; }
+ // Get the oldest entry in the buffer.
+ double Oldest() const;
+ // Insert new value into the buffer.
+ void Insert(double value);
+ // Reset buffer, forget the past, start fresh.
+ void Reset();
+
+ // The mean value of the elements in the buffer. The return value is zero if
+ // buffer is empty, i.e. no value is inserted.
+ double Mean();
+ // Remove transients. If the values exceed |val_threshold| for a period
+ // shorter then or equal to |width_threshold|, then that period is considered
+ // transient and set to zero.
+ int RemoveTransient(int width_threshold, double val_threshold);
+
+ private:
+ explicit VadCircularBuffer(int buffer_size);
+ // Get previous values. |index = 0| corresponds to the most recent
+ // insertion. |index = 1| is the one before the most recent insertion, and
+ // so on.
+ int Get(int index, double* value) const;
+ // Set a given position to |value|. |index| is interpreted as above.
+ int Set(int index, double value);
+ // Return the number of valid elements in the buffer.
+ int BufferLevel();
+
+ // Convert an index with the interpretation as get() method to the
+ // corresponding linear index.
+ int ConvertToLinearIndex(int* index) const;
+
+ rtc::scoped_ptr<double[]> buffer_;
+ bool is_full_;
+ int index_;
+ int buffer_size_;
+ double sum_;
+};
+
+} // namespace webrtc
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_CIRCULAR_BUFFER_H_
diff --git a/webrtc/modules/audio_processing/vad/vad_circular_buffer_unittest.cc b/webrtc/modules/audio_processing/vad/vad_circular_buffer_unittest.cc
new file mode 100644
index 0000000000..11945e042c
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/vad_circular_buffer_unittest.cc
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/vad_circular_buffer.h"
+
+#include <stdio.h>
+
+#include "testing/gtest/include/gtest/gtest.h"
+#include "webrtc/base/scoped_ptr.h"
+
+namespace webrtc {
+
+static const int kWidthThreshold = 7;
+static const double kValThreshold = 1.0;
+static const int kLongBuffSize = 100;
+static const int kShortBuffSize = 10;
+
+static void InsertSequentially(int k, VadCircularBuffer* circular_buffer) {
+ double mean_val;
+ for (int n = 1; n <= k; n++) {
+ EXPECT_TRUE(!circular_buffer->is_full());
+ circular_buffer->Insert(n);
+ mean_val = circular_buffer->Mean();
+ EXPECT_EQ((n + 1.0) / 2., mean_val);
+ }
+}
+
+static void Insert(double value,
+ int num_insertion,
+ VadCircularBuffer* circular_buffer) {
+ for (int n = 0; n < num_insertion; n++)
+ circular_buffer->Insert(value);
+}
+
+static void InsertZeros(int num_zeros, VadCircularBuffer* circular_buffer) {
+ Insert(0.0, num_zeros, circular_buffer);
+}
+
+TEST(VadCircularBufferTest, GeneralTest) {
+ rtc::scoped_ptr<VadCircularBuffer> circular_buffer(
+ VadCircularBuffer::Create(kShortBuffSize));
+ double mean_val;
+
+ // Mean should return zero if nothing is inserted.
+ mean_val = circular_buffer->Mean();
+ EXPECT_DOUBLE_EQ(0.0, mean_val);
+ InsertSequentially(kShortBuffSize, circular_buffer.get());
+
+ // Should be full.
+ EXPECT_TRUE(circular_buffer->is_full());
+ // Correct update after being full.
+ for (int n = 1; n < kShortBuffSize; n++) {
+ circular_buffer->Insert(n);
+ mean_val = circular_buffer->Mean();
+ EXPECT_DOUBLE_EQ((kShortBuffSize + 1.) / 2., mean_val);
+ EXPECT_TRUE(circular_buffer->is_full());
+ }
+
+ // Check reset. This should be like starting fresh.
+ circular_buffer->Reset();
+ mean_val = circular_buffer->Mean();
+ EXPECT_DOUBLE_EQ(0, mean_val);
+ InsertSequentially(kShortBuffSize, circular_buffer.get());
+ EXPECT_TRUE(circular_buffer->is_full());
+}
+
+TEST(VadCircularBufferTest, TransientsRemoval) {
+ rtc::scoped_ptr<VadCircularBuffer> circular_buffer(
+ VadCircularBuffer::Create(kLongBuffSize));
+ // Let the first transient be in wrap-around.
+ InsertZeros(kLongBuffSize - kWidthThreshold / 2, circular_buffer.get());
+
+ double push_val = kValThreshold;
+ double mean_val;
+ for (int k = kWidthThreshold; k >= 1; k--) {
+ Insert(push_val, k, circular_buffer.get());
+ circular_buffer->Insert(0);
+ mean_val = circular_buffer->Mean();
+ EXPECT_DOUBLE_EQ(k * push_val / kLongBuffSize, mean_val);
+ circular_buffer->RemoveTransient(kWidthThreshold, kValThreshold);
+ mean_val = circular_buffer->Mean();
+ EXPECT_DOUBLE_EQ(0, mean_val);
+ }
+}
+
+TEST(VadCircularBufferTest, TransientDetection) {
+ rtc::scoped_ptr<VadCircularBuffer> circular_buffer(
+ VadCircularBuffer::Create(kLongBuffSize));
+ // Let the first transient be in wrap-around.
+ int num_insertion = kLongBuffSize - kWidthThreshold / 2;
+ InsertZeros(num_insertion, circular_buffer.get());
+
+ double push_val = 2;
+ // This is longer than a transient and shouldn't be removed.
+ int num_non_zero_elements = kWidthThreshold + 1;
+ Insert(push_val, num_non_zero_elements, circular_buffer.get());
+
+ double mean_val = circular_buffer->Mean();
+ EXPECT_DOUBLE_EQ(num_non_zero_elements * push_val / kLongBuffSize, mean_val);
+ circular_buffer->Insert(0);
+ EXPECT_EQ(0,
+ circular_buffer->RemoveTransient(kWidthThreshold, kValThreshold));
+ mean_val = circular_buffer->Mean();
+ EXPECT_DOUBLE_EQ(num_non_zero_elements * push_val / kLongBuffSize, mean_val);
+
+ // A transient right after a non-transient, should be removed and mean is
+ // not changed.
+ num_insertion = 3;
+ Insert(push_val, num_insertion, circular_buffer.get());
+ circular_buffer->Insert(0);
+ EXPECT_EQ(0,
+ circular_buffer->RemoveTransient(kWidthThreshold, kValThreshold));
+ mean_val = circular_buffer->Mean();
+ EXPECT_DOUBLE_EQ(num_non_zero_elements * push_val / kLongBuffSize, mean_val);
+
+ // Last input is larger than threshold, although the sequence is short but
+ // it shouldn't be considered transient.
+ Insert(push_val, num_insertion, circular_buffer.get());
+ num_non_zero_elements += num_insertion;
+ EXPECT_EQ(0,
+ circular_buffer->RemoveTransient(kWidthThreshold, kValThreshold));
+ mean_val = circular_buffer->Mean();
+ EXPECT_DOUBLE_EQ(num_non_zero_elements * push_val / kLongBuffSize, mean_val);
+}
+
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/vad/voice_activity_detector.cc b/webrtc/modules/audio_processing/vad/voice_activity_detector.cc
new file mode 100644
index 0000000000..ef56a3574c
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/voice_activity_detector.cc
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"
+
+#include <algorithm>
+
+#include "webrtc/base/checks.h"
+
+namespace webrtc {
+namespace {
+
+const size_t kMaxLength = 320;
+const int kNumChannels = 1;
+
+const double kDefaultVoiceValue = 1.0;
+const double kNeutralProbability = 0.5;
+const double kLowProbability = 0.01;
+
+} // namespace
+
+VoiceActivityDetector::VoiceActivityDetector()
+ : last_voice_probability_(kDefaultVoiceValue),
+ standalone_vad_(StandaloneVad::Create()) {
+}
+
+// Because ISAC has a different chunk length, it updates
+// |chunkwise_voice_probabilities_| and |chunkwise_rms_| when there is new data.
+// Otherwise it clears them.
+void VoiceActivityDetector::ProcessChunk(const int16_t* audio,
+ size_t length,
+ int sample_rate_hz) {
+ RTC_DCHECK_EQ(static_cast<int>(length), sample_rate_hz / 100);
+ RTC_DCHECK_LE(length, kMaxLength);
+ // Resample to the required rate.
+ const int16_t* resampled_ptr = audio;
+ if (sample_rate_hz != kSampleRateHz) {
+ RTC_CHECK_EQ(
+ resampler_.ResetIfNeeded(sample_rate_hz, kSampleRateHz, kNumChannels),
+ 0);
+ resampler_.Push(audio, length, resampled_, kLength10Ms, length);
+ resampled_ptr = resampled_;
+ }
+ RTC_DCHECK_EQ(length, kLength10Ms);
+
+ // Each chunk needs to be passed into |standalone_vad_|, because internally it
+ // buffers the audio and processes it all at once when GetActivity() is
+ // called.
+ RTC_CHECK_EQ(standalone_vad_->AddAudio(resampled_ptr, length), 0);
+
+ audio_processing_.ExtractFeatures(resampled_ptr, length, &features_);
+
+ chunkwise_voice_probabilities_.resize(features_.num_frames);
+ chunkwise_rms_.resize(features_.num_frames);
+ std::copy(features_.rms, features_.rms + chunkwise_rms_.size(),
+ chunkwise_rms_.begin());
+ if (features_.num_frames > 0) {
+ if (features_.silence) {
+ // The other features are invalid, so set the voice probabilities to an
+ // arbitrary low value.
+ std::fill(chunkwise_voice_probabilities_.begin(),
+ chunkwise_voice_probabilities_.end(), kLowProbability);
+ } else {
+ std::fill(chunkwise_voice_probabilities_.begin(),
+ chunkwise_voice_probabilities_.end(), kNeutralProbability);
+ RTC_CHECK_GE(
+ standalone_vad_->GetActivity(&chunkwise_voice_probabilities_[0],
+ chunkwise_voice_probabilities_.size()),
+ 0);
+ RTC_CHECK_GE(pitch_based_vad_.VoicingProbability(
+ features_, &chunkwise_voice_probabilities_[0]),
+ 0);
+ }
+ last_voice_probability_ = chunkwise_voice_probabilities_.back();
+ }
+}
+
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/vad/voice_activity_detector.h b/webrtc/modules/audio_processing/vad/voice_activity_detector.h
new file mode 100644
index 0000000000..e2dcf022a9
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/voice_activity_detector.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_ACTIVITY_DETECTOR_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_ACTIVITY_DETECTOR_H_
+
+#include <vector>
+
+#include "webrtc/base/scoped_ptr.h"
+#include "webrtc/common_audio/resampler/include/resampler.h"
+#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
+#include "webrtc/modules/audio_processing/vad/common.h"
+#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"
+#include "webrtc/modules/audio_processing/vad/standalone_vad.h"
+
+namespace webrtc {
+
+// A Voice Activity Detector (VAD) that combines the voice probability from the
+// StandaloneVad and PitchBasedVad to get a more robust estimation.
+class VoiceActivityDetector {
+ public:
+ VoiceActivityDetector();
+
+ // Processes each audio chunk and estimates the voice probability. The maximum
+ // supported sample rate is 32kHz.
+ // TODO(aluebs): Change |length| to size_t.
+ void ProcessChunk(const int16_t* audio, size_t length, int sample_rate_hz);
+
+ // Returns a vector of voice probabilities for each chunk. It can be empty for
+ // some chunks, but it catches up afterwards returning multiple values at
+ // once.
+ const std::vector<double>& chunkwise_voice_probabilities() const {
+ return chunkwise_voice_probabilities_;
+ }
+
+ // Returns a vector of RMS values for each chunk. It has the same length as
+ // chunkwise_voice_probabilities().
+ const std::vector<double>& chunkwise_rms() const { return chunkwise_rms_; }
+
+ // Returns the last voice probability, regardless of the internal
+ // implementation, although it has a few chunks of delay.
+ float last_voice_probability() const { return last_voice_probability_; }
+
+ private:
+ // TODO(aluebs): Change these to float.
+ std::vector<double> chunkwise_voice_probabilities_;
+ std::vector<double> chunkwise_rms_;
+
+ float last_voice_probability_;
+
+ Resampler resampler_;
+ VadAudioProc audio_processing_;
+
+ rtc::scoped_ptr<StandaloneVad> standalone_vad_;
+ PitchBasedVad pitch_based_vad_;
+
+ int16_t resampled_[kLength10Ms];
+ AudioFeatures features_;
+};
+
+} // namespace webrtc
+
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_ACTIVITY_DETECTOR_H_
diff --git a/webrtc/modules/audio_processing/vad/voice_activity_detector_unittest.cc b/webrtc/modules/audio_processing/vad/voice_activity_detector_unittest.cc
new file mode 100644
index 0000000000..f4ee17760e
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/voice_activity_detector_unittest.cc
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2015 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "testing/gtest/include/gtest/gtest.h"
+#include "webrtc/test/testsupport/fileutils.h"
+
+namespace webrtc {
+namespace {
+
+const int kStartTimeSec = 16;
+const float kMeanSpeechProbability = 0.3f;
+const float kMaxNoiseProbability = 0.1f;
+const size_t kNumChunks = 300u;
+const size_t kNumChunksPerIsacBlock = 3;
+
+void GenerateNoise(std::vector<int16_t>* data) {
+ for (size_t i = 0; i < data->size(); ++i) {
+ // std::rand returns between 0 and RAND_MAX, but this will work because it
+ // wraps into some random place.
+ (*data)[i] = std::rand();
+ }
+}
+
+} // namespace
+
+TEST(VoiceActivityDetectorTest, ConstructorSetsDefaultValues) {
+ const float kDefaultVoiceValue = 1.f;
+
+ VoiceActivityDetector vad;
+
+ std::vector<double> p = vad.chunkwise_voice_probabilities();
+ std::vector<double> rms = vad.chunkwise_rms();
+
+ EXPECT_EQ(p.size(), 0u);
+ EXPECT_EQ(rms.size(), 0u);
+
+ EXPECT_FLOAT_EQ(vad.last_voice_probability(), kDefaultVoiceValue);
+}
+
+TEST(VoiceActivityDetectorTest, Speech16kHzHasHighVoiceProbabilities) {
+ const int kSampleRateHz = 16000;
+ const int kLength10Ms = kSampleRateHz / 100;
+
+ VoiceActivityDetector vad;
+
+ std::vector<int16_t> data(kLength10Ms);
+ float mean_probability = 0.f;
+
+ FILE* pcm_file =
+ fopen(test::ResourcePath("audio_processing/transient/audio16kHz", "pcm")
+ .c_str(),
+ "rb");
+ ASSERT_TRUE(pcm_file != nullptr);
+ // The silences in the file are skipped to get a more robust voice probability
+ // for speech.
+ ASSERT_EQ(fseek(pcm_file, kStartTimeSec * kSampleRateHz * sizeof(data[0]),
+ SEEK_SET),
+ 0);
+
+ size_t num_chunks = 0;
+ while (fread(&data[0], sizeof(data[0]), data.size(), pcm_file) ==
+ data.size()) {
+ vad.ProcessChunk(&data[0], data.size(), kSampleRateHz);
+
+ mean_probability += vad.last_voice_probability();
+
+ ++num_chunks;
+ }
+
+ mean_probability /= num_chunks;
+
+ EXPECT_GT(mean_probability, kMeanSpeechProbability);
+}
+
+TEST(VoiceActivityDetectorTest, Speech32kHzHasHighVoiceProbabilities) {
+ const int kSampleRateHz = 32000;
+ const int kLength10Ms = kSampleRateHz / 100;
+
+ VoiceActivityDetector vad;
+
+ std::vector<int16_t> data(kLength10Ms);
+ float mean_probability = 0.f;
+
+ FILE* pcm_file =
+ fopen(test::ResourcePath("audio_processing/transient/audio32kHz", "pcm")
+ .c_str(),
+ "rb");
+ ASSERT_TRUE(pcm_file != nullptr);
+ // The silences in the file are skipped to get a more robust voice probability
+ // for speech.
+ ASSERT_EQ(fseek(pcm_file, kStartTimeSec * kSampleRateHz * sizeof(data[0]),
+ SEEK_SET),
+ 0);
+
+ size_t num_chunks = 0;
+ while (fread(&data[0], sizeof(data[0]), data.size(), pcm_file) ==
+ data.size()) {
+ vad.ProcessChunk(&data[0], data.size(), kSampleRateHz);
+
+ mean_probability += vad.last_voice_probability();
+
+ ++num_chunks;
+ }
+
+ mean_probability /= num_chunks;
+
+ EXPECT_GT(mean_probability, kMeanSpeechProbability);
+}
+
+TEST(VoiceActivityDetectorTest, Noise16kHzHasLowVoiceProbabilities) {
+ VoiceActivityDetector vad;
+
+ std::vector<int16_t> data(kLength10Ms);
+ float max_probability = 0.f;
+
+ std::srand(42);
+
+ for (size_t i = 0; i < kNumChunks; ++i) {
+ GenerateNoise(&data);
+
+ vad.ProcessChunk(&data[0], data.size(), kSampleRateHz);
+
+ // Before the |vad has enough data to process an ISAC block it will return
+ // the default value, 1.f, which would ruin the |max_probability| value.
+ if (i > kNumChunksPerIsacBlock) {
+ max_probability = std::max(max_probability, vad.last_voice_probability());
+ }
+ }
+
+ EXPECT_LT(max_probability, kMaxNoiseProbability);
+}
+
+TEST(VoiceActivityDetectorTest, Noise32kHzHasLowVoiceProbabilities) {
+ VoiceActivityDetector vad;
+
+ std::vector<int16_t> data(2 * kLength10Ms);
+ float max_probability = 0.f;
+
+ std::srand(42);
+
+ for (size_t i = 0; i < kNumChunks; ++i) {
+ GenerateNoise(&data);
+
+ vad.ProcessChunk(&data[0], data.size(), 2 * kSampleRateHz);
+
+ // Before the |vad has enough data to process an ISAC block it will return
+ // the default value, 1.f, which would ruin the |max_probability| value.
+ if (i > kNumChunksPerIsacBlock) {
+ max_probability = std::max(max_probability, vad.last_voice_probability());
+ }
+ }
+
+ EXPECT_LT(max_probability, kMaxNoiseProbability);
+}
+
+} // namespace webrtc
diff --git a/webrtc/modules/audio_processing/vad/voice_gmm_tables.h b/webrtc/modules/audio_processing/vad/voice_gmm_tables.h
new file mode 100644
index 0000000000..2f247c3798
--- /dev/null
+++ b/webrtc/modules/audio_processing/vad/voice_gmm_tables.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// GMM tables for active segments. Generated by MakeGmmTables.m.
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_GMM_TABLES_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_GMM_TABLES_H_
+
+static const int kVoiceGmmNumMixtures = 12;
+static const int kVoiceGmmDim = 3;
+
+static const double
+ kVoiceGmmCovarInverse[kVoiceGmmNumMixtures][kVoiceGmmDim][kVoiceGmmDim] = {
+ {{1.83673825579513e+00, -8.09791637570095e-04, 4.60106414365986e-03},
+ {-8.09791637570095e-04, 8.89351738394608e-04, -9.80188953277734e-04},
+ {4.60106414365986e-03, -9.80188953277734e-04, 1.38706060206582e-03}},
+ {{6.76228912850703e+01, -1.98893120119660e-02, -3.53548357253551e-03},
+ {-1.98893120119660e-02, 3.96216858500530e-05, -4.08492938394097e-05},
+ {-3.53548357253551e-03, -4.08492938394097e-05, 9.31864352856416e-04}},
+ {{9.98612435944558e+00, -5.27880954316893e-03, -6.30342541619017e-03},
+ {-5.27880954316893e-03, 4.54359480225226e-05, 6.30804591626044e-05},
+ {-6.30342541619017e-03, 6.30804591626044e-05, 5.36466441382942e-04}},
+ {{3.39917474216349e+01, -1.56213579433191e-03, -4.01459014990225e-02},
+ {-1.56213579433191e-03, 6.40415424897724e-05, 6.20076342427833e-05},
+ {-4.01459014990225e-02, 6.20076342427833e-05, 3.51199070103063e-03}},
+ {{1.34545062271428e+01, -7.94513610147144e-03, -5.34401019341728e-02},
+ {-7.94513610147144e-03, 1.16511820098649e-04, 4.66063702069293e-05},
+ {-5.34401019341728e-02, 4.66063702069293e-05, 2.72354323774163e-03}},
+ {{1.08557844314806e+02, -1.54885805673668e-02, -1.88029692674851e-02},
+ {-1.54885805673668e-02, 1.16404042786406e-04, 6.45579292702802e-06},
+ {-1.88029692674851e-02, 6.45579292702802e-06, 4.32330478391416e-04}},
+ {{8.22940066541450e+01, -1.15903110231303e-02, -4.92166764865343e-02},
+ {-1.15903110231303e-02, 7.42510742165261e-05, 3.73007314191290e-06},
+ {-4.92166764865343e-02, 3.73007314191290e-06, 3.64005221593244e-03}},
+ {{2.31133605685660e+00, -7.83261568950254e-04, 7.45744012346313e-04},
+ {-7.83261568950254e-04, 1.29460648214142e-05, -2.22774455093730e-06},
+ {7.45744012346313e-04, -2.22774455093730e-06, 1.05117294093010e-04}},
+ {{3.78767849189611e+02, 1.57759761011568e-03, -2.08551217988774e-02},
+ {1.57759761011568e-03, 4.76066236886865e-05, -2.33977412299324e-05},
+ {-2.08551217988774e-02, -2.33977412299324e-05, 5.24261005371196e-04}},
+ {{6.98580096506135e-01, -5.13850255217378e-04, -4.01124551717056e-04},
+ {-5.13850255217378e-04, 1.40501021984840e-06, -2.09496928716569e-06},
+ {-4.01124551717056e-04, -2.09496928716569e-06, 2.82879357740037e-04}},
+ {{2.62770945162399e+00, -2.31825753241430e-03, -5.30447217466318e-03},
+ {-2.31825753241430e-03, 4.59108572227649e-05, 7.67631886355405e-05},
+ {-5.30447217466318e-03, 7.67631886355405e-05, 2.28521601674098e-03}},
+ {{1.89940391362152e+02, -4.23280856852379e-03, -2.70608873541399e-02},
+ {-4.23280856852379e-03, 6.77547582742563e-05, 2.69154203800467e-05},
+ {-2.70608873541399e-02, 2.69154203800467e-05, 3.88574543373470e-03}}};
+
+static const double kVoiceGmmMean[kVoiceGmmNumMixtures][kVoiceGmmDim] = {
+ {-2.15020241646536e+00, 4.97079062999877e+02, 4.77078119504505e+02},
+ {-8.92097680029190e-01, 5.92064964199921e+02, 1.81045145941059e+02},
+ {-1.29435784144398e+00, 4.98450293410611e+02, 1.71991263804064e+02},
+ {-1.03925228397884e+00, 4.99511274321571e+02, 1.05838336539105e+02},
+ {-1.29229047206129e+00, 4.15026762566707e+02, 1.12861119017125e+02},
+ {-7.88748114599810e-01, 4.48739336688113e+02, 1.89784216956337e+02},
+ {-8.77777402332642e-01, 4.86620285054533e+02, 1.13477708016491e+02},
+ {-2.06465957063057e+00, 6.33385049870607e+02, 2.32758546796149e+02},
+ {-6.98893789231685e-01, 5.93622051503385e+02, 1.92536982473203e+02},
+ {-2.55901217508894e+00, 1.55914919756205e+03, 1.39769980835570e+02},
+ {-1.92070024165837e+00, 4.87983940444185e+02, 1.02745468128289e+02},
+ {-7.29187507662854e-01, 5.22717685022855e+02, 1.16377942283991e+02}};
+
+static const double kVoiceGmmWeights[kVoiceGmmNumMixtures] = {
+ -1.39789694361035e+01,
+ -1.19527720202104e+01,
+ -1.32396317929055e+01,
+ -1.09436815209238e+01,
+ -1.13440027478149e+01,
+ -1.12200721834504e+01,
+ -1.02537324043693e+01,
+ -1.60789861938302e+01,
+ -1.03394494048344e+01,
+ -1.83207938586818e+01,
+ -1.31186044948288e+01,
+ -9.52479998673554e+00};
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_GMM_TABLES_H_