about summary refs log tree commit diff
path: root/webrtc/modules/audio_processing/agc/agc_audio_proc.cc
diff options
context:
space:
mode:
Diffstat (limited to 'webrtc/modules/audio_processing/agc/agc_audio_proc.cc')
-rw-r--r-- webrtc/modules/audio_processing/agc/agc_audio_proc.cc 269
1 files changed, 269 insertions, 0 deletions
diff --git a/webrtc/modules/audio_processing/agc/agc_audio_proc.cc b/webrtc/modules/audio_processing/agc/agc_audio_proc.cc
new file mode 100644
index 0000000000..dc4a5a711c
--- /dev/null
+++ b/webrtc/modules/audio_processing/agc/agc_audio_proc.cc
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/agc/agc_audio_proc.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#include "webrtc/common_audio/fft4g.h"
+#include "webrtc/modules/audio_processing/agc/agc_audio_proc_internal.h"
+#include "webrtc/modules/audio_processing/agc/pitch_internal.h"
+#include "webrtc/modules/audio_processing/agc/pole_zero_filter.h"
+extern "C" {
+#include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h"
+#include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h"
+#include "webrtc/modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
+#include "webrtc/modules/audio_coding/codecs/isac/main/source/structs.h"
+}
+#include "webrtc/modules/interface/module_common_types.h"
+
+namespace webrtc {
+
+// iSAC's structs.h declares these two structures anonymous, so they cannot be
+// forward declared directly; the nested types below simply derive from them.
+struct AgcAudioProc::PitchAnalysisStruct : ::PitchAnalysisStruct {};
+struct AgcAudioProc::PreFiltBankstr : ::PreFiltBankstr {};
+
+static const float kFrequencyResolution = kSampleRateHz /
+ static_cast<float>(AgcAudioProc::kDftSize);  // Scales a DFT bin index.
+static const int kSilenceRms = 5;  // Sub-frame RMS below this => silence.
+
+// Sets up the FFT work tables, the high-pass filter and the iSAC pitch and
+// filter-bank state. TODO(turajs): Make a Create or Init for AgcAudioProc.
+AgcAudioProc::AgcAudioProc()
+    : audio_buffer_(),
+      num_buffer_samples_(kNumPastSignalSamples),
+      log_old_gain_(-2),
+      old_lag_(50),  // Arbitrary but valid as pitch-lag (in samples).
+      pitch_analysis_handle_(new PitchAnalysisStruct),
+      pre_filter_handle_(new PreFiltBankstr),
+      high_pass_filter_(PoleZeroFilter::Create(
+          kCoeffNumerator, kFilterOrder, kCoeffDenominator, kFilterOrder)) {
+  static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
+                    sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
+                "lpc analysis window incorrect size");
+  static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
+                "correlation weight incorrect size");
+  // TODO(turajs): Are we doing too much in the constructor?
+  // A first call with ip_[0] == 0 makes WebRtc_rdft build its work tables. It
+  // also transforms |data|, so feed it zeros rather than indeterminate floats.
+  float data[kDftSize] = {};
+  ip_[0] = 0;
+  WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
+  // TODO(turajs): Need to initialize high-pass filter.
+  // Initialize iSAC components.
+  WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
+  WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
+}
+
+AgcAudioProc::~AgcAudioProc() = default;  // Runs member destructors only.
+
+void AgcAudioProc::ResetBuffer() {  // Keeps history samples, drops the rest.
+  num_buffer_samples_ = kNumPastSignalSamples;
+  memcpy(&audio_buffer_[0], audio_buffer_ + kNumSamplesToProcess,
+         num_buffer_samples_ * sizeof(audio_buffer_[0]));
+}
+
+// Appends one high-pass filtered sub-frame to the analysis buffer. Returns -1
+// on a wrong |length| or a filter failure and 0 otherwise. Until the buffer
+// is full |features->num_frames| stays 0; once full, the features of all
+// |kNum10msSubframes| sub-frames are written and the buffer is recycled.
+int AgcAudioProc::ExtractFeatures(const int16_t* frame,
+                                  int length,
+                                  AudioFeatures* features) {
+  features->num_frames = 0;
+  if (length != kNumSubframeSamples)
+    return -1;
+
+  // High-pass filtering removes DC and very low frequencies, which has been
+  // observed to improve the voiced/non-voiced classification.
+  const bool filter_failed =
+      high_pass_filter_->Filter(frame, kNumSubframeSamples,
+                                &audio_buffer_[num_buffer_samples_]) != 0;
+  if (filter_failed)
+    return -1;
+
+  num_buffer_samples_ += kNumSubframeSamples;
+  if (num_buffer_samples_ < kBufferLength)
+    return 0;  // Not enough audio collected yet.
+  assert(num_buffer_samples_ == kBufferLength);
+  features->num_frames = kNum10msSubframes;
+
+  Rms(features->rms, kMaxNumFrames);
+  // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence, so
+  // the remaining features are skipped when any sub-frame is silent.
+  bool has_silent_subframe = false;
+  for (int i = 0; i < kNum10msSubframes && !has_silent_subframe; ++i)
+    has_silent_subframe = features->rms[i] < kSilenceRms;
+  features->silence = has_silent_subframe;
+
+  if (!has_silent_subframe) {
+    PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
+                  kMaxNumFrames);
+    FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
+  }
+  ResetBuffer();
+  return 0;
+}
+
+// Windows the |kNumSubframeSamples + kNumPastSignalSamples| buffered samples
+// that start at sub-frame |subframe_index| and writes their |kLpcOrder + 1|
+// correlation coefficients to |corr|.
+void AgcAudioProc::SubframeCorrelation(double* corr, int length_corr,
+                                       int subframe_index) {
+  assert(length_corr >= kLpcOrder + 1);
+  const int kWindowLength = kNumSubframeSamples + kNumPastSignalSamples;
+  const int first_sample = subframe_index * kNumSubframeSamples;
+  double windowed_audio[kWindowLength];
+  for (int n = 0; n < kWindowLength; ++n)
+    windowed_audio[n] = audio_buffer_[first_sample + n] * kLpcAnalWin[n];
+  WebRtcIsac_AutoCorr(corr, windowed_audio, kWindowLength, kLpcOrder);
+}
+
+// Computes one LPC polynomial of |kLpcOrder + 1| coefficients per 10 ms
+// sub-frame and stores them back to back in |lpc|. The analysis window is 15 ms
+// long and centered on the first half of each sub-frame, which is equivalent
+// to computing the LPC coefficients for that first half.
+void AgcAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) {
+  const int kPolynomialLength = kLpcOrder + 1;
+  assert(length_lpc >= kNum10msSubframes * kPolynomialLength);
+  double corr[kPolynomialLength];
+  double reflec_coeff[kLpcOrder];
+  for (int i = 0; i < kNum10msSubframes; ++i) {
+    SubframeCorrelation(corr, kPolynomialLength, i);
+    // Conditioning that makes Lev-Durb a bit more stable.
+    corr[0] *= 1.0001;
+    for (int k = 0; k < kPolynomialLength; ++k)
+      corr[k] *= kCorrWeight[k];
+    double* const subframe_lpc = lpc + i * kPolynomialLength;
+    WebRtcIsac_LevDurb(subframe_lpc, reflec_coeff, corr, kLpcOrder);
+  }
+}
+
+// Returns the offset, relative to the middle point, of the extremum of the
+// parabola through the reciprocals of three consecutive values. The values
+// are inverted first, i.e. the interpolation is done in |1 / A(z)|^2.
+static float QuadraticInterpolation(float prev_val, float curr_val,
+                                    float next_val) {
+  const float inv_next = 1.0f / next_val;
+  const float inv_prev = 1.0f / prev_val;
+  const float inv_curr = 1.0f / curr_val;
+
+  const float slope = -(inv_next - inv_prev) * 0.5f;
+  const float curvature = inv_next + inv_prev - 2.f * inv_curr;
+  const float fractional_index = slope / curvature;
+  assert(fabs(fractional_index) < 1);
+  return fractional_index;
+}
+
+// 1 / A(z), where A(z) is defined by |lpc|, is a model of the spectral
+// envelope of the input signal, so a local maximum of the envelope is a local
+// minimum of |A(z)|^2; searching for minima saves an inversion and a square
+// root. Writes the frequency of the first such minimum of every sub-frame to
+// |f_peak|, or 0 if there is none. |length_f_peak| >= |kNum10msSubframes|.
+void AgcAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) {
+  assert(length_f_peak >= kNum10msSubframes);
+  double lpc[kNum10msSubframes * (kLpcOrder + 1)];
+  GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));
+
+  const int kNumDftCoefficients = kDftSize / 2 + 1;
+  float data[kDftSize];
+  for (int i = 0; i < kNum10msSubframes; i++) {
+    // Convert to float with zero pad.
+    memset(data, 0, sizeof(data));
+    for (int n = 0; n < kLpcOrder + 1; n++) {
+      data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
+    }
+    // Transform to frequency domain. On return data[0] is the DC bin, data[1]
+    // the Nyquist bin and (data[2k], data[2k + 1]) is bin k in between.
+    WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
+
+    int index_peak = 0;
+    float prev_magn_sqr = data[0] * data[0];
+    float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
+    float next_magn_sqr;
+    bool found_peak = false;
+    // Invariant: prev/curr/next hold bins n - 2, n - 1 and n.
+    for (int n = 2; n < kNumDftCoefficients - 1; n++) {
+      next_magn_sqr = data[2 * n] * data[2 * n] +
+                      data[2 * n + 1] * data[2 * n + 1];
+      if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
+        found_peak = true;
+        index_peak = n - 1;
+        break;
+      }
+      prev_magn_sqr = curr_magn_sqr;
+      curr_magn_sqr = next_magn_sqr;
+    }
+    float fractional_index = 0;
+    if (!found_peak) {
+      // |curr_magn_sqr| is now bin |kNumDftCoefficients - 2|; test it against
+      // the Nyquist bin and report that same bin (not the Nyquist one).
+      next_magn_sqr = data[1] * data[1];
+      if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
+        index_peak = kNumDftCoefficients - 2;
+      }
+    } else {
+      // Refine the location with a simple quadratic interpolation.
+      fractional_index = QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr,
+                                                next_magn_sqr);
+    }
+    f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
+  }
+}
+
+// Runs iSAC's band splitter and pitch estimator on the buffered audio and
+// hands the result to GetSubframesPitchParameters, which fills both outputs.
+void AgcAudioProc::PitchAnalysis(double* log_pitch_gains, double* pitch_lags_hz,
+                                 int length) {
+  assert(length >= kNum10msSubframes);
+  // TODO(turajs): This can be "imported" from iSAC & and the next two
+  // constants.
+  const int kNumPitchSubframes = 4;
+  const int kNumSubbandFrameSamples = 240;
+  const int kNumLookaheadSamples = 24;
+
+  // Outputs of the pitch estimator.
+  double raw_gains[kNumPitchSubframes];
+  double raw_lags[kNumPitchSubframes];
+
+  // Work buffers of the band splitter and of the pitch estimator.
+  float low_band[kNumSubbandFrameSamples];
+  float high_band[kNumSubbandFrameSamples];
+  double low_band_lookahead[kNumSubbandFrameSamples];
+  double high_band_lookahead[kNumSubbandFrameSamples];
+  double pre_filtered[kNumSubbandFrameSamples + kNumLookaheadSamples];
+
+  // Split signal to lower and upper bands
+  WebRtcIsac_SplitAndFilterFloat(
+      &audio_buffer_[kNumPastSignalSamples], low_band, high_band,
+      low_band_lookahead, high_band_lookahead, pre_filter_handle_.get());
+  WebRtcIsac_PitchAnalysis(low_band_lookahead, pre_filtered,
+                           pitch_analysis_handle_.get(), raw_lags, raw_gains);
+  // The lags refer to the lower band, sampled at half the input rate.
+  GetSubframesPitchParameters(kSampleRateHz / 2, raw_gains, raw_lags,
+                              kNumPitchSubframes, kNum10msSubframes,
+                              &log_old_gain_, &old_lag_,
+                              log_pitch_gains, pitch_lags_hz);
+}
+
+void AgcAudioProc::Rms(double* rms, int length_rms) {  // RMS per sub-frame.
+  assert(length_rms >= kNum10msSubframes);
+  for (int i = 0; i < kNum10msSubframes; ++i) {
+    const int first = kNumPastSignalSamples + i * kNumSubframeSamples;
+    rms[i] = 0;  // Holds the sum of squares until the final line.
+    for (int k = first; k < first + kNumSubframeSamples; ++k)
+      rms[i] += audio_buffer_[k] * audio_buffer_[k];
+    rms[i] = sqrt(rms[i] / kNumSubframeSamples);
+  }
+}
+
+} // namespace webrtc