author     bjornv@webrtc.org <bjornv@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d>  2014-07-10 08:03:11 +0000
committer  bjornv@webrtc.org <bjornv@webrtc.org@4adac7df-926f-26a2-2b94-8c16560cd09d>  2014-07-10 08:03:11 +0000
commit     31ab61cf47b1a086aea0952054e09dd47856bf87 (patch)
tree       06e4474f8022a9760c2a993f8f5be620efaf6124 /modules
parent     71ba40d529e6643bf7e64a1d4b3e6d5e6efb56b9 (diff)
download   webrtc-31ab61cf47b1a086aea0952054e09dd47856bf87.tar.gz
Neon version of SubbandCoherence()
The performance gain on a Nexus 7 reported by audioproc is ~1.4%.
The output is NOT bit exact. Any difference seen is +-1.

BUG=3131
R=bjornv@webrtc.org, cd@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/17839005

Patch from Scott LaVarnway <slavarnw@gmail.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk/webrtc@6647 4adac7df-926f-26a2-2b94-8c16560cd09d
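Since the message above states the NEON output is not bit exact but only ever differs by +-1, the acceptance criterion can be expressed as a per-sample tolerance check. A minimal sketch, purely illustrative and not taken from this patch or from the audioproc tool:

#include <stdint.h>
#include <stdlib.h>

/* Returns 1 if every NEON output sample is within one LSB of the C reference. */
static int OutputsWithinOneLsb(const int16_t* ref, const int16_t* neon, size_t len) {
  size_t i;
  for (i = 0; i < len; ++i) {
    if (abs((int)ref[i] - (int)neon[i]) > 1)
      return 0;
  }
  return 1;
}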
Diffstat (limited to 'modules')
-rw-r--r--  modules/audio_processing/aec/aec_common.h          32
-rw-r--r--  modules/audio_processing/aec/aec_core.c            33
-rw-r--r--  modules/audio_processing/aec/aec_core_internal.h    1
-rw-r--r--  modules/audio_processing/aec/aec_core_neon.c       292
-rw-r--r--  modules/audio_processing/aec/aec_rdft.h             10
5 files changed, 340 insertions(+), 28 deletions(-)
diff --git a/modules/audio_processing/aec/aec_common.h b/modules/audio_processing/aec/aec_common.h
new file mode 100644
index 00000000..1e24ca99
--- /dev/null
+++ b/modules/audio_processing/aec/aec_common.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
+
+#include "webrtc/typedefs.h"
+
+#ifdef _MSC_VER /* visual c++ */
+#define ALIGN16_BEG __declspec(align(16))
+#define ALIGN16_END
+#else /* gcc or icc */
+#define ALIGN16_BEG
+#define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_sqrtHanning[65];
+extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_weightCurve[65];
+extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_overDriveCurve[65];
+extern const float WebRtcAec_kExtendedSmoothingCoefficients[2][2];
+extern const float WebRtcAec_kNormalSmoothingCoefficients[2][2];
+extern const float WebRtcAec_kMinFarendPSD;
+
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
+
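For context, the ALIGN16_BEG/ALIGN16_END pair above wraps a definition so that both MSVC and gcc/icc emit a 16-byte aligned object, which aligned SSE2 loads require and NEON loads can benefit from. A minimal usage sketch, illustrative only (the real tables, such as WebRtcAec_sqrtHanning, are defined in aec_core.c):

#include "webrtc/modules/audio_processing/aec/aec_common.h"

/* Hypothetical 16-byte aligned table; unlisted elements are zero-initialized. */
ALIGN16_BEG static const float ALIGN16_END kExampleTable[8] = {
  1.0f, 2.0f, 3.0f, 4.0f
};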
diff --git a/modules/audio_processing/aec/aec_core.c b/modules/audio_processing/aec/aec_core.c
index 2fd298c0..139d37d7 100644
--- a/modules/audio_processing/aec/aec_core.c
+++ b/modules/audio_processing/aec/aec_core.c
@@ -21,6 +21,7 @@
#include <string.h>
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/modules/audio_processing/aec/aec_common.h"
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
#include "webrtc/modules/audio_processing/utility/delay_estimator_wrapper.h"
@@ -45,7 +46,7 @@ static const int freqAvgIc = PART_LEN / 2;
// Matlab code to produce table:
// win = sqrt(hanning(63)); win = [0 ; win(1:32)];
// fprintf(1, '\t%.14f, %.14f, %.14f,\n', win);
-static const float sqrtHanning[65] = {
+ALIGN16_BEG const float ALIGN16_END WebRtcAec_sqrtHanning[65] = {
0.00000000000000f, 0.02454122852291f, 0.04906767432742f, 0.07356456359967f,
0.09801714032956f, 0.12241067519922f, 0.14673047445536f, 0.17096188876030f,
0.19509032201613f, 0.21910124015687f, 0.24298017990326f, 0.26671275747490f,
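As an aside, the 65 entries of this table are numerically sin(pi*i/128) for i = 0..64 (the square root of a Hanning window is a half-period sine), so the values can also be regenerated directly in C. A minimal sketch, assuming that closed form:

#include <math.h>

/* Fills the 65-entry square-root Hanning table: table[i] = sin(pi*i/128). */
static void FillSqrtHanning(float table[65]) {
  int i;
  for (i = 0; i < 65; ++i) {
    table[i] = (float)sin(3.14159265358979323846 * i / 128.0);
  }
}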
@@ -99,10 +100,10 @@ static const float kTargetSupp[3] = {-6.9f, -11.5f, -18.4f};
// Two sets of parameters, one for the extended filter mode.
static const float kExtendedMinOverDrive[3] = {3.0f, 6.0f, 15.0f};
static const float kNormalMinOverDrive[3] = {1.0f, 2.0f, 5.0f};
-static const float kExtendedSmoothingCoefficients[2][2] = {{0.9f, 0.1f},
- {0.92f, 0.08f}};
-static const float kNormalSmoothingCoefficients[2][2] = {{0.9f, 0.1f},
- {0.93f, 0.07f}};
+const float WebRtcAec_kExtendedSmoothingCoefficients[2][2] = {{0.9f, 0.1f},
+ {0.92f, 0.08f}};
+const float WebRtcAec_kNormalSmoothingCoefficients[2][2] = {{0.9f, 0.1f},
+ {0.93f, 0.07f}};
// Number of partitions forming the NLP's "preferred" bands.
enum {
@@ -442,7 +443,7 @@ static int PartitionDelay(const AecCore* aec) {
}
// Threshold to protect against the ill-effects of a zero far-end.
-static const float kMinFarendPSD = 15;
+const float WebRtcAec_kMinFarendPSD = 15;
// Updates the following smoothed Power Spectral Densities (PSD):
// - sd : near-end
@@ -459,8 +460,8 @@ static void SmoothedPSD(AecCore* aec,
float xfw[2][PART_LEN1]) {
// Power estimate smoothing coefficients.
const float* ptrGCoh = aec->extended_filter_enabled
- ? kExtendedSmoothingCoefficients[aec->mult - 1]
- : kNormalSmoothingCoefficients[aec->mult - 1];
+ ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
+ : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
int i;
float sdSum = 0, seSum = 0;
@@ -476,7 +477,8 @@ static void SmoothedPSD(AecCore* aec,
aec->sx[i] =
ptrGCoh[0] * aec->sx[i] +
ptrGCoh[1] * WEBRTC_SPL_MAX(
- xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], kMinFarendPSD);
+ xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
+ WebRtcAec_kMinFarendPSD);
aec->sde[i][0] =
ptrGCoh[0] * aec->sde[i][0] +
@@ -511,8 +513,9 @@ static void SmoothedPSD(AecCore* aec,
__inline static void WindowData(float* x_windowed, const float* x) {
int i;
for (i = 0; i < PART_LEN; i++) {
- x_windowed[i] = x[i] * sqrtHanning[i];
- x_windowed[PART_LEN + i] = x[PART_LEN + i] * sqrtHanning[PART_LEN - i];
+ x_windowed[i] = x[i] * WebRtcAec_sqrtHanning[i];
+ x_windowed[PART_LEN + i] =
+ x[PART_LEN + i] * WebRtcAec_sqrtHanning[PART_LEN - i];
}
}
@@ -1347,10 +1350,10 @@ static void NonLinearProcessing(AecCore* aec, float* output, float* outputH) {
scale = 2.0f / PART_LEN2;
for (i = 0; i < PART_LEN; i++) {
fft[i] *= scale; // fft scaling
- fft[i] = fft[i] * sqrtHanning[i] + aec->outBuf[i];
+ fft[i] = fft[i] * WebRtcAec_sqrtHanning[i] + aec->outBuf[i];
fft[PART_LEN + i] *= scale; // fft scaling
- aec->outBuf[i] = fft[PART_LEN + i] * sqrtHanning[PART_LEN - i];
+ aec->outBuf[i] = fft[PART_LEN + i] * WebRtcAec_sqrtHanning[PART_LEN - i];
// Saturate output to keep it in the allowed range.
output[i] = WEBRTC_SPL_SAT(
@@ -1737,8 +1740,8 @@ static void TimeToFrequency(float time_data[PART_LEN2],
// TODO(bjornv): Should we have a different function/wrapper for windowed FFT?
if (window) {
for (i = 0; i < PART_LEN; i++) {
- time_data[i] *= sqrtHanning[i];
- time_data[PART_LEN + i] *= sqrtHanning[PART_LEN - i];
+ time_data[i] *= WebRtcAec_sqrtHanning[i];
+ time_data[PART_LEN + i] *= WebRtcAec_sqrtHanning[PART_LEN - i];
}
}
diff --git a/modules/audio_processing/aec/aec_core_internal.h b/modules/audio_processing/aec/aec_core_internal.h
index 372b4274..8e5ee5cb 100644
--- a/modules/audio_processing/aec/aec_core_internal.h
+++ b/modules/audio_processing/aec/aec_core_internal.h
@@ -15,6 +15,7 @@
#include <stdio.h>
#endif
+#include "webrtc/modules/audio_processing/aec/aec_common.h"
#include "webrtc/modules/audio_processing/aec/aec_core.h"
#include "webrtc/modules/audio_processing/utility/ring_buffer.h"
#include "webrtc/typedefs.h"
diff --git a/modules/audio_processing/aec/aec_core_neon.c b/modules/audio_processing/aec/aec_core_neon.c
index 13ca47af..a21a954b 100644
--- a/modules/audio_processing/aec/aec_core_neon.c
+++ b/modules/audio_processing/aec/aec_core_neon.c
@@ -14,12 +14,12 @@
* Based on aec_core_sse2.c.
*/
-#include "webrtc/modules/audio_processing/aec/aec_core.h"
-
#include <arm_neon.h>
#include <math.h>
#include <string.h> // memset
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/modules/audio_processing/aec/aec_common.h"
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
@@ -250,9 +250,6 @@ static void FilterAdaptationNEON(AecCore* aec,
}
}
-extern const float WebRtcAec_weightCurve[65];
-extern const float WebRtcAec_overDriveCurve[65];
-
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
// a^b = exp2(b * log2(a))
// exp2(x) and log2(x) are calculated using polynomial approximations.
@@ -442,10 +439,295 @@ static void OverdriveAndSuppressNEON(AecCore* aec,
}
}
+static int PartitionDelay(const AecCore* aec) {
+ // Measures the energy in each filter partition and returns the partition with
+ // highest energy.
+ // TODO(bjornv): Spread computational cost by computing one partition per
+ // block?
+ float wfEnMax = 0;
+ int i;
+ int delay = 0;
+
+ for (i = 0; i < aec->num_partitions; i++) {
+ int j;
+ int pos = i * PART_LEN1;
+ float wfEn = 0;
+ float32x4_t vec_wfEn = vdupq_n_f32(0.0f);
+ // vectorized code (four at once)
+ for (j = 0; j + 3 < PART_LEN1; j += 4) {
+ const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]);
+ const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]);
+ vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0);
+ vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1);
+ }
+ {
+ float32x2_t vec_total;
+ // A B C D
+ vec_total = vpadd_f32(vget_low_f32(vec_wfEn), vget_high_f32(vec_wfEn));
+ // A+B C+D
+ vec_total = vpadd_f32(vec_total, vec_total);
+ // A+B+C+D A+B+C+D
+ wfEn = vget_lane_f32(vec_total, 0);
+ }
+
+ // scalar code for the remaining items.
+ for (; j < PART_LEN1; j++) {
+ wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
+ aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
+ }
+
+ if (wfEn > wfEnMax) {
+ wfEnMax = wfEn;
+ delay = i;
+ }
+ }
+ return delay;
+}
+
+// Updates the following smoothed Power Spectral Densities (PSD):
+// - sd : near-end
+// - se : residual echo
+// - sx : far-end
+// - sde : cross-PSD of near-end and residual echo
+// - sxd : cross-PSD of near-end and far-end
+//
+// In addition to updating the PSDs, the filter divergence state is
+// determined, upon which actions are taken.
+static void SmoothedPSD(AecCore* aec,
+ float efw[2][PART_LEN1],
+ float dfw[2][PART_LEN1],
+ float xfw[2][PART_LEN1]) {
+ // Power estimate smoothing coefficients.
+ const float* ptrGCoh = aec->extended_filter_enabled
+ ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
+ : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
+ int i;
+ float sdSum = 0, seSum = 0;
+ const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD);
+ float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
+ float32x4_t vec_seSum = vdupq_n_f32(0.0f);
+
+ for (i = 0; i + 3 < PART_LEN1; i += 4) {
+ const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
+ const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
+ const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
+ const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
+ const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
+ const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
+ float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
+ float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
+ float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
+ float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
+ float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
+ float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);
+
+ vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
+ vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
+ vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
+ vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
+ vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
+ vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
+ vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);
+
+ vst1q_f32(&aec->sd[i], vec_sd);
+ vst1q_f32(&aec->se[i], vec_se);
+ vst1q_f32(&aec->sx[i], vec_sx);
+
+ {
+ float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
+ float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
+ float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
+ vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
+ vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
+ vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
+ vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
+ vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
+ vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
+ vst2q_f32(&aec->sde[i][0], vec_sde);
+ }
+
+ {
+ float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
+ float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
+ float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
+ vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
+ vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
+ vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
+ vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
+ vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
+ vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
+ vst2q_f32(&aec->sxd[i][0], vec_sxd);
+ }
+
+ vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
+ vec_seSum = vaddq_f32(vec_seSum, vec_se);
+ }
+ {
+ float32x2_t vec_sdSum_total;
+ float32x2_t vec_seSum_total;
+ // A B C D
+ vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum),
+ vget_high_f32(vec_sdSum));
+ vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum),
+ vget_high_f32(vec_seSum));
+ // A+B C+D
+ vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
+ vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
+ // A+B+C+D A+B+C+D
+ sdSum = vget_lane_f32(vec_sdSum_total, 0);
+ seSum = vget_lane_f32(vec_seSum_total, 0);
+ }
+
+ // scalar code for the remaining items.
+ for (; i < PART_LEN1; i++) {
+ aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
+ ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
+ aec->se[i] = ptrGCoh[0] * aec->se[i] +
+ ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
+ // We threshold here to protect against the ill-effects of a zero farend.
+ // The threshold is not arbitrarily chosen, but balances protection and
+ // adverse interaction with the algorithm's tuning.
+ // TODO(bjornv): investigate further why this is so sensitive.
+ aec->sx[i] =
+ ptrGCoh[0] * aec->sx[i] +
+ ptrGCoh[1] * WEBRTC_SPL_MAX(
+ xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
+ WebRtcAec_kMinFarendPSD);
+
+ aec->sde[i][0] =
+ ptrGCoh[0] * aec->sde[i][0] +
+ ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
+ aec->sde[i][1] =
+ ptrGCoh[0] * aec->sde[i][1] +
+ ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
+
+ aec->sxd[i][0] =
+ ptrGCoh[0] * aec->sxd[i][0] +
+ ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
+ aec->sxd[i][1] =
+ ptrGCoh[0] * aec->sxd[i][1] +
+ ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);
+
+ sdSum += aec->sd[i];
+ seSum += aec->se[i];
+ }
+
+ // Divergent filter safeguard.
+ aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;
+
+ if (aec->divergeState)
+ memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1);
+
+ // Reset if error is significantly larger than nearend (13 dB).
+ if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum))
+ memset(aec->wfBuf, 0, sizeof(aec->wfBuf));
+}
+
+// Window time domain data to be used by the fft.
+__inline static void WindowData(float* x_windowed, const float* x) {
+ int i;
+ for (i = 0; i < PART_LEN; i += 4) {
+ const float32x4_t vec_Buf1 = vld1q_f32(&x[i]);
+ const float32x4_t vec_Buf2 = vld1q_f32(&x[PART_LEN + i]);
+ const float32x4_t vec_sqrtHanning = vld1q_f32(&WebRtcAec_sqrtHanning[i]);
+ // A B C D
+ float32x4_t vec_sqrtHanning_rev =
+ vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
+ // B A D C
+ vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev);
+ // D C B A
+ vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev),
+ vget_low_f32(vec_sqrtHanning_rev));
+ vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning));
+ vst1q_f32(&x_windowed[PART_LEN + i],
+ vmulq_f32(vec_Buf2, vec_sqrtHanning_rev));
+ }
+}
+
+// Puts fft output data into a complex valued array.
+__inline static void StoreAsComplex(const float* data,
+ float data_complex[2][PART_LEN1]) {
+ int i;
+ for (i = 0; i < PART_LEN; i += 4) {
+ const float32x4x2_t vec_data = vld2q_f32(&data[2 * i]);
+ vst1q_f32(&data_complex[0][i], vec_data.val[0]);
+ vst1q_f32(&data_complex[1][i], vec_data.val[1]);
+ }
+ // fix beginning/end values
+ data_complex[1][0] = 0;
+ data_complex[1][PART_LEN] = 0;
+ data_complex[0][0] = data[0];
+ data_complex[0][PART_LEN] = data[1];
+}
+
+static void SubbandCoherenceNEON(AecCore* aec,
+ float efw[2][PART_LEN1],
+ float xfw[2][PART_LEN1],
+ float* fft,
+ float* cohde,
+ float* cohxd) {
+ float dfw[2][PART_LEN1];
+ int i;
+
+ if (aec->delayEstCtr == 0)
+ aec->delayIdx = PartitionDelay(aec);
+
+ // Use delayed far.
+ memcpy(xfw,
+ aec->xfwBuf + aec->delayIdx * PART_LEN1,
+ sizeof(xfw[0][0]) * 2 * PART_LEN1);
+
+ // Windowed near fft
+ WindowData(fft, aec->dBuf);
+ aec_rdft_forward_128(fft);
+ StoreAsComplex(fft, dfw);
+
+ // Windowed error fft
+ WindowData(fft, aec->eBuf);
+ aec_rdft_forward_128(fft);
+ StoreAsComplex(fft, efw);
+
+ SmoothedPSD(aec, efw, dfw, xfw);
+
+ {
+ const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f);
+
+ // Subband coherence
+ for (i = 0; i + 3 < PART_LEN1; i += 4) {
+ const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]);
+ const float32x4_t vec_se = vld1q_f32(&aec->se[i]);
+ const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]);
+ const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se);
+ const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx);
+ float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
+ float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
+ float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]);
+ float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]);
+ vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]);
+ vec_cohde = vdivq_f32(vec_cohde, vec_sdse);
+ vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]);
+ vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx);
+
+ vst1q_f32(&cohde[i], vec_cohde);
+ vst1q_f32(&cohxd[i], vec_cohxd);
+ }
+ }
+ // scalar code for the remaining items.
+ for (; i < PART_LEN1; i++) {
+ cohde[i] =
+ (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
+ (aec->sd[i] * aec->se[i] + 1e-10f);
+ cohxd[i] =
+ (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
+ (aec->sx[i] * aec->sd[i] + 1e-10f);
+ }
+}
+
void WebRtcAec_InitAec_neon(void) {
WebRtcAec_FilterFar = FilterFarNEON;
WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON;
WebRtcAec_FilterAdaptation = FilterAdaptationNEON;
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON;
+ WebRtcAec_SubbandCoherence = SubbandCoherenceNEON;
}
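Throughout PartitionDelay and SmoothedPSD above, a float32x4_t accumulator is reduced to a scalar with two pairwise adds. Pulled out on its own, the pattern is (standalone illustration, not a helper added by this patch):

#include <arm_neon.h>

/* Sums the four lanes of v: first A+B and C+D, then (A+B)+(C+D). */
static float HorizontalSumNEON(float32x4_t v) {
  float32x2_t pairs = vpadd_f32(vget_low_f32(v), vget_high_f32(v));
  pairs = vpadd_f32(pairs, pairs);
  return vget_lane_f32(pairs, 0);
}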
diff --git a/modules/audio_processing/aec/aec_rdft.h b/modules/audio_processing/aec/aec_rdft.h
index 94301601..3b339a05 100644
--- a/modules/audio_processing/aec/aec_rdft.h
+++ b/modules/audio_processing/aec/aec_rdft.h
@@ -11,6 +11,8 @@
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
+#include "webrtc/modules/audio_processing/aec/aec_common.h"
+
// These intrinsics were unavailable before VS 2008.
// TODO(andrew): move to a common file.
#if defined(_MSC_VER) && _MSC_VER < 1500
@@ -19,14 +21,6 @@ static __inline __m128 _mm_castsi128_ps(__m128i a) { return *(__m128*)&a; }
static __inline __m128i _mm_castps_si128(__m128 a) { return *(__m128i*)&a; }
#endif
-#ifdef _MSC_VER /* visual c++ */
-#define ALIGN16_BEG __declspec(align(16))
-#define ALIGN16_END
-#else /* gcc or icc */
-#define ALIGN16_BEG
-#define ALIGN16_END __attribute__((aligned(16)))
-#endif
-
// constants shared by all paths (C, SSE2).
extern float rdft_w[64];
// constants used by the C path.