Merge changes I7bbf776e,I1b827825

am: fe8b4a6579 * commit 'fe8b4a657979b49e1701bd92f6d5814a99e0b2be': (7237 commits) WIP: Changes after merge commit 'cb3f9bd' Make the nonlinear beamformer steerable Utilize bitrate above codec max to protect video. Enable VP9 internal resize by default. Filter overlapping RTP header extensions. Make VCMEncodedFrameCallback const. MediaCodecVideoEncoder: Add number of quality resolution downscales to Encoded callback. Remove redudant encoder rate calls. Create isolate files for nonparallel tests. Register header extensions in RtpRtcpObserver to avoid log spam. Make an enum class out of NetEqDecoder, and hide the neteq_decoders_ table ACM: Move NACK functionality inside NetEq Fix chromium-style warnings in webrtc/sound/. Create a 'webrtc_nonparallel_tests' target. Update scalability structure data according to updates in the RTP payload profile. audio_coding: rename interface -> include Rewrote perform_action_on_all_files to be parallell. Update reference indices according to updates in the RTP payload profile. Disable P2PTransport...TestFailoverControlledSide on Memcheck pass clangcl compile options to ignore warnings in gflags.cc ...
author: Chih-hung Hsieh <chh@google.com> 2015-12-01 17:07:48 +0000
committer: android-build-merger <android-build-merger@google.com> 2015-12-01 17:07:48 +0000
commit: a4acd9d6bc9b3b033d7d274316e75ee067df8d20 (patch)
tree: 672a185b294789cf991f385c3e395dd63bea9063 /webrtc/modules/audio_processing/aec
parent: 3681b90ba4fe7a27232dd3e27897d5d7ed9d651c (diff)
parent: fe8b4a657979b49e1701bd92f6d5814a99e0b2be (diff)
download: webrtc-a4acd9d6bc9b3b033d7d274316e75ee067df8d20.tar.gz
20 files changed, 9338 insertions, 0 deletions
diff --git a/webrtc/modules/audio_processing/aec/Android.mk b/webrtc/modules/audio_processing/aec/Android.mk
new file mode 100644
index 0000000000..819282d140
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/Android.mk
@@ -0,0 +1,53 @@
+# Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+
+# Shared WebRTC build settings. Presumably this is where the
+# MY_WEBRTC_COMMON_DEFS* variables used below are defined -- TODO confirm.
+include $(LOCAL_PATH)/../../../../android-webrtc.mk
+
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+LOCAL_MODULE := libwebrtc_aec
+LOCAL_MODULE_TAGS := optional
+
+# Sources built for every target architecture. The last entry deliberately has
+# no trailing backslash so the list cannot swallow whatever line follows it.
+LOCAL_SRC_FILES := \
+    echo_cancellation.c \
+    aec_resampler.c \
+    aec_core.c \
+    aec_rdft.c
+
+# SSE2 sources are added for x86 targets only. Testing the filter result for
+# non-emptiness keeps the condition false when TARGET_ARCH is unset.
+ifneq ($(filter x86 x86_64,$(TARGET_ARCH)),)
+LOCAL_SRC_FILES += \
+    aec_core_sse2.c \
+    aec_rdft_sse2.c
+endif
+
+# Flags passed to both C and C++ files.
+LOCAL_CFLAGS := \
+    $(MY_WEBRTC_COMMON_DEFS)
+
+# Additional per-architecture flags.
+LOCAL_CFLAGS_arm := $(MY_WEBRTC_COMMON_DEFS_arm)
+LOCAL_CFLAGS_x86 := $(MY_WEBRTC_COMMON_DEFS_x86)
+LOCAL_CFLAGS_mips := $(MY_WEBRTC_COMMON_DEFS_mips)
+LOCAL_CFLAGS_arm64 := $(MY_WEBRTC_COMMON_DEFS_arm64)
+LOCAL_CFLAGS_x86_64 := $(MY_WEBRTC_COMMON_DEFS_x86_64)
+LOCAL_CFLAGS_mips64 := $(MY_WEBRTC_COMMON_DEFS_mips64)
+
+LOCAL_C_INCLUDES := \
+    $(LOCAL_PATH)/include \
+    $(LOCAL_PATH)/../utility \
+    $(LOCAL_PATH)/../../../.. \
+    $(LOCAL_PATH)/../../../common_audio/signal_processing/include
+
+# When WEBRTC_STL is set, build against that NDK STL variant and suffix the
+# module name with it.
+ifdef WEBRTC_STL
+LOCAL_NDK_STL_VARIANT := $(WEBRTC_STL)
+LOCAL_SDK_VERSION := 14
+LOCAL_MODULE := $(LOCAL_MODULE)_$(WEBRTC_STL)
+endif
+
+include $(BUILD_STATIC_LIBRARY)
diff --git a/webrtc/modules/audio_processing/aec/aec_common.h b/webrtc/modules/audio_processing/aec/aec_common.h
new file mode 100644
index 0000000000..1e24ca9960
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_common.h
@@ -0,0 +1,32 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
+
+#include "webrtc/typedefs.h"
+
+// 16-byte alignment qualifiers. Declarations are written as
+//   ALIGN16_BEG <type> ALIGN16_END <name>
+// so that one macro expands to the compiler's alignment syntax and the other
+// expands to nothing.
+#ifdef _MSC_VER /* visual c++ */
+#define ALIGN16_BEG __declspec(align(16))
+#define ALIGN16_END
+#else /* gcc or icc */
+#define ALIGN16_BEG
+#define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+// 16-byte aligned, 65-entry lookup tables; the definitions (and the Matlab
+// snippets that generated them) live in aec_core.c.
+extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_sqrtHanning[65];
+extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_weightCurve[65];
+extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_overDriveCurve[65];
+// PSD smoothing coefficient pairs for the extended and the normal filter
+// mode; SmoothedPSD() in aec_core.c selects a row with aec->mult - 1.
+extern const float WebRtcAec_kExtendedSmoothingCoefficients[2][2];
+extern const float WebRtcAec_kNormalSmoothingCoefficients[2][2];
+// Lower bound applied to the far-end PSD before smoothing.
+extern const float WebRtcAec_kMinFarendPSD;
+
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
+
diff --git a/webrtc/modules/audio_processing/aec/aec_core.c b/webrtc/modules/audio_processing/aec/aec_core.c
new file mode 100644
index 0000000000..f8eed32372
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_core.c
@@ -0,0 +1,1929 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * The core AEC algorithm, which is presented with time-aligned signals.
+ */
+
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+#include <stdio.h>
+#endif
+
+#include <assert.h>
+#include <math.h>
+#include <stddef.h>  // size_t
+#include <stdlib.h>
+#include <string.h>
+
+#include "webrtc/common_audio/ring_buffer.h"
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/modules/audio_processing/aec/aec_common.h"
+#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
+#include "webrtc/modules/audio_processing/logging/aec_logging.h"
+#include "webrtc/modules/audio_processing/utility/delay_estimator_wrapper.h"
+#include "webrtc/system_wrappers/include/cpu_features_wrapper.h"
+#include "webrtc/typedefs.h"
+
+
+// Buffer size (in partitions of PART_LEN samples each)
+static const size_t kBufSizePartitions = 250;  // 1 second of audio in 16 kHz.
+
+// Metrics
+static const int subCountLen = 4;
+static const int countLen = 50;
+static const int kDelayMetricsAggregationWindow = 1250;  // 5 seconds at 16 kHz.
+
+// Quantities to control H band scaling for SWB input
+static const int flagHbandCn = 1;  // flag for adding comfort noise in H band
+static const float cnScaleHband =
+    (float)0.4;  // scale for comfort noise in H band
+// Initial bin for averaging nlp gain in low band
+static const int freqAvgIc = PART_LEN / 2;
+
+// Matlab code to produce table:
+// win = sqrt(hanning(127)); win = [0 ; win(1:64)];
+// fprintf(1, '\t%.14f, %.14f, %.14f,\n', win);
+ALIGN16_BEG const float ALIGN16_END WebRtcAec_sqrtHanning[65] = {
+    0.00000000000000f, 0.02454122852291f, 0.04906767432742f, 0.07356456359967f,
+    0.09801714032956f, 0.12241067519922f, 0.14673047445536f, 0.17096188876030f,
+    0.19509032201613f, 0.21910124015687f, 0.24298017990326f, 0.26671275747490f,
+    0.29028467725446f, 0.31368174039889f, 0.33688985339222f, 0.35989503653499f,
+    0.38268343236509f, 0.40524131400499f, 0.42755509343028f, 0.44961132965461f,
+    0.47139673682600f, 0.49289819222978f, 0.51410274419322f, 0.53499761988710f,
+    0.55557023301960f, 0.57580819141785f, 0.59569930449243f, 0.61523159058063f,
+    0.63439328416365f, 0.65317284295378f, 0.67155895484702f, 0.68954054473707f,
+    0.70710678118655f, 0.72424708295147f, 0.74095112535496f, 0.75720884650648f,
+    0.77301045336274f, 0.78834642762661f, 0.80320753148064f, 0.81758481315158f,
+    0.83146961230255f, 0.84485356524971f, 0.85772861000027f, 0.87008699110871f,
+    0.88192126434835f, 0.89322430119552f, 0.90398929312344f, 0.91420975570353f,
+    0.92387953251129f, 0.93299279883474f, 0.94154406518302f, 0.94952818059304f,
+    0.95694033573221f, 0.96377606579544f, 0.97003125319454f, 0.97570213003853f,
+    0.98078528040323f, 0.98527764238894f, 0.98917650996478f, 0.99247953459871f,
+    0.99518472667220f, 0.99729045667869f, 0.99879545620517f, 0.99969881869620f,
+    1.00000000000000f};
+
+// Matlab code to produce table:
+// weightCurve = [0 ; 0.3 * sqrt(linspace(0,1,64))' + 0.1];
+// fprintf(1, '\t%.4f, %.4f, %.4f, %.4f, %.4f, %.4f,\n', weightCurve);
+ALIGN16_BEG const float ALIGN16_END WebRtcAec_weightCurve[65] = {
+    0.0000f, 0.1000f, 0.1378f, 0.1535f, 0.1655f, 0.1756f, 0.1845f, 0.1926f,
+    0.2000f, 0.2069f, 0.2134f, 0.2195f, 0.2254f, 0.2309f, 0.2363f, 0.2414f,
+    0.2464f, 0.2512f, 0.2558f, 0.2604f, 0.2648f, 0.2690f, 0.2732f, 0.2773f,
+    0.2813f, 0.2852f, 0.2890f, 0.2927f, 0.2964f, 0.3000f, 0.3035f, 0.3070f,
+    0.3104f, 0.3138f, 0.3171f, 0.3204f, 0.3236f, 0.3268f, 0.3299f, 0.3330f,
+    0.3360f, 0.3390f, 0.3420f, 0.3449f, 0.3478f, 0.3507f, 0.3535f, 0.3563f,
+    0.3591f, 0.3619f, 0.3646f, 0.3673f, 0.3699f, 0.3726f, 0.3752f, 0.3777f,
+    0.3803f, 0.3828f, 0.3854f, 0.3878f, 0.3903f, 0.3928f, 0.3952f, 0.3976f,
+    0.4000f};
+
+// Matlab code to produce table:
+// overDriveCurve = [sqrt(linspace(0,1,65))' + 1];
+// fprintf(1, '\t%.4f, %.4f, %.4f, %.4f, %.4f, %.4f,\n', overDriveCurve);
+ALIGN16_BEG const float ALIGN16_END WebRtcAec_overDriveCurve[65] = {
+    1.0000f, 1.1250f, 1.1768f, 1.2165f, 1.2500f, 1.2795f, 1.3062f, 1.3307f,
+    1.3536f, 1.3750f, 1.3953f, 1.4146f, 1.4330f, 1.4507f, 1.4677f, 1.4841f,
+    1.5000f, 1.5154f, 1.5303f, 1.5449f, 1.5590f, 1.5728f, 1.5863f, 1.5995f,
+    1.6124f, 1.6250f, 1.6374f, 1.6495f, 1.6614f, 1.6731f, 1.6847f, 1.6960f,
+    1.7071f, 1.7181f, 1.7289f, 1.7395f, 1.7500f, 1.7603f, 1.7706f, 1.7806f,
+    1.7906f, 1.8004f, 1.8101f, 1.8197f, 1.8292f, 1.8385f, 1.8478f, 1.8570f,
+    1.8660f, 1.8750f, 1.8839f, 1.8927f, 1.9014f, 1.9100f, 1.9186f, 1.9270f,
+    1.9354f, 1.9437f, 1.9520f, 1.9601f, 1.9682f, 1.9763f, 1.9843f, 1.9922f,
+    2.0000f};
+
+// Delay Agnostic AEC parameters, still under development and may change.
+static const float kDelayQualityThresholdMax = 0.07f;
+static const float kDelayQualityThresholdMin = 0.01f;
+static const int kInitialShiftOffset = 5;
+#if !defined(WEBRTC_ANDROID)
+static const int kDelayCorrectionStart = 1500;  // 10 ms chunks
+#endif
+
+// Target suppression levels for nlp modes.
+// log{0.001, 0.00001, 0.00000001}
+static const float kTargetSupp[3] = {-6.9f, -11.5f, -18.4f};
+
+// Two sets of parameters, one for the extended filter mode.
+static const float kExtendedMinOverDrive[3] = {3.0f, 6.0f, 15.0f};
+static const float kNormalMinOverDrive[3] = {1.0f, 2.0f, 5.0f};
+const float WebRtcAec_kExtendedSmoothingCoefficients[2][2] = {{0.9f, 0.1f},
+                                                              {0.92f, 0.08f}};
+const float WebRtcAec_kNormalSmoothingCoefficients[2][2] = {{0.9f, 0.1f},
+                                                            {0.93f, 0.07f}};
+
+// Number of partitions forming the NLP's "preferred" bands.
+enum {
+  kPrefBandSize = 24
+};
+
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+extern int webrtc_aec_instance_count;
+#endif
+
+WebRtcAecFilterFar WebRtcAec_FilterFar;
+WebRtcAecScaleErrorSignal WebRtcAec_ScaleErrorSignal;
+WebRtcAecFilterAdaptation WebRtcAec_FilterAdaptation;
+WebRtcAecOverdriveAndSuppress WebRtcAec_OverdriveAndSuppress;
+WebRtcAecComfortNoise WebRtcAec_ComfortNoise;
+WebRtcAecSubBandCoherence WebRtcAec_SubbandCoherence;
+
+// Returns the real part of the complex product (aRe + j*aIm) * (bRe + j*bIm).
+__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
+  return aRe * bRe - aIm * bIm;
+}
+
+// Returns the imaginary part of the complex product
+// (aRe + j*aIm) * (bRe + j*bIm).
+__inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
+  return aRe * bIm + aIm * bRe;
+}
+
+// Three-way comparison of the floats that |a| and |b| point to, with the
+// signature expected of a qsort()-style comparator. Returns -1 when *a < *b,
+// 1 when *a > *b and 0 otherwise.
+static int CmpFloat(const void* a, const void* b) {
+  const float lhs = *(const float*)a;
+  const float rhs = *(const float*)b;
+
+  if (lhs < rhs) {
+    return -1;
+  }
+  return lhs > rhs ? 1 : 0;
+}
+
+// Accumulates the frequency-domain filter output into |yf|: for every filter
+// partition, the matching block of the far-end spectrum buffer is multiplied
+// (as complex numbers) with that partition's coefficients and added to |yf|.
+static void FilterFar(AecCore* aec, float yf[2][PART_LEN1]) {
+  int partition;
+  for (partition = 0; partition < aec->num_partitions; partition++) {
+    const int w_offset = partition * PART_LEN1;
+    int block = partition + aec->xfBufBlockPos;
+    int x_offset;
+    int bin;
+
+    // The far-end block index wraps around after num_partitions blocks.
+    if (block >= aec->num_partitions) {
+      block -= aec->num_partitions;
+    }
+    x_offset = block * PART_LEN1;
+
+    for (bin = 0; bin < PART_LEN1; bin++) {
+      const float x_re = aec->xfBuf[0][x_offset + bin];
+      const float x_im = aec->xfBuf[1][x_offset + bin];
+      const float w_re = aec->wfBuf[0][w_offset + bin];
+      const float w_im = aec->wfBuf[1][w_offset + bin];
+
+      yf[0][bin] += MulRe(x_re, x_im, w_re, w_im);
+      yf[1][bin] += MulIm(x_re, x_im, w_re, w_im);
+    }
+  }
+}
+
+static void ScaleErrorSignal(AecCore* aec, float ef[2][PART_LEN1]) {
+  const float mu = aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
+  const float error_threshold = aec->extended_filter_enabled
+                                    ? kExtendedErrorThreshold
+                                    : aec->normal_error_threshold;
+  int i;
+  float abs_ef;
+  for (i = 0; i < (PART_LEN1); i++) {
+    ef[0][i] /= (aec->xPow[i] + 1e-10f);
+    ef[1][i] /= (aec->xPow[i] + 1e-10f);
+    abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
+
+    if (abs_ef > error_threshold) {
+      abs_ef = error_threshold / (abs_ef + 1e-10f);
+      ef[0][i] *= abs_ef;
+      ef[1][i] *= abs_ef;
+    }
+
+    // Stepsize factor
+    ef[0][i] *= mu;
+    ef[1][i] *= mu;
+  }
+}
+
+// Time-unconstrained filter adaptation.
+// TODO(andrew): consider for a low-complexity mode.
+// static void FilterAdaptationUnconstrained(AecCore* aec, float *fft,
+//                                          float ef[2][PART_LEN1]) {
+//  int i, j;
+//  for (i = 0; i < aec->num_partitions; i++) {
+//    int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
+//    int pos;
+//    // Check for wrap
+//    if (i + aec->xfBufBlockPos >= aec->num_partitions) {
+//      xPos -= aec->num_partitions * PART_LEN1;
+//    }
+//
+//    pos = i * PART_LEN1;
+//
+//    for (j = 0; j < PART_LEN1; j++) {
+//      aec->wfBuf[0][pos + j] += MulRe(aec->xfBuf[0][xPos + j],
+//                                      -aec->xfBuf[1][xPos + j],
+//                                      ef[0][j], ef[1][j]);
+//      aec->wfBuf[1][pos + j] += MulIm(aec->xfBuf[0][xPos + j],
+//                                      -aec->xfBuf[1][xPos + j],
+//                                      ef[0][j], ef[1][j]);
+//    }
+//  }
+//}
+
+// Constrained filter adaptation. For every partition the update term
+// conj(far-end spectrum) * |ef| is formed, transformed to the time domain,
+// its second half is zeroed, and the result is transformed back and added to
+// the filter coefficients in aec->wfBuf. |fft| is scratch space that must
+// hold PART_LEN2 floats.
+static void FilterAdaptation(AecCore* aec, float* fft, float ef[2][PART_LEN1]) {
+  int i, j;
+  for (i = 0; i < aec->num_partitions; i++) {
+    int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1);
+    int pos;
+    // Check for wrap
+    if (i + aec->xfBufBlockPos >= aec->num_partitions) {
+      xPos -= aec->num_partitions * PART_LEN1;
+    }
+
+    pos = i * PART_LEN1;
+
+    // Fill |fft| with interleaved (re, im) pairs. Negating the imaginary part
+    // of xfBuf conjugates the far-end spectrum.
+    for (j = 0; j < PART_LEN; j++) {
+
+      fft[2 * j] = MulRe(aec->xfBuf[0][xPos + j],
+                         -aec->xfBuf[1][xPos + j],
+                         ef[0][j],
+                         ef[1][j]);
+      fft[2 * j + 1] = MulIm(aec->xfBuf[0][xPos + j],
+                             -aec->xfBuf[1][xPos + j],
+                             ef[0][j],
+                             ef[1][j]);
+    }
+    // fft[1] carries the real part of bin PART_LEN (same packing that
+    // StoreAsComplex() reads), overwriting the imaginary part written for
+    // bin 0 by the loop above.
+    fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
+                   -aec->xfBuf[1][xPos + PART_LEN],
+                   ef[0][PART_LEN],
+                   ef[1][PART_LEN]);
+
+    aec_rdft_inverse_128(fft);
+    // Constrain the update: discard the second half of the time-domain block.
+    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
+
+    // fft scaling
+    {
+      float scale = 2.0f / PART_LEN2;
+      for (j = 0; j < PART_LEN; j++) {
+        fft[j] *= scale;
+      }
+    }
+    aec_rdft_forward_128(fft);
+
+    // Unpack: fft[0] and fft[1] are the real parts of bins 0 and PART_LEN;
+    // the remaining bins are interleaved (re, im) pairs.
+    aec->wfBuf[0][pos] += fft[0];
+    aec->wfBuf[0][pos + PART_LEN] += fft[1];
+
+    for (j = 1; j < PART_LEN; j++) {
+      aec->wfBuf[0][pos + j] += fft[2 * j];
+      aec->wfBuf[1][pos + j] += fft[2 * j + 1];
+    }
+  }
+}
+
+static void OverdriveAndSuppress(AecCore* aec,
+                                 float hNl[PART_LEN1],
+                                 const float hNlFb,
+                                 float efw[2][PART_LEN1]) {
+  int i;
+  for (i = 0; i < PART_LEN1; i++) {
+    // Weight subbands
+    if (hNl[i] > hNlFb) {
+      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
+               (1 - WebRtcAec_weightCurve[i]) * hNl[i];
+    }
+    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
+
+    // Suppress error signal
+    efw[0][i] *= hNl[i];
+    efw[1][i] *= hNl[i];
+
+    // Ooura fft returns incorrect sign on imaginary component. It matters here
+    // because we are making an additive change with comfort noise.
+    efw[1][i] *= -1;
+  }
+}
+
+// Returns the index of the filter partition whose coefficients hold the most
+// energy. Ties keep the lowest index, and 0 is returned when every partition
+// has zero energy.
+// TODO(bjornv): Spread computational cost by computing one partition per
+// block?
+static int PartitionDelay(const AecCore* aec) {
+  int best_partition = 0;
+  float best_energy = 0;
+  int partition;
+
+  for (partition = 0; partition < aec->num_partitions; partition++) {
+    const float* re = &aec->wfBuf[0][partition * PART_LEN1];
+    const float* im = &aec->wfBuf[1][partition * PART_LEN1];
+    float energy = 0;
+    int bin;
+
+    for (bin = 0; bin < PART_LEN1; bin++) {
+      energy += re[bin] * re[bin] + im[bin] * im[bin];
+    }
+
+    if (energy > best_energy) {
+      best_energy = energy;
+      best_partition = partition;
+    }
+  }
+  return best_partition;
+}
+
+// Threshold to protect against the ill-effects of a zero far-end.
+const float WebRtcAec_kMinFarendPSD = 15;
+
+// Updates the following smoothed Power Spectral Densities (PSD):
+//  - sd  : near-end
+//  - se  : residual echo
+//  - sx  : far-end
+//  - sde : cross-PSD of near-end and residual echo
+//  - sxd : cross-PSD of near-end and far-end
+//
+// In addition to updating the PSDs, the filter diverge state is also
+// determined, upon which actions are taken: while diverged, |efw| is
+// overwritten with |dfw|, and (normal filter mode only) the filter
+// coefficients are cleared when the residual is far larger than the near-end.
+static void SmoothedPSD(AecCore* aec,
+                        float efw[2][PART_LEN1],
+                        float dfw[2][PART_LEN1],
+                        float xfw[2][PART_LEN1]) {
+  // Power estimate smoothing coefficients.
+  // ptrGCoh[0] weights the previous estimate, ptrGCoh[1] the new sample.
+  const float* ptrGCoh = aec->extended_filter_enabled
+      ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
+      : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
+  int i;
+  float sdSum = 0, seSum = 0;
+
+  for (i = 0; i < PART_LEN1; i++) {
+    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
+                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
+    aec->se[i] = ptrGCoh[0] * aec->se[i] +
+                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
+    // We threshold here to protect against the ill-effects of a zero farend.
+    // The threshold is not arbitrarily chosen, but balances protection and
+    // adverse interaction with the algorithm's tuning.
+    // TODO(bjornv): investigate further why this is so sensitive.
+    aec->sx[i] =
+        ptrGCoh[0] * aec->sx[i] +
+        ptrGCoh[1] * WEBRTC_SPL_MAX(
+            xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
+            WebRtcAec_kMinFarendPSD);
+
+    // Cross-PSDs: element [0] holds the real and [1] the imaginary part.
+    aec->sde[i][0] =
+        ptrGCoh[0] * aec->sde[i][0] +
+        ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
+    aec->sde[i][1] =
+        ptrGCoh[0] * aec->sde[i][1] +
+        ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
+
+    aec->sxd[i][0] =
+        ptrGCoh[0] * aec->sxd[i][0] +
+        ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
+    aec->sxd[i][1] =
+        ptrGCoh[0] * aec->sxd[i][1] +
+        ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);
+
+    sdSum += aec->sd[i];
+    seSum += aec->se[i];
+  }
+
+  // Divergent filter safeguard.
+  // The factor is 1.05 while already diverged and 1.0 otherwise, which gives
+  // the state some hysteresis.
+  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;
+
+  // While diverged, replace the error spectrum with the near-end spectrum.
+  if (aec->divergeState)
+    memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1);
+
+  // Reset if error is significantly larger than nearend (13 dB).
+  if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum))
+    memset(aec->wfBuf, 0, sizeof(aec->wfBuf));
+}
+
+// Applies the analysis window to the 2 * PART_LEN samples in |x| and writes
+// the result to |x_windowed|. The first half is weighted with the rising
+// slope of WebRtcAec_sqrtHanning and the second half with the same table read
+// backwards.
+__inline static void WindowData(float* x_windowed, const float* x) {
+  const float* x_second_half = x + PART_LEN;
+  float* windowed_second_half = x_windowed + PART_LEN;
+  int n;
+  for (n = 0; n < PART_LEN; n++) {
+    x_windowed[n] = x[n] * WebRtcAec_sqrtHanning[n];
+    windowed_second_half[n] =
+        x_second_half[n] * WebRtcAec_sqrtHanning[PART_LEN - n];
+  }
+}
+
+// Unpacks fft output |data| into separate real (row 0) and imaginary (row 1)
+// arrays covering bins 0..PART_LEN.
+__inline static void StoreAsComplex(const float* data,
+                                    float data_complex[2][PART_LEN1]) {
+  int bin;
+
+  // Bins 1..PART_LEN-1 are stored as interleaved (re, im) pairs.
+  for (bin = 1; bin < PART_LEN; bin++) {
+    const float* pair = data + 2 * bin;
+    data_complex[0][bin] = pair[0];
+    data_complex[1][bin] = pair[1];
+  }
+
+  // data[0] and data[1] hold the real parts of bins 0 and PART_LEN; their
+  // imaginary parts are set to zero.
+  data_complex[0][0] = data[0];
+  data_complex[1][0] = 0;
+  data_complex[0][PART_LEN] = data[1];
+  data_complex[1][PART_LEN] = 0;
+}
+
+// Computes the per-bin coherence between near-end and error (|cohde|) and
+// between far-end and near-end (|cohxd|). As a side effect |xfw| is filled
+// with the delayed far-end spectrum, |efw| with the windowed error spectrum,
+// and the smoothed PSDs in |aec| are updated. |fft| is scratch space for
+// PART_LEN2 floats.
+static void SubbandCoherence(AecCore* aec,
+                             float efw[2][PART_LEN1],
+                             float xfw[2][PART_LEN1],
+                             float* fft,
+                             float* cohde,
+                             float* cohxd) {
+  float dfw[2][PART_LEN1];
+  int i;
+
+  // Re-estimate the dominant filter partition only when the counter is zero.
+  if (aec->delayEstCtr == 0)
+    aec->delayIdx = PartitionDelay(aec);
+
+  // Use delayed far.
+  // NOTE(review): the declaration of xfwBuf is not visible here; confirm that
+  // its element layout matches the planar float[2][PART_LEN1] of |xfw|.
+  memcpy(xfw,
+         aec->xfwBuf + aec->delayIdx * PART_LEN1,
+         sizeof(xfw[0][0]) * 2 * PART_LEN1);
+
+  // Windowed near fft
+  WindowData(fft, aec->dBuf);
+  aec_rdft_forward_128(fft);
+  StoreAsComplex(fft, dfw);
+
+  // Windowed error fft
+  WindowData(fft, aec->eBuf);
+  aec_rdft_forward_128(fft);
+  StoreAsComplex(fft, efw);
+
+  SmoothedPSD(aec, efw, dfw, xfw);
+
+  // Subband coherence
+  // |cross-PSD|^2 divided by the product of the two auto-PSDs; the small
+  // constant avoids division by zero.
+  for (i = 0; i < PART_LEN1; i++) {
+    cohde[i] =
+        (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
+        (aec->sd[i] * aec->se[i] + 1e-10f);
+    cohxd[i] =
+        (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
+        (aec->sx[i] * aec->sd[i] + 1e-10f);
+  }
+}
+
+// Writes to nlpGainHband[0] the mean of |lambda| over the bins
+// freqAvgIc .. PART_LEN1 - 2 (the last bin is left out of the average).
+static void GetHighbandGain(const float* lambda, float* nlpGainHband) {
+  const int end_bin = PART_LEN1 - 1;
+  int bin = freqAvgIc;
+
+  nlpGainHband[0] = (float)0.0;
+  while (bin < end_bin) {
+    nlpGainHband[0] += lambda[bin];
+    ++bin;
+  }
+  nlpGainHband[0] /= (float)(end_bin - freqAvgIc);
+}
+
+static void ComfortNoise(AecCore* aec,
+                         float efw[2][PART_LEN1],
+                         complex_t* comfortNoiseHband,
+                         const float* noisePow,
+                         const float* lambda) {
+  int i, num;
+  float rand[PART_LEN];
+  float noise, noiseAvg, tmp, tmpAvg;
+  int16_t randW16[PART_LEN];
+  complex_t u[PART_LEN1];
+
+  const float pi2 = 6.28318530717959f;
+
+  // Generate a uniform random array on [0 1]
+  WebRtcSpl_RandUArray(randW16, PART_LEN, &aec->seed);
+  for (i = 0; i < PART_LEN; i++) {
+    rand[i] = ((float)randW16[i]) / 32768;
+  }
+
+  // Reject LF noise
+  u[0][0] = 0;
+  u[0][1] = 0;
+  for (i = 1; i < PART_LEN1; i++) {
+    tmp = pi2 * rand[i - 1];
+
+    noise = sqrtf(noisePow[i]);
+    u[i][0] = noise * cosf(tmp);
+    u[i][1] = -noise * sinf(tmp);
+  }
+  u[PART_LEN][1] = 0;
+
+  for (i = 0; i < PART_LEN1; i++) {
+    // This is the proper weighting to match the background noise power
+    tmp = sqrtf(WEBRTC_SPL_MAX(1 - lambda[i] * lambda[i], 0));
+    // tmp = 1 - lambda[i];
+    efw[0][i] += tmp * u[i][0];
+    efw[1][i] += tmp * u[i][1];
+  }
+
+  // For H band comfort noise
+  // TODO: don't compute noise and "tmp" twice. Use the previous results.
+  noiseAvg = 0.0;
+  tmpAvg = 0.0;
+  num = 0;
+  if (aec->num_bands > 1 && flagHbandCn == 1) {
+
+    // average noise scale
+    // average over second half of freq spectrum (i.e., 4->8khz)
+    // TODO: we shouldn't need num. We know how many elements we're summing.
+    for (i = PART_LEN1 >> 1; i < PART_LEN1; i++) {
+      num++;
+      noiseAvg += sqrtf(noisePow[i]);
+    }
+    noiseAvg /= (float)num;
+
+    // average nlp scale
+    // average over second half of freq spectrum (i.e., 4->8khz)
+    // TODO: we shouldn't need num. We know how many elements we're summing.
+    num = 0;
+    for (i = PART_LEN1 >> 1; i < PART_LEN1; i++) {
+      num++;
+      tmpAvg += sqrtf(WEBRTC_SPL_MAX(1 - lambda[i] * lambda[i], 0));
+    }
+    tmpAvg /= (float)num;
+
+    // Use average noise for H band
+    // TODO: we should probably have a new random vector here.
+    // Reject LF noise
+    u[0][0] = 0;
+    u[0][1] = 0;
+    for (i = 1; i < PART_LEN1; i++) {
+      tmp = pi2 * rand[i - 1];
+
+      // Use average noise for H band
+      u[i][0] = noiseAvg * (float)cos(tmp);
+      u[i][1] = -noiseAvg * (float)sin(tmp);
+    }
+    u[PART_LEN][1] = 0;
+
+    for (i = 0; i < PART_LEN1; i++) {
+      // Use average NLP weight for H band
+      comfortNoiseHband[i][0] = tmpAvg * u[i][0];
+      comfortNoiseHband[i][1] = tmpAvg * u[i][1];
+    }
+  }
+}
+
+// Resets a PowerLevel record: all sums, counters and levels are cleared and
+// the running minimum starts at a very large value so that the first positive
+// frame level replaces it.
+static void InitLevel(PowerLevel* level) {
+  const float kBigFloat = 1E17f;
+
+  // Sub-frame accumulation.
+  level->sfrsum = 0;
+  level->sfrcounter = 0;
+
+  // Frame accumulation.
+  level->frsum = 0;
+  level->frcounter = 0;
+
+  // Derived levels.
+  level->framelevel = 0;
+  level->averagelevel = 0;
+  level->minlevel = kBigFloat;
+}
+
+// Resets a Stats record. The value fields start at kOffsetLevel, except |min|
+// which starts at the negated offset; sums and counters start at zero.
+static void InitStats(Stats* stats) {
+  // Accumulators.
+  stats->sum = 0;
+  stats->counter = 0;
+  stats->hisum = 0;
+  stats->hicounter = 0;
+
+  // Reported values.
+  stats->instant = kOffsetLevel;
+  stats->average = kOffsetLevel;
+  stats->himean = kOffsetLevel;
+  stats->max = kOffsetLevel;
+  stats->min = kOffsetLevel * (-1);
+}
+
+// Resets all echo metrics state of |self|: the state counter, the four power
+// level trackers and the four statistics records.
+static void InitMetrics(AecCore* self) {
+  PowerLevel* const levels[] = {&self->farlevel, &self->nearlevel,
+                                &self->linoutlevel, &self->nlpoutlevel};
+  Stats* const stats[] = {&self->erl, &self->erle, &self->aNlp, &self->rerl};
+  size_t k;
+
+  self->stateCounter = 0;
+
+  for (k = 0; k < sizeof(levels) / sizeof(levels[0]); k++) {
+    InitLevel(levels[k]);
+  }
+  for (k = 0; k < sizeof(stats) / sizeof(stats[0]); k++) {
+    InitStats(stats[k]);
+  }
+}
+
+// Feeds the energy of one block, given by its spectrum |in|, into the running
+// power level statistics of |level|.
+//
+// The energy is computed in the frequency domain. By Parseval's relation
+//
+// \sum_{n=0}^{N-1} |x(n)|^2 = 1/N * \sum_{n=0}^{N-1} |X(n)|^2 = ENERGY,
+//
+// with N = PART_LEN2. The FFT covers PART_LEN2 samples because of overlap,
+// but only the last PART_LEN samples are of interest, so their energy is
+// approximated as ENERGY / 2. Only bins [0, PART_LEN] are stored for a real
+// signal; the missing bins mirror bins [1, PART_LEN-1], whose contribution
+// therefore counts twice. That factor of 2 cancels against the division by 2,
+// which is why only the two end bins are halved below.
+//
+// TODO(bjornv): Investigate reusing energy calculations performed at other
+// places in the code.
+static void UpdateLevel(PowerLevel* level, float in[2][PART_LEN1]) {
+  int bin;
+  // Imaginary parts are zero at end points and left out of the calculation.
+  float energy = (in[0][0] * in[0][0]) / 2;
+  energy += (in[0][PART_LEN] * in[0][PART_LEN]) / 2;
+
+  for (bin = 1; bin < PART_LEN; bin++) {
+    energy += (in[0][bin] * in[0][bin] + in[1][bin] * in[1][bin]);
+  }
+  energy /= PART_LEN2;
+
+  level->sfrsum += energy;
+  level->sfrcounter++;
+
+  // Nothing more to do until enough sub-frames have been accumulated.
+  if (level->sfrcounter <= subCountLen) {
+    return;
+  }
+
+  // Close the frame and restart the sub-frame accumulation.
+  level->framelevel = level->sfrsum / (subCountLen * PART_LEN);
+  level->sfrsum = 0;
+  level->sfrcounter = 0;
+
+  // Track the minimum over non-silent frames, letting it creep upwards.
+  if (level->framelevel > 0) {
+    if (level->framelevel < level->minlevel) {
+      level->minlevel = level->framelevel;  // New minimum.
+    } else {
+      level->minlevel *= (1 + 0.001f);  // Small increase.
+    }
+  }
+
+  level->frcounter++;
+  level->frsum += level->framelevel;
+  if (level->frcounter > countLen) {
+    level->averagelevel = level->frsum / countLen;
+    level->frsum = 0;
+    level->frcounter = 0;
+  }
+}
+
+// Folds one new measurement into a running Stats record. |instant| is stored
+// as the latest value, while |sample| drives max, min, sum, average and the
+// "upper mean" (the mean of the samples that exceeded the running average).
+static void AccumulateStats(Stats* stats, float instant, float sample) {
+  stats->instant = instant;
+  if (sample > stats->max) {
+    stats->max = sample;
+  }
+
+  if (sample < stats->min) {
+    stats->min = sample;
+  }
+
+  stats->counter++;
+  stats->sum += sample;
+  stats->average = stats->sum / stats->counter;
+
+  // Upper mean
+  if (sample > stats->average) {
+    stats->hicounter++;
+    stats->hisum += sample;
+    stats->himean = stats->hisum / stats->hicounter;
+  }
+}
+
+// Updates the ERL, A_NLP and ERLE statistics of |aec| from the averaged power
+// levels. New values are only produced when the far-end frame counter has
+// just wrapped, echo was flagged in more than half of the preceding blocks,
+// and the far-end level is sufficiently above its noise floor. The state
+// counter is cleared whenever the far-end frame counter is zero.
+static void UpdateMetrics(AecCore* aec) {
+  float dtmp, dtmp2;
+
+  const float actThresholdNoisy = 8.0f;
+  const float actThresholdClean = 40.0f;
+  const float safety = 0.99995f;
+  const float noisyPower = 300000.0f;
+
+  float actThreshold;
+  float echo, suppressedEcho;
+
+  if (aec->echoState) {  // Check if echo is likely present
+    aec->stateCounter++;
+  }
+
+  if (aec->farlevel.frcounter == 0) {
+
+    // A low far-end minimum level selects the "clean" activity threshold.
+    if (aec->farlevel.minlevel < noisyPower) {
+      actThreshold = actThresholdClean;
+    } else {
+      actThreshold = actThresholdNoisy;
+    }
+
+    if ((aec->stateCounter > (0.5f * countLen * subCountLen)) &&
+        (aec->farlevel.sfrcounter == 0)
+
+        // Estimate in active far-end segments only
+        &&
+        (aec->farlevel.averagelevel >
+         (actThreshold * aec->farlevel.minlevel))) {
+
+      // Subtract noise power
+      echo = aec->nearlevel.averagelevel - safety * aec->nearlevel.minlevel;
+
+      // ERL
+      dtmp = 10 * (float)log10(aec->farlevel.averagelevel /
+                                   aec->nearlevel.averagelevel +
+                               1e-10f);
+      AccumulateStats(&aec->erl, dtmp, dtmp);
+
+      // A_NLP
+      dtmp = 10 * (float)log10(aec->nearlevel.averagelevel /
+                                   (2 * aec->linoutlevel.averagelevel) +
+                               1e-10f);
+
+      // subtract noise power
+      suppressedEcho = 2 * (aec->linoutlevel.averagelevel -
+                            safety * aec->linoutlevel.minlevel);
+
+      dtmp2 = 10 * (float)log10(echo / suppressedEcho + 1e-10f);
+
+      // NOTE(review): the instant value is the noise-compensated ratio while
+      // max/min/average use the uncompensated one. This mirrors the previous
+      // behavior; confirm whether the mismatch is intended.
+      AccumulateStats(&aec->aNlp, dtmp2, dtmp);
+
+      // ERLE
+
+      // subtract noise power
+      suppressedEcho = 2 * (aec->nlpoutlevel.averagelevel -
+                            safety * aec->nlpoutlevel.minlevel);
+
+      dtmp2 = 10 * (float)log10(echo / suppressedEcho + 1e-10f);
+      AccumulateStats(&aec->erle, dtmp2, dtmp2);
+    }
+
+    aec->stateCounter = 0;
+  }
+}
+
+// Converts the delay histogram gathered since the last update into the
+// logged delay metrics of |self|: |delay_median| and |delay_std| (scaled by
+// the block duration in ms) and |fraction_poor_delays|. All three are set to
+// -1 when no delay values have been collected. On a regular update the
+// histogram and its value counter are cleared afterwards.
+static void UpdateDelayMetrics(AecCore* self) {
+  const int lookahead = WebRtc_lookahead(self->delay_estimator);
+  const int kMsPerBlock = PART_LEN / (self->mult * 8);
+  const int histogram_length =
+      sizeof(self->delay_histogram) / sizeof(self->delay_histogram[0]);
+  int bin = 0;
+  int remaining = 0;
+  int median = 0;
+  int mean_abs_deviation = 0;
+  int in_bounds_end = 0;
+  int num_poor = 0;
+  int64_t l1_norm = 0;
+
+  if (self->num_delay_values == 0) {
+    // Nothing new to report. -1 is formally a valid |median| (negative values
+    // are allowed) but can practically never occur because only multiples of
+    // |kMsPerBlock| are produced, so it serves as the "no estimate" marker in
+    // the logs.
+    self->delay_median = -1;
+    self->delay_std = -1;
+    self->fraction_poor_delays = -1;
+    return;
+  }
+
+  // Median: count down from half the number of values until the running
+  // remainder turns negative; that bin holds the median.
+  remaining = self->num_delay_values >> 1;
+  for (bin = 0; bin < kHistorySizeBlocks; ++bin) {
+    remaining -= self->delay_histogram[bin];
+    if (remaining < 0) {
+      break;
+    }
+  }
+  median = (bin < kHistorySizeBlocks) ? bin : 0;
+  // Compensate for the lookahead and convert blocks to ms.
+  self->delay_median = (median - lookahead) * kMsPerBlock;
+
+  // L1 norm around the median, used as the spread measure.
+  for (bin = 0; bin < kHistorySizeBlocks; ++bin) {
+    l1_norm += abs(bin - median) * self->delay_histogram[bin];
+  }
+  mean_abs_deviation = (int)((l1_norm + self->num_delay_values / 2) /
+                             self->num_delay_values);
+  self->delay_std = mean_abs_deviation * kMsPerBlock;
+
+  // Fraction of delays that are out of bounds, i.e. either negative
+  // (anti-causal system) or larger than the AEC filter length. Start from all
+  // values and remove those inside [lookahead, lookahead + num_partitions),
+  // limited to the bins that exist in the histogram.
+  in_bounds_end = lookahead + self->num_partitions;
+  if (in_bounds_end > histogram_length) {
+    in_bounds_end = histogram_length;
+  }
+  num_poor = self->num_delay_values;
+  for (bin = lookahead; bin < in_bounds_end; ++bin) {
+    num_poor -= self->delay_histogram[bin];
+  }
+  self->fraction_poor_delays = (float)num_poor / self->num_delay_values;
+
+  // Start a fresh aggregation window.
+  memset(self->delay_histogram, 0, sizeof(self->delay_histogram));
+  self->num_delay_values = 0;
+}
+
+// Transforms the PART_LEN2 samples in |time_data| to PART_LEN1 frequency bins
+// in |freq_data|. |time_data| is used as scratch space and is overwritten by
+// the in-place transform. When |window| is non-zero the samples are first
+// weighted with |WebRtcAec_sqrtHanning|. The even-indexed transform outputs
+// go to freq_data[0] and the odd-indexed ones to freq_data[1]; the first and
+// last bins get a zero in freq_data[1].
+static void TimeToFrequency(float time_data[PART_LEN2],
+                            float freq_data[2][PART_LEN1],
+                            int window) {
+  int k;
+
+  // TODO(bjornv): Should we have a different function/wrapper for windowed FFT?
+  if (window != 0) {
+    float* first_half = time_data;
+    float* second_half = time_data + PART_LEN;
+    for (k = 0; k < PART_LEN; ++k) {
+      first_half[k] *= WebRtcAec_sqrtHanning[k];
+      second_half[k] *= WebRtcAec_sqrtHanning[PART_LEN - k];
+    }
+  }
+
+  aec_rdft_forward_128(time_data);
+
+  // Unpack the interleaved transform output. Slots 0 and 1 hold the values
+  // for the first and the last bin respectively.
+  freq_data[1][0] = 0;
+  freq_data[1][PART_LEN] = 0;
+  freq_data[0][0] = time_data[0];
+  freq_data[0][PART_LEN] = time_data[1];
+  for (k = 1; k < PART_LEN; ++k) {
+    const float* pair = &time_data[2 * k];
+    freq_data[0][k] = pair[0];
+    freq_data[1][k] = pair[1];
+  }
+}
+
+// Moves the read pointers of all far-end buffers by |elements| and returns
+// the value reported for |far_buf|. |system_delay| is deliberately left
+// unchanged here.
+static int MoveFarReadPtrWithoutSystemDelayUpdate(AecCore* self, int elements) {
+  int moved_elements;
+
+  WebRtc_MoveReadPtr(self->far_buf_windowed, elements);
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+  WebRtc_MoveReadPtr(self->far_time_buf, elements);
+#endif
+  moved_elements = WebRtc_MoveReadPtr(self->far_buf, elements);
+  return moved_elements;
+}
+
+// Proposes a shift (in far-end buffer elements) based on the latest estimate
+// from the delay estimator. Returns 0 when no correction should be applied.
+// Also updates |previous_delay|, |shift_offset|, |delay_correction_count| and
+// |delay_quality_threshold| in |self|.
+static int SignalBasedDelayCorrection(AecCore* self) {
+  int correction = 0;
+  int estimate = -2;
+  assert(self != NULL);
+#if !defined(WEBRTC_ANDROID)
+  // On desktops the correction is only switched on after
+  // |kDelayCorrectionStart| frames, giving the delay estimation a chance to
+  // converge.  In addition, with low (or muted) playout volume the estimator
+  // can report a very large delay, which would break the AEC if applied.
+  if (self->frame_count < kDelayCorrectionStart) {
+    return 0;
+  }
+#endif
+
+  // A correction is only considered when
+  // 1. the delay estimate is non-negative.  The estimates are not compensated
+  //    for lookahead, hence a negative |estimate| is an invalid one.
+  // 2. the delay has changed, and it lies outside a region that takes the AEC
+  //    filter length into account.
+  // TODO(bjornv): Investigate if we can remove the non-zero delay change check.
+  // 3. the delay estimation quality exceeds |delay_quality_threshold|.
+  // 4. the proposed correction is feasible given the amount of data in the
+  //    far-end buffer.
+  estimate = WebRtc_last_delay(self->delay_estimator);
+  if (estimate >= 0 &&
+      estimate != self->previous_delay &&
+      WebRtc_last_delay_quality(self->delay_estimator) >
+          self->delay_quality_threshold) {
+    const int delay = estimate - WebRtc_lookahead(self->delay_estimator);
+    // The adaptive filter is |num_partitions| (of 64 samples) long.  Delays
+    // at or below zero, or beyond 3/4 of the filter length, open up for
+    // correction.
+    const int upper_bound = self->num_partitions * 3 / 4;
+    if (delay <= 0 || delay > upper_bound) {
+      const int available_read = (int)WebRtc_available_read(self->far_buf);
+      // |shift_offset| makes us rely on the estimates gradually.  For delays
+      // above |shift_offset| the correction is reduced by that amount to
+      // lower the risk of ending up non causal.  Otherwise the value is
+      // trusted up to a rounding error, so one element is added to push the
+      // delay into the causal region.
+      correction = -delay;
+      if (delay > self->shift_offset) {
+        correction += self->shift_offset;
+      } else {
+        correction += 1;
+      }
+      self->shift_offset--;
+      if (self->shift_offset <= 1) {
+        self->shift_offset = 1;
+      }
+      if (correction > available_read - self->mult - 1) {
+        // Not enough buffered data to perform this shift, so the estimate is
+        // ignored and nothing is done.
+        correction = 0;
+      } else {
+        self->previous_delay = estimate;
+        ++self->delay_correction_count;
+      }
+    }
+  }
+  // Once the first correction has been made, raise |delay_quality_threshold|
+  // to the latest quality, capped at |kDelayQualityThresholdMax|.  The
+  // threshold is never lowered.
+  if (self->delay_correction_count > 0) {
+    float quality = WebRtc_last_delay_quality(self->delay_estimator);
+    if (quality > kDelayQualityThresholdMax) {
+      quality = kDelayQualityThresholdMax;
+    }
+    if (quality > self->delay_quality_threshold) {
+      self->delay_quality_threshold = quality;
+    }
+  }
+  return correction;
+}
+/* Runs the non-linear processing (NLP) stage on one block. It reads one
+ * windowed far-end spectrum from |far_buf_windowed|, derives the suppression
+ * gains |hNl| from the coherence measures |cohde| and |cohxd|, applies
+ * overdrive/suppression and comfort noise to the error spectrum |efw| and
+ * overlap-adds the inverse transform into |output| (PART_LEN samples). When
+ * |aec->num_bands| > 1 the high bands in |dBufH| are scaled by a common gain
+ * and written to |outputH|. Finally the |dBuf|, |eBuf|, |dBufH| and |xfwBuf|
+ * histories are shifted by one block.
+ */
+static void NonLinearProcessing(AecCore* aec,
+                                float* output,
+                                float* const* outputH) {
+  float efw[2][PART_LEN1], xfw[2][PART_LEN1];
+  complex_t comfortNoiseHband[PART_LEN1];
+  float fft[PART_LEN2];
+  float scale, dtmp;
+  float nlpGainHband;
+  int i;
+  size_t j;
+
+  // Coherence and non-linear filter
+  float cohde[PART_LEN1], cohxd[PART_LEN1];
+  float hNlDeAvg, hNlXdAvg;
+  float hNl[PART_LEN1];
+  float hNlPref[kPrefBandSize];
+  float hNlFb = 0, hNlFbLow = 0;
+  const float prefBandQuant = 0.75f, prefBandQuantLow = 0.5f;
+  const int prefBandSize = kPrefBandSize / aec->mult;
+  const int minPrefBand = 4 / aec->mult;
+  // Lower limits for the overdrive, indexed by |nlp_mode|; the table depends
+  // on whether the extended filter is enabled.
+  const float* min_overdrive = aec->extended_filter_enabled
+                                   ? kExtendedMinOverDrive
+                                   : kNormalMinOverDrive;
+
+  // |delayEstCtr| wraps around to zero after this many blocks.
+  const int delayEstInterval = 10 * aec->mult;
+
+  float* xfw_ptr = NULL;
+
+  aec->delayEstCtr++;
+  if (aec->delayEstCtr == delayEstInterval) {
+    aec->delayEstCtr = 0;
+  }
+
+  // initialize comfort noise for H band
+  memset(comfortNoiseHband, 0, sizeof(comfortNoiseHband));
+  nlpGainHband = (float)0.0;
+  dtmp = (float)0.0;
+
+  // We should always have at least one element stored in |far_buf_windowed|.
+  assert(WebRtc_available_read(aec->far_buf_windowed) > 0);
+  // NLP
+  WebRtc_ReadBuffer(aec->far_buf_windowed, (void**)&xfw_ptr, &xfw[0][0], 1);
+
+  // TODO(bjornv): Investigate if we can reuse |far_buf_windowed| instead of
+  // |xfwBuf|.
+  // Buffer far.
+  memcpy(aec->xfwBuf, xfw_ptr, sizeof(float) * 2 * PART_LEN1);
+
+  WebRtcAec_SubbandCoherence(aec, efw, xfw, fft, cohde, cohxd);
+  // Average both coherence measures over the preferred bands.
+  hNlXdAvg = 0;
+  for (i = minPrefBand; i < prefBandSize + minPrefBand; i++) {
+    hNlXdAvg += cohxd[i];
+  }
+  hNlXdAvg /= prefBandSize;
+  hNlXdAvg = 1 - hNlXdAvg;
+
+  hNlDeAvg = 0;
+  for (i = minPrefBand; i < prefBandSize + minPrefBand; i++) {
+    hNlDeAvg += cohde[i];
+  }
+  hNlDeAvg /= prefBandSize;
+  // Track the minimum of |hNlXdAvg|, considering only values below 0.75.
+  if (hNlXdAvg < 0.75f && hNlXdAvg < aec->hNlXdAvgMin) {
+    aec->hNlXdAvgMin = hNlXdAvg;
+  }
+  // |stNearState| is set and cleared with different thresholds (hysteresis).
+  if (hNlDeAvg > 0.98f && hNlXdAvg > 0.9f) {
+    aec->stNearState = 1;
+  } else if (hNlDeAvg < 0.95f || hNlXdAvg < 0.8f) {
+    aec->stNearState = 0;
+  }
+  // |hNlXdAvgMin| at its ceiling of 1: clear |echoState|, reset the overdrive.
+  if (aec->hNlXdAvgMin == 1) {
+    aec->echoState = 0;
+    aec->overDrive = min_overdrive[aec->nlp_mode];
+
+    if (aec->stNearState == 1) {
+      memcpy(hNl, cohde, sizeof(hNl));
+      hNlFb = hNlDeAvg;
+      hNlFbLow = hNlDeAvg;
+    } else {
+      for (i = 0; i < PART_LEN1; i++) {
+        hNl[i] = 1 - cohxd[i];
+      }
+      hNlFb = hNlXdAvg;
+      hNlFbLow = hNlXdAvg;
+    }
+  } else {
+
+    if (aec->stNearState == 1) {
+      aec->echoState = 0;
+      memcpy(hNl, cohde, sizeof(hNl));
+      hNlFb = hNlDeAvg;
+      hNlFbLow = hNlDeAvg;
+    } else {
+      aec->echoState = 1;
+      for (i = 0; i < PART_LEN1; i++) {
+        hNl[i] = WEBRTC_SPL_MIN(cohde[i], 1 - cohxd[i]);
+      }
+
+      // Select an order statistic from the preferred bands.
+      // TODO: Using quicksort now, but a selection algorithm may be preferred.
+      memcpy(hNlPref, &hNl[minPrefBand], sizeof(float) * prefBandSize);
+      qsort(hNlPref, prefBandSize, sizeof(float), CmpFloat);
+      hNlFb = hNlPref[(int)floor(prefBandQuant * (prefBandSize - 1))];
+      hNlFbLow = hNlPref[(int)floor(prefBandQuantLow * (prefBandSize - 1))];
+    }
+  }
+
+  // Track the local filter minimum to determine suppression overdrive.
+  if (hNlFbLow < 0.6f && hNlFbLow < aec->hNlFbLocalMin) {
+    aec->hNlFbLocalMin = hNlFbLow;
+    aec->hNlFbMin = hNlFbLow;
+    aec->hNlNewMin = 1;
+    aec->hNlMinCtr = 0;
+  }
+  aec->hNlFbLocalMin =
+      WEBRTC_SPL_MIN(aec->hNlFbLocalMin + 0.0008f / aec->mult, 1);
+  aec->hNlXdAvgMin = WEBRTC_SPL_MIN(aec->hNlXdAvgMin + 0.0006f / aec->mult, 1);
+  // The overdrive is recomputed on the second block after a new minimum.
+  if (aec->hNlNewMin == 1) {
+    aec->hNlMinCtr++;
+  }
+  if (aec->hNlMinCtr == 2) {
+    aec->hNlNewMin = 0;
+    aec->hNlMinCtr = 0;
+    aec->overDrive =
+        WEBRTC_SPL_MAX(kTargetSupp[aec->nlp_mode] /
+                           ((float)log(aec->hNlFbMin + 1e-10f) + 1e-10f),
+                       min_overdrive[aec->nlp_mode]);
+  }
+
+  // Smooth the overdrive; decreases are followed more slowly than increases.
+  if (aec->overDrive < aec->overDriveSm) {
+    aec->overDriveSm = 0.99f * aec->overDriveSm + 0.01f * aec->overDrive;
+  } else {
+    aec->overDriveSm = 0.9f * aec->overDriveSm + 0.1f * aec->overDrive;
+  }
+
+  WebRtcAec_OverdriveAndSuppress(aec, hNl, hNlFb, efw);
+
+  // Add comfort noise.
+  WebRtcAec_ComfortNoise(aec, efw, comfortNoiseHband, aec->noisePow, hNl);
+
+  // TODO(bjornv): Investigate how to take the windowing below into account if
+  // needed.
+  if (aec->metricsMode == 1) {
+    // Note that we have a scaling by two in the time domain |eBuf|.
+    // In addition the time domain signal is windowed before transformation,
+    // losing half the energy on the average. We take care of the first
+    // scaling only in UpdateMetrics().
+    UpdateLevel(&aec->nlpoutlevel, efw);
+  }
+  // Inverse error fft.
+  fft[0] = efw[0][0];
+  fft[1] = efw[0][PART_LEN];
+  for (i = 1; i < PART_LEN; i++) {
+    fft[2 * i] = efw[0][i];
+    // Sign change required by Ooura fft.
+    fft[2 * i + 1] = -efw[1][i];
+  }
+  aec_rdft_inverse_128(fft);
+
+  // Overlap and add to obtain output.
+  scale = 2.0f / PART_LEN2;
+  for (i = 0; i < PART_LEN; i++) {
+    fft[i] *= scale;  // fft scaling
+    fft[i] = fft[i] * WebRtcAec_sqrtHanning[i] + aec->outBuf[i];
+
+    fft[PART_LEN + i] *= scale;  // fft scaling
+    aec->outBuf[i] = fft[PART_LEN + i] * WebRtcAec_sqrtHanning[PART_LEN - i];
+
+    // Saturate output to keep it in the allowed range.
+    output[i] = WEBRTC_SPL_SAT(
+        WEBRTC_SPL_WORD16_MAX, fft[i], WEBRTC_SPL_WORD16_MIN);
+  }
+
+  // For H band
+  if (aec->num_bands > 1) {
+
+    // H band gain
+    // average nlp over low band: average over second half of freq spectrum
+    // (4->8khz)
+    GetHighbandGain(hNl, &nlpGainHband);
+
+    // Inverse comfort_noise
+    if (flagHbandCn == 1) {
+      fft[0] = comfortNoiseHband[0][0];
+      fft[1] = comfortNoiseHband[PART_LEN][0];
+      for (i = 1; i < PART_LEN; i++) {
+        fft[2 * i] = comfortNoiseHband[i][0];
+        fft[2 * i + 1] = comfortNoiseHband[i][1];
+      }
+      aec_rdft_inverse_128(fft);
+      scale = 2.0f / PART_LEN2;
+    }
+
+    // compute gain factor
+    for (j = 0; j < aec->num_bands - 1; ++j) {
+      for (i = 0; i < PART_LEN; i++) {
+        dtmp = aec->dBufH[j][i];
+        dtmp = dtmp * nlpGainHband;  // for variable gain
+
+        // add some comfort noise where Hband is attenuated
+        if (flagHbandCn == 1 && j == 0) {
+          fft[i] *= scale;  // fft scaling
+          dtmp += cnScaleHband * fft[i];
+        }
+
+        // Saturate output to keep it in the allowed range.
+        outputH[j][i] = WEBRTC_SPL_SAT(
+            WEBRTC_SPL_WORD16_MAX, dtmp, WEBRTC_SPL_WORD16_MIN);
+      }
+    }
+  }
+
+  // Copy the current block to the old position.
+  memcpy(aec->dBuf, aec->dBuf + PART_LEN, sizeof(float) * PART_LEN);
+  memcpy(aec->eBuf, aec->eBuf + PART_LEN, sizeof(float) * PART_LEN);
+
+  // Copy the current block to the old position for H band
+  for (j = 0; j < aec->num_bands - 1; ++j) {
+    memcpy(aec->dBufH[j], aec->dBufH[j] + PART_LEN, sizeof(float) * PART_LEN);
+  }
+  // Shift |xfwBuf| by one partition; the newest spectrum stays at the front.
+  memmove(aec->xfwBuf + PART_LEN1,
+          aec->xfwBuf,
+          sizeof(aec->xfwBuf) - sizeof(complex_t) * PART_LEN1);
+}
+/* Processes one block of PART_LEN near-end samples. The block is read from
+ * |nearFrBuf| (and |nearFrBufH| for the high bands) together with one
+ * far-end spectrum from |far_buf|. The function updates the far/near power
+ * and noise estimates, optionally feeds the delay estimator used for
+ * logging, forms the echo estimate with WebRtcAec_FilterFar(), computes the
+ * error signal, adapts the filter and runs NonLinearProcessing(). The
+ * resulting block is written to |outFrBuf| (and |outFrBufH|).
+ */
+static void ProcessBlock(AecCore* aec) {
+  size_t i;
+  float y[PART_LEN], e[PART_LEN];
+  float scale;
+
+  float fft[PART_LEN2];
+  float xf[2][PART_LEN1], yf[2][PART_LEN1], ef[2][PART_LEN1];
+  float df[2][PART_LEN1];
+  float far_spectrum = 0.0f;
+  float near_spectrum = 0.0f;
+  float abs_far_spectrum[PART_LEN1];
+  float abs_near_spectrum[PART_LEN1];
+  // Weights for the recursive power smoothing: {old value, new value}.
+  const float gPow[2] = {0.9f, 0.1f};
+
+  // Noise estimate constants.
+  const int noiseInitBlocks = 500 * aec->mult;
+  const float step = 0.1f;
+  const float ramp = 1.0002f;
+  const float gInitNoise[2] = {0.999f, 0.001f};
+
+  float nearend[PART_LEN];
+  float* nearend_ptr = NULL;
+  float output[PART_LEN];
+  float outputH[NUM_HIGH_BANDS_MAX][PART_LEN];
+  float* outputH_ptr[NUM_HIGH_BANDS_MAX];
+  for (i = 0; i < NUM_HIGH_BANDS_MAX; ++i) {
+    outputH_ptr[i] = outputH[i];
+  }
+
+  float* xf_ptr = NULL;
+
+  // Concatenate old and new nearend blocks.
+  for (i = 0; i < aec->num_bands - 1; ++i) {
+    WebRtc_ReadBuffer(aec->nearFrBufH[i],
+                      (void**)&nearend_ptr,
+                      nearend,
+                      PART_LEN);
+    memcpy(aec->dBufH[i] + PART_LEN, nearend_ptr, sizeof(nearend));
+  }
+  WebRtc_ReadBuffer(aec->nearFrBuf, (void**)&nearend_ptr, nearend, PART_LEN);
+  memcpy(aec->dBuf + PART_LEN, nearend_ptr, sizeof(nearend));
+
+  // ---------- Ooura fft ----------
+
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+  {
+    float farend[PART_LEN];
+    float* farend_ptr = NULL;
+    WebRtc_ReadBuffer(aec->far_time_buf, (void**)&farend_ptr, farend, 1);
+    RTC_AEC_DEBUG_WAV_WRITE(aec->farFile, farend_ptr, PART_LEN);
+    RTC_AEC_DEBUG_WAV_WRITE(aec->nearFile, nearend_ptr, PART_LEN);
+  }
+#endif
+
+  // We should always have at least one element stored in |far_buf|.
+  assert(WebRtc_available_read(aec->far_buf) > 0);
+  WebRtc_ReadBuffer(aec->far_buf, (void**)&xf_ptr, &xf[0][0], 1);
+
+  // Near fft
+  memcpy(fft, aec->dBuf, sizeof(float) * PART_LEN2);
+  TimeToFrequency(fft, df, 0);
+
+  // Power smoothing
+  for (i = 0; i < PART_LEN1; i++) {
+    far_spectrum = (xf_ptr[i] * xf_ptr[i]) +
+                   (xf_ptr[PART_LEN1 + i] * xf_ptr[PART_LEN1 + i]);
+    aec->xPow[i] =
+        gPow[0] * aec->xPow[i] + gPow[1] * aec->num_partitions * far_spectrum;
+    // Calculate absolute spectra
+    abs_far_spectrum[i] = sqrtf(far_spectrum);
+
+    near_spectrum = df[0][i] * df[0][i] + df[1][i] * df[1][i];
+    aec->dPow[i] = gPow[0] * aec->dPow[i] + gPow[1] * near_spectrum;
+    // Calculate absolute spectra
+    abs_near_spectrum[i] = sqrtf(near_spectrum);
+  }
+
+  // Estimate noise power. Wait until dPow is more stable.
+  if (aec->noiseEstCtr > 50) {
+    for (i = 0; i < PART_LEN1; i++) {
+      if (aec->dPow[i] < aec->dMinPow[i]) {
+        aec->dMinPow[i] =
+            (aec->dPow[i] + step * (aec->dMinPow[i] - aec->dPow[i])) * ramp;
+      } else {
+        aec->dMinPow[i] *= ramp;
+      }
+    }
+  }
+
+  // Smooth increasing noise power from zero at the start,
+  // to avoid a sudden burst of comfort noise.
+  if (aec->noiseEstCtr < noiseInitBlocks) {
+    aec->noiseEstCtr++;
+    for (i = 0; i < PART_LEN1; i++) {
+      if (aec->dMinPow[i] > aec->dInitMinPow[i]) {
+        aec->dInitMinPow[i] = gInitNoise[0] * aec->dInitMinPow[i] +
+                              gInitNoise[1] * aec->dMinPow[i];
+      } else {
+        aec->dInitMinPow[i] = aec->dMinPow[i];
+      }
+    }
+    aec->noisePow = aec->dInitMinPow;
+  } else {
+    aec->noisePow = aec->dMinPow;
+  }
+
+  // Block wise delay estimation used for logging
+  if (aec->delay_logging_enabled) {
+    if (WebRtc_AddFarSpectrumFloat(
+            aec->delay_estimator_farend, abs_far_spectrum, PART_LEN1) == 0) {
+      int delay_estimate = WebRtc_DelayEstimatorProcessFloat(
+          aec->delay_estimator, abs_near_spectrum, PART_LEN1);
+      if (delay_estimate >= 0) {
+        // Update delay estimate buffer.
+        aec->delay_histogram[delay_estimate]++;
+        aec->num_delay_values++;
+      }
+      if (aec->delay_metrics_delivered == 1 &&
+          aec->num_delay_values >= kDelayMetricsAggregationWindow) {
+        UpdateDelayMetrics(aec);
+      }
+    }
+  }
+
+  // Update the xfBuf block position.
+  aec->xfBufBlockPos--;
+  if (aec->xfBufBlockPos == -1) {
+    aec->xfBufBlockPos = aec->num_partitions - 1;  // Wrap around.
+  }
+
+  // Buffer xf
+  memcpy(aec->xfBuf[0] + aec->xfBufBlockPos * PART_LEN1,
+         xf_ptr,
+         sizeof(float) * PART_LEN1);
+  memcpy(aec->xfBuf[1] + aec->xfBufBlockPos * PART_LEN1,
+         &xf_ptr[PART_LEN1],
+         sizeof(float) * PART_LEN1);
+
+  memset(yf, 0, sizeof(yf));
+
+  // Filter far
+  WebRtcAec_FilterFar(aec, yf);
+
+  // Inverse fft to obtain echo estimate and error.
+  fft[0] = yf[0][0];
+  fft[1] = yf[0][PART_LEN];
+  for (i = 1; i < PART_LEN; i++) {
+    fft[2 * i] = yf[0][i];
+    fft[2 * i + 1] = yf[1][i];
+  }
+  aec_rdft_inverse_128(fft);
+
+  scale = 2.0f / PART_LEN2;
+  for (i = 0; i < PART_LEN; i++) {
+    y[i] = fft[PART_LEN + i] * scale;  // fft scaling
+  }
+  // Error signal: near-end block minus the echo estimate.
+  for (i = 0; i < PART_LEN; i++) {
+    e[i] = nearend_ptr[i] - y[i];
+  }
+
+  // Error fft
+  memcpy(aec->eBuf + PART_LEN, e, sizeof(float) * PART_LEN);
+  memset(fft, 0, sizeof(float) * PART_LEN);  // First half is zero padding.
+  memcpy(fft + PART_LEN, e, sizeof(float) * PART_LEN);
+  // TODO(bjornv): Change to use TimeToFrequency().
+  aec_rdft_forward_128(fft);
+
+  ef[1][0] = 0;
+  ef[1][PART_LEN] = 0;
+  ef[0][0] = fft[0];
+  ef[0][PART_LEN] = fft[1];
+  for (i = 1; i < PART_LEN; i++) {
+    ef[0][i] = fft[2 * i];
+    ef[1][i] = fft[2 * i + 1];
+  }
+
+  RTC_AEC_DEBUG_RAW_WRITE(aec->e_fft_file,
+                          &ef[0][0],
+                          sizeof(ef[0][0]) * PART_LEN1 * 2);
+
+  if (aec->metricsMode == 1) {
+    // Note that the first PART_LEN samples in fft (before transformation) are
+    // zero. Hence, the scaling by two in UpdateLevel() should not be
+    // performed. That scaling is taken care of in UpdateMetrics() instead.
+    UpdateLevel(&aec->linoutlevel, ef);
+  }
+
+  // Scale error signal inversely with far power.
+  WebRtcAec_ScaleErrorSignal(aec, ef);
+  WebRtcAec_FilterAdaptation(aec, fft, ef);
+  NonLinearProcessing(aec, output, outputH_ptr);
+
+  if (aec->metricsMode == 1) {
+    // Update power levels and echo metrics
+    UpdateLevel(&aec->farlevel, (float(*)[PART_LEN1])xf_ptr);
+    UpdateLevel(&aec->nearlevel, df);
+    UpdateMetrics(aec);
+  }
+
+  // Store the output block.
+  WebRtc_WriteBuffer(aec->outFrBuf, output, PART_LEN);
+  // For high bands
+  for (i = 0; i < aec->num_bands - 1; ++i) {
+    WebRtc_WriteBuffer(aec->outFrBufH[i], outputH[i], PART_LEN);
+  }
+
+  RTC_AEC_DEBUG_WAV_WRITE(aec->outLinearFile, e, PART_LEN);
+  RTC_AEC_DEBUG_WAV_WRITE(aec->outFile, output, PART_LEN);
+}
+
+// Allocates an AecCore together with its frame buffers, far-end buffers and
+// delay estimator pair, and installs the (possibly platform optimized)
+// processing functions. Returns the new instance, or NULL if any allocation
+// fails; everything created up to that point is released again.
+//
+// The instance is zero-initialized on allocation. Every failure path below
+// hands the partially constructed instance to WebRtcAec_FreeAec(), which
+// releases all members unconditionally, so members that have not been
+// created yet must hold NULL rather than indeterminate values.
+// NOTE(review): this assumes the WebRtc_Free*() helpers and the debug close
+// macros accept NULL - confirm against their implementations.
+AecCore* WebRtcAec_CreateAec() {
+  int i;
+  AecCore* aec = calloc(1, sizeof(AecCore));
+  if (!aec) {
+    return NULL;
+  }
+
+  aec->nearFrBuf = WebRtc_CreateBuffer(FRAME_LEN + PART_LEN, sizeof(float));
+  if (!aec->nearFrBuf) {
+    WebRtcAec_FreeAec(aec);
+    return NULL;
+  }
+
+  aec->outFrBuf = WebRtc_CreateBuffer(FRAME_LEN + PART_LEN, sizeof(float));
+  if (!aec->outFrBuf) {
+    WebRtcAec_FreeAec(aec);
+    return NULL;
+  }
+
+  // Frame buffers for the high bands.
+  for (i = 0; i < NUM_HIGH_BANDS_MAX; ++i) {
+    aec->nearFrBufH[i] = WebRtc_CreateBuffer(FRAME_LEN + PART_LEN,
+                                             sizeof(float));
+    if (!aec->nearFrBufH[i]) {
+      WebRtcAec_FreeAec(aec);
+      return NULL;
+    }
+    aec->outFrBufH[i] = WebRtc_CreateBuffer(FRAME_LEN + PART_LEN,
+                                            sizeof(float));
+    if (!aec->outFrBufH[i]) {
+      WebRtcAec_FreeAec(aec);
+      return NULL;
+    }
+  }
+
+  // Create far-end buffers. Each element holds one spectrum of 2 * PART_LEN1
+  // floats.
+  aec->far_buf =
+      WebRtc_CreateBuffer(kBufSizePartitions, sizeof(float) * 2 * PART_LEN1);
+  if (!aec->far_buf) {
+    WebRtcAec_FreeAec(aec);
+    return NULL;
+  }
+  aec->far_buf_windowed =
+      WebRtc_CreateBuffer(kBufSizePartitions, sizeof(float) * 2 * PART_LEN1);
+  if (!aec->far_buf_windowed) {
+    WebRtcAec_FreeAec(aec);
+    return NULL;
+  }
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+  aec->instance_index = webrtc_aec_instance_count;
+  aec->far_time_buf =
+      WebRtc_CreateBuffer(kBufSizePartitions, sizeof(float) * PART_LEN);
+  if (!aec->far_time_buf) {
+    WebRtcAec_FreeAec(aec);
+    return NULL;
+  }
+  aec->farFile = aec->nearFile = aec->outFile = aec->outLinearFile = NULL;
+  aec->debug_dump_count = 0;
+#endif
+  aec->delay_estimator_farend =
+      WebRtc_CreateDelayEstimatorFarend(PART_LEN1, kHistorySizeBlocks);
+  if (aec->delay_estimator_farend == NULL) {
+    WebRtcAec_FreeAec(aec);
+    return NULL;
+  }
+  // We create the delay_estimator with the same amount of maximum lookahead as
+  // the delay history size (kHistorySizeBlocks) for symmetry reasons.
+  aec->delay_estimator = WebRtc_CreateDelayEstimator(
+      aec->delay_estimator_farend, kHistorySizeBlocks);
+  if (aec->delay_estimator == NULL) {
+    WebRtcAec_FreeAec(aec);
+    return NULL;
+  }
+#ifdef WEBRTC_ANDROID
+  aec->delay_agnostic_enabled = 1;  // DA-AEC enabled by default.
+  // DA-AEC assumes the system is causal from the beginning and will self adjust
+  // the lookahead when shifting is required.
+  WebRtc_set_lookahead(aec->delay_estimator, 0);
+#else
+  aec->delay_agnostic_enabled = 0;
+  WebRtc_set_lookahead(aec->delay_estimator, kLookaheadBlocks);
+#endif
+  aec->extended_filter_enabled = 0;
+
+  // Assembly optimization: start from the generic C implementations; the
+  // platform specific init functions below are called to install optimized
+  // versions where available.
+  WebRtcAec_FilterFar = FilterFar;
+  WebRtcAec_ScaleErrorSignal = ScaleErrorSignal;
+  WebRtcAec_FilterAdaptation = FilterAdaptation;
+  WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppress;
+  WebRtcAec_ComfortNoise = ComfortNoise;
+  WebRtcAec_SubbandCoherence = SubbandCoherence;
+
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+  if (WebRtc_GetCPUInfo(kSSE2)) {
+    WebRtcAec_InitAec_SSE2();
+  }
+#endif
+
+#if defined(MIPS_FPU_LE)
+  WebRtcAec_InitAec_mips();
+#endif
+
+#if defined(WEBRTC_HAS_NEON)
+  WebRtcAec_InitAec_neon();
+#elif defined(WEBRTC_DETECT_NEON)
+  if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
+    WebRtcAec_InitAec_neon();
+  }
+#endif
+
+  aec_rdft_init();
+
+  return aec;
+}
+
+// Releases |aec| together with its buffers, debug dump files and delay
+// estimators. Passing NULL is a no-op.
+void WebRtcAec_FreeAec(AecCore* aec) {
+  int band;
+
+  if (!aec) {
+    return;
+  }
+
+  // Near-end input and output frame buffers, low band first.
+  WebRtc_FreeBuffer(aec->nearFrBuf);
+  WebRtc_FreeBuffer(aec->outFrBuf);
+  for (band = 0; band < NUM_HIGH_BANDS_MAX; band++) {
+    WebRtc_FreeBuffer(aec->nearFrBufH[band]);
+    WebRtc_FreeBuffer(aec->outFrBufH[band]);
+  }
+
+  // Far-end buffers.
+  WebRtc_FreeBuffer(aec->far_buf);
+  WebRtc_FreeBuffer(aec->far_buf_windowed);
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+  WebRtc_FreeBuffer(aec->far_time_buf);
+#endif
+
+  // Debug dump files.
+  RTC_AEC_DEBUG_WAV_CLOSE(aec->farFile);
+  RTC_AEC_DEBUG_WAV_CLOSE(aec->nearFile);
+  RTC_AEC_DEBUG_WAV_CLOSE(aec->outFile);
+  RTC_AEC_DEBUG_WAV_CLOSE(aec->outLinearFile);
+  RTC_AEC_DEBUG_RAW_CLOSE(aec->e_fft_file);
+
+  // The delay estimator is released before its far-end counterpart.
+  WebRtc_FreeDelayEstimator(aec->delay_estimator);
+  WebRtc_FreeDelayEstimatorFarend(aec->delay_estimator_farend);
+
+  free(aec);
+}
+/* (Re)initializes |aec| for the sample rate |sampFreq| (Hz): resets the
+ * frame, far-end and history buffers, re-initializes the delay estimator
+ * pair and restores the delay-correction, NLP and metrics state to their
+ * defaults. Returns 0 on success and -1 if either delay estimator fails to
+ * initialize, in which case the state assigned after that point is left
+ * untouched.
+ */
+int WebRtcAec_InitAec(AecCore* aec, int sampFreq) {
+  int i;
+
+  aec->sampFreq = sampFreq;
+
+  if (sampFreq == 8000) {
+    aec->normal_mu = 0.6f;
+    aec->normal_error_threshold = 2e-6f;
+    aec->num_bands = 1;
+  } else {
+    aec->normal_mu = 0.5f;
+    aec->normal_error_threshold = 1.5e-6f;
+    aec->num_bands = (size_t)(sampFreq / 16000);  // NOTE(review): 0 for rates below 16000 other than 8000.
+  }
+
+  WebRtc_InitBuffer(aec->nearFrBuf);
+  WebRtc_InitBuffer(aec->outFrBuf);
+  for (i = 0; i < NUM_HIGH_BANDS_MAX; ++i) {
+    WebRtc_InitBuffer(aec->nearFrBufH[i]);
+    WebRtc_InitBuffer(aec->outFrBufH[i]);
+  }
+
+  // Initialize far-end buffers.
+  WebRtc_InitBuffer(aec->far_buf);
+  WebRtc_InitBuffer(aec->far_buf_windowed);
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+  WebRtc_InitBuffer(aec->far_time_buf);
+  {
+    int process_rate = sampFreq > 16000 ? 16000 : sampFreq;
+    RTC_AEC_DEBUG_WAV_REOPEN("aec_far", aec->instance_index,
+                             aec->debug_dump_count, process_rate,
+                             &aec->farFile );
+    RTC_AEC_DEBUG_WAV_REOPEN("aec_near", aec->instance_index,
+                             aec->debug_dump_count, process_rate,
+                             &aec->nearFile);
+    RTC_AEC_DEBUG_WAV_REOPEN("aec_out", aec->instance_index,
+                             aec->debug_dump_count, process_rate,
+                             &aec->outFile );
+    RTC_AEC_DEBUG_WAV_REOPEN("aec_out_linear", aec->instance_index,
+                             aec->debug_dump_count, process_rate,
+                             &aec->outLinearFile);
+  }
+
+  RTC_AEC_DEBUG_RAW_OPEN("aec_e_fft",
+                         aec->debug_dump_count,
+                         &aec->e_fft_file);
+
+  ++aec->debug_dump_count;
+#endif
+  aec->system_delay = 0;
+
+  if (WebRtc_InitDelayEstimatorFarend(aec->delay_estimator_farend) != 0) {
+    return -1;
+  }
+  if (WebRtc_InitDelayEstimator(aec->delay_estimator) != 0) {
+    return -1;
+  }
+  aec->delay_logging_enabled = 0;
+  aec->delay_metrics_delivered = 0;
+  memset(aec->delay_histogram, 0, sizeof(aec->delay_histogram));
+  aec->num_delay_values = 0;
+  aec->delay_median = -1;
+  aec->delay_std = -1;
+  aec->fraction_poor_delays = -1.0f;
+
+  aec->signal_delay_correction = 0;
+  aec->previous_delay = -2;  // (-2): Uninitialized.
+  aec->delay_correction_count = 0;
+  aec->shift_offset = kInitialShiftOffset;
+  aec->delay_quality_threshold = kDelayQualityThresholdMin;
+
+  aec->num_partitions = kNormalNumPartitions;
+
+  // Update the delay estimator with filter length.  We use half the
+  // |num_partitions| to take the echo path into account.  In practice we say
+  // that the echo has a duration of maximum half |num_partitions|, which is not
+  // true, but serves as a crude measure.
+  WebRtc_set_allowed_offset(aec->delay_estimator, aec->num_partitions / 2);
+  // TODO(bjornv): I currently hard coded the enable.  Once we've established
+  // that AECM has no performance regression, robust_validation will be enabled
+  // all the time and the APIs to turn it on/off will be removed.  Hence, remove
+  // this line then.
+  WebRtc_enable_robust_validation(aec->delay_estimator, 1);
+  aec->frame_count = 0;
+
+  // Default target suppression mode.
+  aec->nlp_mode = 1;
+
+  // Sampling frequency multiplier w.r.t. 8 kHz.
+  // In case of multiple bands we process the lower band in 16 kHz, hence the
+  // multiplier is always 2.
+  if (aec->num_bands > 1) {
+    aec->mult = 2;
+  } else {
+    aec->mult = (short)aec->sampFreq / 8000;  // NOTE(review): the cast binds to |sampFreq|, not to the quotient.
+  }
+
+  aec->farBufWritePos = 0;
+  aec->farBufReadPos = 0;
+
+  aec->inSamples = 0;
+  aec->outSamples = 0;
+  aec->knownDelay = 0;
+
+  // Initialize buffers
+  memset(aec->dBuf, 0, sizeof(aec->dBuf));
+  memset(aec->eBuf, 0, sizeof(aec->eBuf));
+  // For H bands
+  for (i = 0; i < NUM_HIGH_BANDS_MAX; ++i) {
+    memset(aec->dBufH[i], 0, sizeof(aec->dBufH[i]));
+  }
+
+  memset(aec->xPow, 0, sizeof(aec->xPow));
+  memset(aec->dPow, 0, sizeof(aec->dPow));
+  memset(aec->dInitMinPow, 0, sizeof(aec->dInitMinPow));
+  aec->noisePow = aec->dInitMinPow;
+  aec->noiseEstCtr = 0;
+
+  // Initial comfort noise power
+  for (i = 0; i < PART_LEN1; i++) {
+    aec->dMinPow[i] = 1.0e6f;
+  }
+
+  // Holds the last block written to
+  aec->xfBufBlockPos = 0;
+  // TODO: Investigate need for these initializations. Deleting them doesn't
+  //       change the output at all and yields 0.4% overall speedup.
+  memset(aec->xfBuf, 0, sizeof(complex_t) * kExtendedNumPartitions * PART_LEN1);
+  memset(aec->wfBuf, 0, sizeof(complex_t) * kExtendedNumPartitions * PART_LEN1);
+  memset(aec->sde, 0, sizeof(complex_t) * PART_LEN1);
+  memset(aec->sxd, 0, sizeof(complex_t) * PART_LEN1);
+  memset(
+      aec->xfwBuf, 0, sizeof(complex_t) * kExtendedNumPartitions * PART_LEN1);
+  memset(aec->se, 0, sizeof(float) * PART_LEN1);
+
+  // To prevent numerical instability in the first block.
+  for (i = 0; i < PART_LEN1; i++) {
+    aec->sd[i] = 1;
+  }
+  for (i = 0; i < PART_LEN1; i++) {
+    aec->sx[i] = 1;
+  }
+
+  memset(aec->hNs, 0, sizeof(aec->hNs));
+  memset(aec->outBuf, 0, sizeof(float) * PART_LEN);
+  // NLP state; the tracked minima start at their ceiling of 1.
+  aec->hNlFbMin = 1;
+  aec->hNlFbLocalMin = 1;
+  aec->hNlXdAvgMin = 1;
+  aec->hNlNewMin = 0;
+  aec->hNlMinCtr = 0;
+  aec->overDrive = 2;
+  aec->overDriveSm = 2;
+  aec->delayIdx = 0;
+  aec->stNearState = 0;
+  aec->echoState = 0;
+  aec->divergeState = 0;
+
+  aec->seed = 777;
+  aec->delayEstCtr = 0;
+
+  // Metrics disabled by default
+  aec->metricsMode = 0;
+  InitMetrics(aec);
+
+  return 0;
+}
+
+// Transforms the far-end partition |farend| (PART_LEN2 samples) to the
+// frequency domain twice, first without and then with windowing, and appends
+// the spectra to |far_buf| and |far_buf_windowed| respectively. If |far_buf|
+// has no room for another element, the oldest data is flushed first.
+void WebRtcAec_BufferFarendPartition(AecCore* aec, const float* farend) {
+  float time_block[PART_LEN2];
+  float spectrum[2][PART_LEN1];
+  int windowed;
+
+  // Make room by dropping the oldest element when the buffer is full.
+  if (WebRtc_available_write(aec->far_buf) < 1) {
+    WebRtcAec_MoveFarReadPtr(aec, 1);
+  }
+
+  // Pass 0: no windowing, stored in |far_buf|.
+  // Pass 1: with windowing, stored in |far_buf_windowed|.
+  // TimeToFrequency() works in place, hence a fresh copy of |farend| is
+  // needed for each pass.
+  for (windowed = 0; windowed <= 1; ++windowed) {
+    memcpy(time_block, farend, sizeof(float) * PART_LEN2);
+    TimeToFrequency(time_block, spectrum, windowed);
+    if (windowed) {
+      WebRtc_WriteBuffer(aec->far_buf_windowed, &spectrum[0][0], 1);
+    } else {
+      WebRtc_WriteBuffer(aec->far_buf, &spectrum[0][0], 1);
+    }
+  }
+}
+
+// Moves the far-end read pointers by |elements| and adjusts |system_delay|
+// by PART_LEN samples per element actually moved. Returns the number of
+// elements moved.
+int WebRtcAec_MoveFarReadPtr(AecCore* aec, int elements) {
+  const int moved = MoveFarReadPtrWithoutSystemDelayUpdate(aec, elements);
+
+  aec->system_delay -= moved * PART_LEN;
+  return moved;
+}
+
+// Runs the AEC on |num_samples| near-end samples per band, FRAME_LEN samples at
+// a time, and writes one processed sample to |out| for every input sample.
+//
+// |nearend| and |out| are indexed as [band][sample]; bands 1 and up are the H
+// bands. |num_bands| must equal |aec->num_bands|. Each iteration consumes and
+// produces exactly FRAME_LEN samples, so |num_samples| has to be a multiple of
+// FRAME_LEN. |knownDelay| is only used when delay agnostic mode is disabled.
+void WebRtcAec_ProcessFrames(AecCore* aec,
+                             const float* const* nearend,
+                             size_t num_bands,
+                             size_t num_samples,
+                             int knownDelay,
+                             float* const* out) {
+  size_t i, j;
+  int out_elements = 0;
+
+  aec->frame_count++;
+  // For each frame the process is as follows:
+  // 1) If the system_delay indicates that it is too small for processing a
+  //    frame we stuff the buffer with enough data for 10 ms.
+  // 2 a) Adjust the buffer to the system delay, by moving the read pointer.
+  //   b) Apply signal based delay correction, if we have detected poor AEC
+  //    performance.
+  // 3) TODO(bjornv): Investigate if we need to add this:
+  //    If we can't move read pointer due to buffer size limitations we
+  //    flush/stuff the buffer.
+  // 4) Process as many partitions as possible.
+  // 5) Update the |system_delay| with respect to a full frame of FRAME_LEN
+  //    samples. Even though we will have data left to process (we work with
+  //    partitions) we consider updating a whole frame, since that's the
+  //    amount of data we input and output in audio_processing.
+  // 6) Update the outputs.
+
+  // The AEC has two different delay estimation algorithms built in.  The
+  // first relies on delay input values from the user and the amount of
+  // shifted buffer elements is controlled by |knownDelay|.  This delay will
+  // give a guess on how much we need to shift far-end buffers to align with
+  // the near-end signal.  The other delay estimation algorithm uses the
+  // far- and near-end signals to find the offset between them.  This one
+  // (called "signal delay") is then used to fine tune the alignment, or
+  // simply compensate for errors in the system based one.
+  // Note that the two algorithms operate independently.  Currently, we only
+  // allow one algorithm to be turned on.
+
+  assert(aec->num_bands == num_bands);
+
+  for (j = 0; j < num_samples; j += FRAME_LEN) {
+    // TODO(bjornv): Change the near-end buffer handling to be the same as for
+    // far-end, that is, with a near_pre_buf.
+    // Buffer the near-end frame.
+    WebRtc_WriteBuffer(aec->nearFrBuf, &nearend[0][j], FRAME_LEN);
+    // For H band
+    for (i = 1; i < num_bands; ++i) {
+      WebRtc_WriteBuffer(aec->nearFrBufH[i - 1], &nearend[i][j], FRAME_LEN);
+    }
+
+    // 1) At most we process |aec->mult|+1 partitions in 10 ms. Make sure we
+    // have enough far-end data for that by stuffing the buffer if the
+    // |system_delay| indicates otherwise.
+    if (aec->system_delay < FRAME_LEN) {
+      // We don't have enough data so we rewind 10 ms.
+      WebRtcAec_MoveFarReadPtr(aec, -(aec->mult + 1));
+    }
+
+    if (!aec->delay_agnostic_enabled) {
+      // 2 a) Compensate for a possible change in the system delay.
+
+      // TODO(bjornv): Investigate how we should round the delay difference;
+      // right now we know that incoming |knownDelay| is underestimated when
+      // it's less than |aec->knownDelay|. We therefore, round (-32) in that
+      // direction. In the other direction, we don't have this situation, but
+      // might flush one partition too little. This can cause non-causality,
+      // which should be investigated. Maybe, allow for a non-symmetric
+      // rounding, like -16.
+      int move_elements = (aec->knownDelay - knownDelay - 32) / PART_LEN;
+      int moved_elements =
+          MoveFarReadPtrWithoutSystemDelayUpdate(aec, move_elements);
+      aec->knownDelay -= moved_elements * PART_LEN;
+    } else {
+      // 2 b) Apply signal based delay correction.
+      int move_elements = SignalBasedDelayCorrection(aec);
+      int moved_elements =
+          MoveFarReadPtrWithoutSystemDelayUpdate(aec, move_elements);
+      // The explicit conversions keep the subtraction in signed arithmetic, so
+      // a far-end shortfall shows up as a negative value regardless of the
+      // return type of WebRtc_available_read() (step 6 converts the same way).
+      int far_near_buffer_diff =
+          (int)WebRtc_available_read(aec->far_buf) -
+          (int)(WebRtc_available_read(aec->nearFrBuf) / PART_LEN);
+      WebRtc_SoftResetDelayEstimator(aec->delay_estimator, moved_elements);
+      WebRtc_SoftResetDelayEstimatorFarend(aec->delay_estimator_farend,
+                                           moved_elements);
+      aec->signal_delay_correction += moved_elements;
+      // If we rely on reported system delay values only, a buffer underrun here
+      // can never occur since we've taken care of that in 1) above.  Here, we
+      // apply signal based delay correction and can therefore end up with
+      // buffer underruns since the delay estimation can be wrong.  We therefore
+      // stuff the buffer with enough elements if needed.
+      if (far_near_buffer_diff < 0) {
+        WebRtcAec_MoveFarReadPtr(aec, far_near_buffer_diff);
+      }
+    }
+
+    // 4) Process as many blocks as possible.
+    while (WebRtc_available_read(aec->nearFrBuf) >= PART_LEN) {
+      ProcessBlock(aec);
+    }
+
+    // 5) Update system delay with respect to the entire frame.
+    aec->system_delay -= FRAME_LEN;
+
+    // 6) Update output frame.
+    // Stuff the out buffer if we have less than a frame to output.
+    // This should only happen for the first frame.
+    out_elements = (int)WebRtc_available_read(aec->outFrBuf);
+    if (out_elements < FRAME_LEN) {
+      WebRtc_MoveReadPtr(aec->outFrBuf, out_elements - FRAME_LEN);
+      // Same band indexing as every other H band loop in this function; it
+      // also avoids the unsigned wrap of |num_bands - 1| for a zero count.
+      for (i = 1; i < num_bands; ++i) {
+        WebRtc_MoveReadPtr(aec->outFrBufH[i - 1], out_elements - FRAME_LEN);
+      }
+    }
+    // Obtain an output frame.
+    WebRtc_ReadBuffer(aec->outFrBuf, NULL, &out[0][j], FRAME_LEN);
+    // For H bands.
+    for (i = 1; i < num_bands; ++i) {
+      WebRtc_ReadBuffer(aec->outFrBufH[i - 1], NULL, &out[i][j], FRAME_LEN);
+    }
+  }
+}
+
+// Copies the delay metrics kept in |self| to |median|, |std| and
+// |fraction_poor_delays|; all three output pointers are mandatory.
+// Returns -1, leaving the outputs untouched, when delay logging is disabled,
+// and 0 otherwise. If the metrics have not been delivered yet they are first
+// refreshed with UpdateDelayMetrics() and marked as delivered.
+int WebRtcAec_GetDelayMetricsCore(AecCore* self, int* median, int* std,
+                                  float* fraction_poor_delays) {
+  assert(self != NULL);
+  assert(median != NULL);
+  assert(std != NULL);
+  // Written unconditionally below, so it is checked like the other outputs.
+  assert(fraction_poor_delays != NULL);
+
+  if (self->delay_logging_enabled == 0) {
+    // Logging disabled.
+    return -1;
+  }
+
+  if (self->delay_metrics_delivered == 0) {
+    UpdateDelayMetrics(self);
+    self->delay_metrics_delivered = 1;
+  }
+  *median = self->delay_median;
+  *std = self->delay_std;
+  *fraction_poor_delays = self->fraction_poor_delays;
+
+  return 0;
+}
+
+// Returns the |echoState| value currently stored in |self|.
+int WebRtcAec_echo_state(AecCore* self) {
+  return self->echoState;
+}
+
+// Copies the ERL, ERLE and A_NLP statistics held by |self| into the caller's
+// |erl|, |erle| and |a_nlp| structs. None of the outputs may be NULL.
+void WebRtcAec_GetEchoStats(AecCore* self,
+                            Stats* erl,
+                            Stats* erle,
+                            Stats* a_nlp) {
+  const AecCore* const core = self;
+
+  assert(erl != NULL);
+  assert(erle != NULL);
+  assert(a_nlp != NULL);
+
+  // Plain struct copies, in the same order as the parameters.
+  *erl = core->erl;
+  *erle = core->erle;
+  *a_nlp = core->aNlp;
+}
+
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+// Only built with WEBRTC_AEC_DEBUG_DUMP: hands out |far_time_buf| of |self| as
+// an untyped pointer.
+void* WebRtcAec_far_time_buf(AecCore* self) {
+  return self->far_time_buf;
+}
+#endif
+
+// Applies the local configuration to |self|.
+// |nlp_mode| must be 0, 1 or 2. A non-zero |metrics_mode| turns metrics on and
+// re-initializes them. Delay logging is switched on when |delay_logging| is
+// non-zero or when delay agnostic mode is enabled; switching it on clears the
+// delay histogram.
+void WebRtcAec_SetConfigCore(AecCore* self,
+                             int nlp_mode,
+                             int metrics_mode,
+                             int delay_logging) {
+  assert(nlp_mode >= 0 && nlp_mode < 3);
+
+  self->nlp_mode = nlp_mode;
+
+  self->metricsMode = metrics_mode;
+  if (metrics_mode) {
+    InitMetrics(self);
+  }
+
+  // Delay agnostic AEC requires delay estimates, so it implies delay logging
+  // even when logging was not requested explicitly.
+  if (delay_logging || self->delay_agnostic_enabled) {
+    self->delay_logging_enabled = 1;
+    memset(self->delay_histogram, 0, sizeof(self->delay_histogram));
+  } else {
+    self->delay_logging_enabled = 0;
+  }
+}
+
+// Stores |enable| as the delay agnostic flag of |self|; any non-zero value
+// counts as enabled.
+void WebRtcAec_enable_delay_agnostic(AecCore* self, int enable) {
+  self->delay_agnostic_enabled = enable;
+}
+
+// Returns the delay agnostic flag of |self| exactly as it was stored.
+int WebRtcAec_delay_agnostic_enabled(AecCore* self) {
+  const int enabled = self->delay_agnostic_enabled;
+  return enabled;
+}
+
+// Records the extended filter flag, selects the matching number of filter
+// partitions and passes half of that number to the delay estimator as its
+// allowed offset.
+void WebRtcAec_enable_extended_filter(AecCore* self, int enable) {
+  self->extended_filter_enabled = enable;
+
+  if (enable) {
+    self->num_partitions = kExtendedNumPartitions;
+  } else {
+    self->num_partitions = kNormalNumPartitions;
+  }
+
+  // Update the delay estimator with filter length.  See InitAEC() for details.
+  WebRtc_set_allowed_offset(self->delay_estimator, self->num_partitions / 2);
+}
+
+// Returns the extended filter flag of |self| exactly as it was stored.
+int WebRtcAec_extended_filter_enabled(AecCore* self) {
+  const int enabled = self->extended_filter_enabled;
+  return enabled;
+}
+
+// Returns the |system_delay| currently stored in |self|.
+int WebRtcAec_system_delay(AecCore* self) {
+  return self->system_delay;
+}
+
+// Overwrites |system_delay| of |self| with |delay|, which must not be
+// negative.
+void WebRtcAec_SetSystemDelay(AecCore* self, int delay) {
+  assert(delay >= 0);
+
+  self->system_delay = delay;
+}
diff --git a/webrtc/modules/audio_processing/aec/aec_core.h b/webrtc/modules/audio_processing/aec/aec_core.h
new file mode 100644
index 0000000000..241f077524
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_core.h
@@ -0,0 +1,129 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Specifies the interface for the AEC core.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_
+
+#include <stddef.h>
+
+#include "webrtc/typedefs.h"
+
+#define FRAME_LEN 80              // Samples handled per processing frame
+#define PART_LEN 64               // Length of partition
+#define PART_LEN1 (PART_LEN + 1)  // Unique fft coefficients
+#define PART_LEN2 (PART_LEN * 2)  // Length of partition * 2
+#define NUM_HIGH_BANDS_MAX  2     // Max number of high bands
+
+typedef float complex_t[2];
+// For performance reasons, some arrays of complex numbers are replaced by twice
+// as long arrays of float, all the real parts followed by all the imaginary
+// ones (complex_t[SIZE] -> float[2][SIZE]). This allows SIMD optimizations and
+// is better than two arrays (one for the real parts and one for the imaginary
+// parts) as the latter would require two pointers instead of one and cause
+// extra register spilling. This also allows the offsets to be calculated at
+// compile time.
+
+// Metrics
+enum {
+  kOffsetLevel = -100
+};
+
+typedef struct Stats {  // One echo metric; see WebRtcAec_GetEchoStats().
+  float instant;
+  float average;
+  float min;
+  float max;
+  float sum;
+  float hisum;
+  float himean;
+  int counter;
+  int hicounter;
+} Stats;
+
+// Opaque handle; the struct itself is not defined in this header.
+typedef struct AecCore AecCore;
+
+// Lifetime management of an AEC core instance. The parameter list of
+// WebRtcAec_CreateAec() is spelled (void) so that C callers get a real
+// prototype, like the platform specific init functions below.
+AecCore* WebRtcAec_CreateAec(void);  // Returns NULL on error.
+void WebRtcAec_FreeAec(AecCore* aec);
+int WebRtcAec_InitAec(AecCore* aec, int sampFreq);
+// Platform specific initialization hooks; the MIPS and NEON ones are only
+// declared when the corresponding macros are defined.
+void WebRtcAec_InitAec_SSE2(void);
+#if defined(MIPS_FPU_LE)
+void WebRtcAec_InitAec_mips(void);
+#endif
+#if defined(WEBRTC_DETECT_NEON) || defined(WEBRTC_HAS_NEON)
+void WebRtcAec_InitAec_neon(void);
+#endif
+
+void WebRtcAec_BufferFarendPartition(AecCore* aec, const float* farend);
+void WebRtcAec_ProcessFrames(AecCore* aec,
+                             const float* const* nearend,
+                             size_t num_bands,
+                             size_t num_samples,
+                             int knownDelay,
+                             float* const* out);
+
+// A helper function to call WebRtc_MoveReadPtr() for all far-end buffers.
+// Returns the number of elements moved, and adjusts |system_delay| by the
+// corresponding number of samples (PART_LEN per element moved).
+int WebRtcAec_MoveFarReadPtr(AecCore* aec, int elements);
+
+// Calculates the median, standard deviation and amount of poor values among the
+// delay estimates aggregated up to the first call to the function. After that
+// first call the metrics are aggregated and updated every second. With poor
+// values we mean values that most likely will cause the AEC to perform poorly.
+// TODO(bjornv): Consider changing tests and tools to handle a constant
+// aggregation window throughout the session instead.
+int WebRtcAec_GetDelayMetricsCore(AecCore* self, int* median, int* std,
+                                  float* fraction_poor_delays);
+
+// Returns the echo state (1: echo, 0: no echo).
+int WebRtcAec_echo_state(AecCore* self);
+
+// Gets statistics of the echo metrics ERL, ERLE, A_NLP.
+void WebRtcAec_GetEchoStats(AecCore* self,
+                            Stats* erl,
+                            Stats* erle,
+                            Stats* a_nlp);
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+void* WebRtcAec_far_time_buf(AecCore* self);
+#endif
+
+// Sets local configuration modes.
+void WebRtcAec_SetConfigCore(AecCore* self,
+                             int nlp_mode,
+                             int metrics_mode,
+                             int delay_logging);
+
+// Non-zero enables, zero disables.
+void WebRtcAec_enable_delay_agnostic(AecCore* self, int enable);
+
+// Returns non-zero if delay agnostic (i.e., signal based delay estimation) is
+// enabled and zero if disabled.
+int WebRtcAec_delay_agnostic_enabled(AecCore* self);
+
+// Enables or disables extended filter mode. Non-zero enables, zero disables.
+void WebRtcAec_enable_extended_filter(AecCore* self, int enable);
+
+// Returns non-zero if extended filter mode is enabled and zero if disabled.
+int WebRtcAec_extended_filter_enabled(AecCore* self);
+
+// Returns the current |system_delay|, i.e., the buffered difference between
+// far-end and near-end.
+int WebRtcAec_system_delay(AecCore* self);
+
+// Sets the |system_delay| to |delay|.  Note that if the value is changed
+// improperly, there can be a performance regression.  So it should be used with
+// care.
+void WebRtcAec_SetSystemDelay(AecCore* self, int delay);
+
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_
diff --git a/webrtc/modules/audio_processing/aec/aec_core_internal.h b/webrtc/modules/audio_processing/aec/aec_core_internal.h
new file mode 100644
index 0000000000..2de028379b
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_core_internal.h
@@ -0,0 +1,202 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_
+
+#include "webrtc/common_audio/ring_buffer.h"
+#include "webrtc/common_audio/wav_file.h"
+#include "webrtc/modules/audio_processing/aec/aec_common.h"
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+#include "webrtc/typedefs.h"
+
+// Number of partitions for the extended filter mode. The first one is an enum
+// to be used in array declarations, as it represents the maximum filter length.
+enum {
+  kExtendedNumPartitions = 32
+};
+static const int kNormalNumPartitions = 12;  // Used when extended filter is off.
+
+// Delay estimator constants, used for logging and delay compensation if
+// reported delays are disabled.
+enum {
+  kLookaheadBlocks = 15
+};
+enum {
+  // 500 ms for 16 kHz which is equivalent with the limit of reported delays.
+  kHistorySizeBlocks = 125
+};
+
+// Extended filter adaptation parameters.
+// TODO(ajm): No narrowband tuning yet.
+static const float kExtendedMu = 0.4f;
+static const float kExtendedErrorThreshold = 1.0e-6f;
+
+typedef struct PowerLevel {  // AecCore keeps one each for far/near/linout/nlpout.
+  float sfrsum;
+  int sfrcounter;
+  float framelevel;
+  float frsum;
+  int frcounter;
+  float minlevel;
+  float averagelevel;
+} PowerLevel;
+
+struct AecCore {  // State of one AEC core instance.
+  int farBufWritePos, farBufReadPos;
+
+  int knownDelay;  // Updated in PART_LEN steps by WebRtcAec_ProcessFrames().
+  int inSamples, outSamples;
+  int delayEstCtr;
+
+  RingBuffer* nearFrBuf;  // Near-end input, written FRAME_LEN samples at a time.
+  RingBuffer* outFrBuf;   // Output, read FRAME_LEN samples at a time.
+
+  RingBuffer* nearFrBufH[NUM_HIGH_BANDS_MAX];  // Index i holds band i + 1.
+  RingBuffer* outFrBufH[NUM_HIGH_BANDS_MAX];   // Index i holds band i + 1.
+
+  float dBuf[PART_LEN2];  // nearend
+  float eBuf[PART_LEN2];  // error
+
+  float dBufH[NUM_HIGH_BANDS_MAX][PART_LEN2];  // nearend
+
+  float xPow[PART_LEN1];
+  float dPow[PART_LEN1];
+  float dMinPow[PART_LEN1];
+  float dInitMinPow[PART_LEN1];
+  float* noisePow;
+
+  float xfBuf[2][kExtendedNumPartitions * PART_LEN1];  // farend fft buffer
+  float wfBuf[2][kExtendedNumPartitions * PART_LEN1];  // filter fft
+  complex_t sde[PART_LEN1];  // cross-psd of nearend and error
+  complex_t sxd[PART_LEN1];  // cross-psd of farend and nearend
+  // Farend windowed fft buffer.
+  complex_t xfwBuf[kExtendedNumPartitions * PART_LEN1];
+
+  float sx[PART_LEN1], sd[PART_LEN1], se[PART_LEN1];  // far, near, error psd
+  float hNs[PART_LEN1];
+  float hNlFbMin, hNlFbLocalMin;
+  float hNlXdAvgMin;
+  int hNlNewMin, hNlMinCtr;
+  float overDrive, overDriveSm;
+  int nlp_mode;  // Asserted to be 0, 1 or 2 in WebRtcAec_SetConfigCore().
+  float outBuf[PART_LEN];
+  int delayIdx;
+
+  short stNearState, echoState;  // echoState: 1 = echo, 0 = no echo.
+  short divergeState;
+
+  int xfBufBlockPos;
+
+  RingBuffer* far_buf;           // Far-end spectra, not windowed.
+  RingBuffer* far_buf_windowed;  // Far-end spectra, windowed.
+  int system_delay;  // Current system delay buffered in AEC, in samples.
+
+  int mult;  // sampling frequency multiple
+  int sampFreq;
+  size_t num_bands;  // Must match |num_bands| given to WebRtcAec_ProcessFrames().
+  uint32_t seed;     // Passed to WebRtcSpl_RandUArray(); 777 after init.
+
+  float normal_mu;               // stepsize
+  float normal_error_threshold;  // error threshold
+
+  int noiseEstCtr;
+
+  PowerLevel farlevel;
+  PowerLevel nearlevel;
+  PowerLevel linoutlevel;
+  PowerLevel nlpoutlevel;
+
+  int metricsMode;  // Non-zero enables metrics; 0 (disabled) after init.
+  int stateCounter;
+  Stats erl;
+  Stats erle;
+  Stats aNlp;
+  Stats rerl;
+
+  // Quantities to control H band scaling for SWB input
+  int freq_avg_ic;       // initial bin for averaging nlp gain
+  int flag_Hband_cn;     // for comfort noise
+  float cn_scale_Hband;  // scale for comfort noise in H band
+
+  int delay_metrics_delivered;  // Set to 1 by WebRtcAec_GetDelayMetricsCore().
+  int delay_histogram[kHistorySizeBlocks];
+  int num_delay_values;
+  int delay_median;
+  int delay_std;
+  float fraction_poor_delays;
+  int delay_logging_enabled;  // On if requested or if delay agnostic is on.
+  void* delay_estimator_farend;
+  void* delay_estimator;
+  // Variables associated with delay correction through signal based delay
+  // estimation feedback.
+  int signal_delay_correction;  // Running sum of the elements moved.
+  int previous_delay;
+  int delay_correction_count;
+  int shift_offset;
+  float delay_quality_threshold;
+  int frame_count;  // Incremented once per WebRtcAec_ProcessFrames() call.
+
+  // 0 = delay agnostic mode (signal based delay correction) disabled.
+  // Otherwise enabled.
+  int delay_agnostic_enabled;
+  // 1 = extended filter mode enabled, 0 = disabled.
+  int extended_filter_enabled;
+  // Runtime selection of number of filter partitions.
+  int num_partitions;
+
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+  // Sequence number of this AEC instance, so that different instances can
+  // choose different dump file names.
+  int instance_index;
+
+  // Number of times we've restarted dumping; used to pick new dump file names
+  // each time.
+  int debug_dump_count;
+
+  RingBuffer* far_time_buf;
+  rtc_WavWriter* farFile;
+  rtc_WavWriter* nearFile;
+  rtc_WavWriter* outFile;
+  rtc_WavWriter* outLinearFile;
+  FILE* e_fft_file;
+#endif
+};
+
+typedef void (*WebRtcAecFilterFar)(AecCore* aec, float yf[2][PART_LEN1]);
+extern WebRtcAecFilterFar WebRtcAec_FilterFar;
+typedef void (*WebRtcAecScaleErrorSignal)(AecCore* aec, float ef[2][PART_LEN1]);
+extern WebRtcAecScaleErrorSignal WebRtcAec_ScaleErrorSignal;
+typedef void (*WebRtcAecFilterAdaptation)(AecCore* aec,
+                                          float* fft,
+                                          float ef[2][PART_LEN1]);
+extern WebRtcAecFilterAdaptation WebRtcAec_FilterAdaptation;
+typedef void (*WebRtcAecOverdriveAndSuppress)(AecCore* aec,
+                                              float hNl[PART_LEN1],
+                                              const float hNlFb,
+                                              float efw[2][PART_LEN1]);
+extern WebRtcAecOverdriveAndSuppress WebRtcAec_OverdriveAndSuppress;
+// NOTE(review): externs here look like per-platform dispatch hooks; confirm.
+typedef void (*WebRtcAecComfortNoise)(AecCore* aec,
+                                      float efw[2][PART_LEN1],
+                                      complex_t* comfortNoiseHband,
+                                      const float* noisePow,
+                                      const float* lambda);
+extern WebRtcAecComfortNoise WebRtcAec_ComfortNoise;
+
+typedef void (*WebRtcAecSubBandCoherence)(AecCore* aec,
+                                          float efw[2][PART_LEN1],
+                                          float xfw[2][PART_LEN1],
+                                          float* fft,
+                                          float* cohde,
+                                          float* cohxd);
+extern WebRtcAecSubBandCoherence WebRtcAec_SubbandCoherence;
+
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_
diff --git a/webrtc/modules/audio_processing/aec/aec_core_mips.c b/webrtc/modules/audio_processing/aec/aec_core_mips.c
new file mode 100644
index 0000000000..bb33087aee
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_core_mips.c
@@ -0,0 +1,774 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * The core AEC algorithm, which is presented with time-aligned signals.
+ */
+
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+
+#include <math.h>
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
+
+// Flag for adding comfort noise in H band.
+static const int flagHbandCn = 1;
+// Curves declared here and defined outside this file; PART_LEN1 (65) entries
+// each.
+extern const float WebRtcAec_weightCurve[PART_LEN1];
+extern const float WebRtcAec_overDriveCurve[PART_LEN1];
+
+void WebRtcAec_ComfortNoise_mips(AecCore* aec,
+                                 float efw[2][PART_LEN1],
+                                 complex_t* comfortNoiseHband,
+                                 const float* noisePow,
+                                 const float* lambda) {
+  int i, num;
+  float rand[PART_LEN];
+  float noise, noiseAvg, tmp, tmpAvg;
+  int16_t randW16[PART_LEN];
+  complex_t u[PART_LEN1];
+
+  const float pi2 = 6.28318530717959f;
+  const float pi2t = pi2 / 32768;
+
+  // Generate a uniform random array on [0 1]
+  WebRtcSpl_RandUArray(randW16, PART_LEN, &aec->seed);
+
+  int16_t* randWptr = randW16;
+  float randTemp, randTemp2, randTemp3, randTemp4;
+  int32_t tmp1s, tmp2s, tmp3s, tmp4s;
+
+  for (i = 0; i < PART_LEN; i+=4) {
+    __asm __volatile (
+      ".set     push                                           \n\t"
+      ".set     noreorder                                      \n\t"
+      "lh       %[tmp1s],       0(%[randWptr])                 \n\t"
+      "lh       %[tmp2s],       2(%[randWptr])                 \n\t"
+      "lh       %[tmp3s],       4(%[randWptr])                 \n\t"
+      "lh       %[tmp4s],       6(%[randWptr])                 \n\t"
+      "mtc1     %[tmp1s],       %[randTemp]                    \n\t"
+      "mtc1     %[tmp2s],       %[randTemp2]                   \n\t"
+      "mtc1     %[tmp3s],       %[randTemp3]                   \n\t"
+      "mtc1     %[tmp4s],       %[randTemp4]                   \n\t"
+      "cvt.s.w  %[randTemp],    %[randTemp]                    \n\t"
+      "cvt.s.w  %[randTemp2],   %[randTemp2]                   \n\t"
+      "cvt.s.w  %[randTemp3],   %[randTemp3]                   \n\t"
+      "cvt.s.w  %[randTemp4],   %[randTemp4]                   \n\t"
+      "addiu    %[randWptr],    %[randWptr],      8            \n\t"
+      "mul.s    %[randTemp],    %[randTemp],      %[pi2t]      \n\t"
+      "mul.s    %[randTemp2],   %[randTemp2],     %[pi2t]      \n\t"
+      "mul.s    %[randTemp3],   %[randTemp3],     %[pi2t]      \n\t"
+      "mul.s    %[randTemp4],   %[randTemp4],     %[pi2t]      \n\t"
+      ".set     pop                                            \n\t"
+      : [randWptr] "+r" (randWptr), [randTemp] "=&f" (randTemp),
+        [randTemp2] "=&f" (randTemp2), [randTemp3] "=&f" (randTemp3),
+        [randTemp4] "=&f" (randTemp4), [tmp1s] "=&r" (tmp1s),
+        [tmp2s] "=&r" (tmp2s), [tmp3s] "=&r" (tmp3s),
+        [tmp4s] "=&r" (tmp4s)
+      : [pi2t] "f" (pi2t)
+      : "memory"
+    );
+
+    u[i+1][0] = cosf(randTemp);
+    u[i+1][1] = sinf(randTemp);
+    u[i+2][0] = cosf(randTemp2);
+    u[i+2][1] = sinf(randTemp2);
+    u[i+3][0] = cosf(randTemp3);
+    u[i+3][1] = sinf(randTemp3);
+    u[i+4][0] = cosf(randTemp4);
+    u[i+4][1] = sinf(randTemp4);
+  }
+
+  // Reject LF noise
+  float* u_ptr = &u[1][0];
+  float noise2, noise3, noise4;
+  float tmp1f, tmp2f, tmp3f, tmp4f, tmp5f, tmp6f, tmp7f, tmp8f;
+
+  u[0][0] = 0;
+  u[0][1] = 0;
+  for (i = 1; i < PART_LEN1; i+=4) {
+    __asm __volatile (
+      ".set     push                                            \n\t"
+      ".set     noreorder                                       \n\t"
+      "lwc1     %[noise],       4(%[noisePow])                  \n\t"
+      "lwc1     %[noise2],      8(%[noisePow])                  \n\t"
+      "lwc1     %[noise3],      12(%[noisePow])                 \n\t"
+      "lwc1     %[noise4],      16(%[noisePow])                 \n\t"
+      "sqrt.s   %[noise],       %[noise]                        \n\t"
+      "sqrt.s   %[noise2],      %[noise2]                       \n\t"
+      "sqrt.s   %[noise3],      %[noise3]                       \n\t"
+      "sqrt.s   %[noise4],      %[noise4]                       \n\t"
+      "lwc1     %[tmp1f],       0(%[u_ptr])                     \n\t"
+      "lwc1     %[tmp2f],       4(%[u_ptr])                     \n\t"
+      "lwc1     %[tmp3f],       8(%[u_ptr])                     \n\t"
+      "lwc1     %[tmp4f],       12(%[u_ptr])                    \n\t"
+      "lwc1     %[tmp5f],       16(%[u_ptr])                    \n\t"
+      "lwc1     %[tmp6f],       20(%[u_ptr])                    \n\t"
+      "lwc1     %[tmp7f],       24(%[u_ptr])                    \n\t"
+      "lwc1     %[tmp8f],       28(%[u_ptr])                    \n\t"
+      "addiu    %[noisePow],    %[noisePow],      16            \n\t"
+      "mul.s    %[tmp1f],       %[tmp1f],         %[noise]      \n\t"
+      "mul.s    %[tmp2f],       %[tmp2f],         %[noise]      \n\t"
+      "mul.s    %[tmp3f],       %[tmp3f],         %[noise2]     \n\t"
+      "mul.s    %[tmp4f],       %[tmp4f],         %[noise2]     \n\t"
+      "mul.s    %[tmp5f],       %[tmp5f],         %[noise3]     \n\t"
+      "mul.s    %[tmp6f],       %[tmp6f],         %[noise3]     \n\t"
+      "swc1     %[tmp1f],       0(%[u_ptr])                     \n\t"
+      "swc1     %[tmp3f],       8(%[u_ptr])                     \n\t"
+      "mul.s    %[tmp8f],       %[tmp8f],         %[noise4]     \n\t"
+      "mul.s    %[tmp7f],       %[tmp7f],         %[noise4]     \n\t"
+      "neg.s    %[tmp2f]                                        \n\t"
+      "neg.s    %[tmp4f]                                        \n\t"
+      "neg.s    %[tmp6f]                                        \n\t"
+      "neg.s    %[tmp8f]                                        \n\t"
+      "swc1     %[tmp5f],       16(%[u_ptr])                    \n\t"
+      "swc1     %[tmp7f],       24(%[u_ptr])                    \n\t"
+      "swc1     %[tmp2f],       4(%[u_ptr])                     \n\t"
+      "swc1     %[tmp4f],       12(%[u_ptr])                    \n\t"
+      "swc1     %[tmp6f],       20(%[u_ptr])                    \n\t"
+      "swc1     %[tmp8f],       28(%[u_ptr])                    \n\t"
+      "addiu    %[u_ptr],       %[u_ptr],         32            \n\t"
+      ".set     pop                                             \n\t"
+      : [u_ptr] "+r" (u_ptr),  [noisePow] "+r" (noisePow),
+        [noise] "=&f" (noise), [noise2] "=&f" (noise2),
+        [noise3] "=&f" (noise3), [noise4] "=&f" (noise4),
+        [tmp1f] "=&f" (tmp1f), [tmp2f] "=&f" (tmp2f),
+        [tmp3f] "=&f" (tmp3f), [tmp4f] "=&f" (tmp4f),
+        [tmp5f] "=&f" (tmp5f), [tmp6f] "=&f" (tmp6f),
+        [tmp7f] "=&f" (tmp7f), [tmp8f] "=&f" (tmp8f)
+      :
+      : "memory"
+    );
+  }
+  u[PART_LEN][1] = 0;
+  noisePow -= PART_LEN;
+
+  u_ptr = &u[0][0];
+  float* u_ptr_end = &u[PART_LEN][0];
+  float* efw_ptr_0 = &efw[0][0];
+  float* efw_ptr_1 = &efw[1][0];
+  float tmp9f, tmp10f;
+  const float tmp1c = 1.0;
+
+  __asm __volatile (
+    ".set     push                                                        \n\t"
+    ".set     noreorder                                                   \n\t"
+   "1:                                                                    \n\t"
+    "lwc1     %[tmp1f],       0(%[lambda])                                \n\t"
+    "lwc1     %[tmp6f],       4(%[lambda])                                \n\t"
+    "addiu    %[lambda],      %[lambda],        8                         \n\t"
+    "c.lt.s   %[tmp1f],       %[tmp1c]                                    \n\t"
+    "bc1f     4f                                                          \n\t"
+    " nop                                                                 \n\t"
+    "c.lt.s   %[tmp6f],       %[tmp1c]                                    \n\t"
+    "bc1f     3f                                                          \n\t"
+    " nop                                                                 \n\t"
+   "2:                                                                    \n\t"
+    "mul.s    %[tmp1f],       %[tmp1f],         %[tmp1f]                  \n\t"
+    "mul.s    %[tmp6f],       %[tmp6f],         %[tmp6f]                  \n\t"
+    "sub.s    %[tmp1f],       %[tmp1c],         %[tmp1f]                  \n\t"
+    "sub.s    %[tmp6f],       %[tmp1c],         %[tmp6f]                  \n\t"
+    "sqrt.s   %[tmp1f],       %[tmp1f]                                    \n\t"
+    "sqrt.s   %[tmp6f],       %[tmp6f]                                    \n\t"
+    "lwc1     %[tmp2f],       0(%[efw_ptr_0])                             \n\t"
+    "lwc1     %[tmp3f],       0(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp7f],       4(%[efw_ptr_0])                             \n\t"
+    "lwc1     %[tmp8f],       8(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp4f],       0(%[efw_ptr_1])                             \n\t"
+    "lwc1     %[tmp5f],       4(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp9f],       4(%[efw_ptr_1])                             \n\t"
+    "lwc1     %[tmp10f],      12(%[u_ptr])                                \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s    %[tmp3f],       %[tmp1f],         %[tmp3f]                  \n\t"
+    "add.s    %[tmp2f],       %[tmp2f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp1f],         %[tmp5f]                  \n\t"
+    "add.s    %[tmp4f],       %[tmp4f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp6f],         %[tmp8f]                  \n\t"
+    "add.s    %[tmp7f],       %[tmp7f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp6f],         %[tmp10f]                 \n\t"
+    "add.s    %[tmp9f],       %[tmp9f],         %[tmp3f]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+    "madd.s   %[tmp2f],       %[tmp2f],         %[tmp1f],     %[tmp3f]    \n\t"
+    "madd.s   %[tmp4f],       %[tmp4f],         %[tmp1f],     %[tmp5f]    \n\t"
+    "madd.s   %[tmp7f],       %[tmp7f],         %[tmp6f],     %[tmp8f]    \n\t"
+    "madd.s   %[tmp9f],       %[tmp9f],         %[tmp6f],     %[tmp10f]   \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "swc1     %[tmp2f],       0(%[efw_ptr_0])                             \n\t"
+    "swc1     %[tmp4f],       0(%[efw_ptr_1])                             \n\t"
+    "swc1     %[tmp7f],       4(%[efw_ptr_0])                             \n\t"
+    "b        5f                                                          \n\t"
+    " swc1    %[tmp9f],       4(%[efw_ptr_1])                             \n\t"
+   "3:                                                                    \n\t"
+    "mul.s    %[tmp1f],       %[tmp1f],         %[tmp1f]                  \n\t"
+    "sub.s    %[tmp1f],       %[tmp1c],         %[tmp1f]                  \n\t"
+    "sqrt.s   %[tmp1f],       %[tmp1f]                                    \n\t"
+    "lwc1     %[tmp2f],       0(%[efw_ptr_0])                             \n\t"
+    "lwc1     %[tmp3f],       0(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp4f],       0(%[efw_ptr_1])                             \n\t"
+    "lwc1     %[tmp5f],       4(%[u_ptr])                                 \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s    %[tmp3f],       %[tmp1f],         %[tmp3f]                  \n\t"
+    "add.s    %[tmp2f],       %[tmp2f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp1f],         %[tmp5f]                  \n\t"
+    "add.s    %[tmp4f],       %[tmp4f],         %[tmp3f]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+    "madd.s   %[tmp2f],       %[tmp2f],         %[tmp1f],     %[tmp3f]    \n\t"
+    "madd.s   %[tmp4f],       %[tmp4f],         %[tmp1f],     %[tmp5f]    \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "swc1     %[tmp2f],       0(%[efw_ptr_0])                             \n\t"
+    "b        5f                                                          \n\t"
+    " swc1    %[tmp4f],       0(%[efw_ptr_1])                             \n\t"
+   "4:                                                                    \n\t"
+    "c.lt.s   %[tmp6f],       %[tmp1c]                                    \n\t"
+    "bc1f     5f                                                          \n\t"
+    " nop                                                                 \n\t"
+    "mul.s    %[tmp6f],       %[tmp6f],         %[tmp6f]                  \n\t"
+    "sub.s    %[tmp6f],       %[tmp1c],         %[tmp6f]                  \n\t"
+    "sqrt.s   %[tmp6f],       %[tmp6f]                                    \n\t"
+    "lwc1     %[tmp7f],       4(%[efw_ptr_0])                             \n\t"
+    "lwc1     %[tmp8f],       8(%[u_ptr])                                 \n\t"
+    "lwc1     %[tmp9f],       4(%[efw_ptr_1])                             \n\t"
+    "lwc1     %[tmp10f],      12(%[u_ptr])                                \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s    %[tmp3f],       %[tmp6f],         %[tmp8f]                  \n\t"
+    "add.s    %[tmp7f],       %[tmp7f],         %[tmp3f]                  \n\t"
+    "mul.s    %[tmp3f],       %[tmp6f],         %[tmp10f]                 \n\t"
+    "add.s    %[tmp9f],       %[tmp9f],         %[tmp3f]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+    "madd.s   %[tmp7f],       %[tmp7f],         %[tmp6f],     %[tmp8f]    \n\t"
+    "madd.s   %[tmp9f],       %[tmp9f],         %[tmp6f],     %[tmp10f]   \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+    "swc1     %[tmp7f],       4(%[efw_ptr_0])                             \n\t"
+    "swc1     %[tmp9f],       4(%[efw_ptr_1])                             \n\t"
+   "5:                                                                    \n\t"
+    "addiu    %[u_ptr],       %[u_ptr],         16                        \n\t"
+    "addiu    %[efw_ptr_0],   %[efw_ptr_0],     8                         \n\t"
+    "bne      %[u_ptr],       %[u_ptr_end],     1b                        \n\t"
+    " addiu   %[efw_ptr_1],   %[efw_ptr_1],     8                         \n\t"
+    ".set     pop                                                         \n\t"
+    : [lambda] "+r" (lambda), [u_ptr] "+r" (u_ptr),
+      [efw_ptr_0] "+r" (efw_ptr_0), [efw_ptr_1] "+r" (efw_ptr_1),
+      [tmp1f] "=&f" (tmp1f), [tmp2f] "=&f" (tmp2f), [tmp3f] "=&f" (tmp3f),
+      [tmp4f] "=&f" (tmp4f), [tmp5f] "=&f" (tmp5f),
+      [tmp6f] "=&f" (tmp6f), [tmp7f] "=&f" (tmp7f), [tmp8f] "=&f" (tmp8f),
+      [tmp9f] "=&f" (tmp9f), [tmp10f] "=&f" (tmp10f)
+    : [tmp1c] "f" (tmp1c), [u_ptr_end] "r" (u_ptr_end)
+    : "memory"
+  );
+
+  lambda -= PART_LEN;
+  tmp = sqrtf(WEBRTC_SPL_MAX(1 - lambda[PART_LEN] * lambda[PART_LEN], 0));
+  //tmp = 1 - lambda[i];
+  efw[0][PART_LEN] += tmp * u[PART_LEN][0];
+  efw[1][PART_LEN] += tmp * u[PART_LEN][1];
+
+  // For H band comfort noise
+  // TODO: don't compute noise and "tmp" twice. Use the previous results.
+  noiseAvg = 0.0;
+  tmpAvg = 0.0;
+  num = 0;
+  if ((aec->sampFreq == 32000 || aec->sampFreq == 48000) && flagHbandCn == 1) {
+    for (i = 0; i < PART_LEN; i++) {
+      rand[i] = ((float)randW16[i]) / 32768;
+    }
+
+    // average noise scale
+    // average over second half of freq spectrum (i.e., 4->8khz)
+    // TODO: we shouldn't need num. We know how many elements we're summing.
+    for (i = PART_LEN1 >> 1; i < PART_LEN1; i++) {
+      num++;
+      noiseAvg += sqrtf(noisePow[i]);
+    }
+    noiseAvg /= (float)num;
+
+    // average nlp scale
+    // average over second half of freq spectrum (i.e., 4->8khz)
+    // TODO: we shouldn't need num. We know how many elements we're summing.
+    num = 0;
+    for (i = PART_LEN1 >> 1; i < PART_LEN1; i++) {
+      num++;
+      tmpAvg += sqrtf(WEBRTC_SPL_MAX(1 - lambda[i] * lambda[i], 0));
+    }
+    tmpAvg /= (float)num;
+
+    // Use average noise for H band
+    // TODO: we should probably have a new random vector here.
+    // Reject LF noise
+    u[0][0] = 0;
+    u[0][1] = 0;
+    for (i = 1; i < PART_LEN1; i++) {
+      tmp = pi2 * rand[i - 1];
+
+      // Use average noise for H band
+      u[i][0] = noiseAvg * (float)cos(tmp);
+      u[i][1] = -noiseAvg * (float)sin(tmp);
+    }
+    u[PART_LEN][1] = 0;
+
+    for (i = 0; i < PART_LEN1; i++) {
+      // Use average NLP weight for H band
+      comfortNoiseHband[i][0] = tmpAvg * u[i][0];
+      comfortNoiseHband[i][1] = tmpAvg * u[i][1];
+    }
+  }
+}
+
+// Accumulates into yf, for every filter partition, the complex product of
+// the spectrum stored in aec->xfBuf and the coefficients in aec->wfBuf:
+//   yf[0][k] += xfRe[k] * wfRe[k] - xfIm[k] * wfIm[k]
+//   yf[1][k] += xfRe[k] * wfIm[k] + xfIm[k] * wfRe[k]
+// aec: supplies num_partitions, xfBufBlockPos, xfBuf and wfBuf.
+// yf:  accumulator, updated in place (yf[0] real part, yf[1] imaginary part).
+// The main loop handles two bins per iteration for (PART_LEN1 >> 1)
+// iterations; one further bin is processed after the loop, so the routine
+// covers PART_LEN1 bins only when PART_LEN1 is odd.
+void WebRtcAec_FilterFar_mips(AecCore* aec, float yf[2][PART_LEN1]) {
+  int i;
+  for (i = 0; i < aec->num_partitions; i++) {
+    int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
+    int pos = i * PART_LEN1;
+    // Check for wrap
+    if (i + aec->xfBufBlockPos >= aec->num_partitions) {
+      xPos -= aec->num_partitions * (PART_LEN1);
+    }
+    float* yf0 = yf[0];
+    float* yf1 = yf[1];
+    float* aRe = aec->xfBuf[0] + xPos;
+    float* aIm = aec->xfBuf[1] + xPos;
+    float* bRe = aec->wfBuf[0] + pos;
+    float* bIm = aec->wfBuf[1] + pos;
+    // Scratch FP registers. f11 and f12 are referenced only by the
+    // non-MIPS32_R2_LE instruction sequence. No other scratch registers are
+    // bound, so the compiler is not forced to reserve registers the
+    // template never touches.
+    float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f11, f12;
+    int len = PART_LEN1 >> 1;
+
+    __asm __volatile (
+      ".set       push                                                \n\t"
+      ".set       noreorder                                           \n\t"
+      // Main loop: two complex multiply-accumulates per iteration.
+     "1:                                                              \n\t"
+      "lwc1       %[f0],      0(%[aRe])                               \n\t"
+      "lwc1       %[f1],      0(%[bRe])                               \n\t"
+      "lwc1       %[f2],      0(%[bIm])                               \n\t"
+      "lwc1       %[f3],      0(%[aIm])                               \n\t"
+      "lwc1       %[f4],      4(%[aRe])                               \n\t"
+      "lwc1       %[f5],      4(%[bRe])                               \n\t"
+      "lwc1       %[f6],      4(%[bIm])                               \n\t"
+      "mul.s      %[f8],      %[f0],          %[f1]                   \n\t"
+      "mul.s      %[f0],      %[f0],          %[f2]                   \n\t"
+      "mul.s      %[f9],      %[f4],          %[f5]                   \n\t"
+      "mul.s      %[f4],      %[f4],          %[f6]                   \n\t"
+      "lwc1       %[f7],      4(%[aIm])                               \n\t"
+#if !defined(MIPS32_R2_LE)
+      "mul.s      %[f12],     %[f2],          %[f3]                   \n\t"
+      "mul.s      %[f1],      %[f3],          %[f1]                   \n\t"
+      "mul.s      %[f11],     %[f6],          %[f7]                   \n\t"
+      "addiu      %[aRe],     %[aRe],         8                       \n\t"
+      "addiu      %[aIm],     %[aIm],         8                       \n\t"
+      "addiu      %[len],     %[len],         -1                      \n\t"
+      "sub.s      %[f8],      %[f8],          %[f12]                  \n\t"
+      "mul.s      %[f12],     %[f7],          %[f5]                   \n\t"
+      "lwc1       %[f2],      0(%[yf0])                               \n\t"
+      "add.s      %[f1],      %[f0],          %[f1]                   \n\t"
+      "lwc1       %[f3],      0(%[yf1])                               \n\t"
+      "sub.s      %[f9],      %[f9],          %[f11]                  \n\t"
+      "lwc1       %[f6],      4(%[yf0])                               \n\t"
+      "add.s      %[f4],      %[f4],          %[f12]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "addiu      %[aRe],     %[aRe],         8                       \n\t"
+      "addiu      %[aIm],     %[aIm],         8                       \n\t"
+      "addiu      %[len],     %[len],         -1                      \n\t"
+      "nmsub.s    %[f8],      %[f8],          %[f2],      %[f3]       \n\t"
+      "lwc1       %[f2],      0(%[yf0])                               \n\t"
+      "madd.s     %[f1],      %[f0],          %[f3],      %[f1]       \n\t"
+      "lwc1       %[f3],      0(%[yf1])                               \n\t"
+      "nmsub.s    %[f9],      %[f9],          %[f6],      %[f7]       \n\t"
+      "lwc1       %[f6],      4(%[yf0])                               \n\t"
+      "madd.s     %[f4],      %[f4],          %[f7],      %[f5]       \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "lwc1       %[f5],      4(%[yf1])                               \n\t"
+      "add.s      %[f2],      %[f2],          %[f8]                   \n\t"
+      "addiu      %[bRe],     %[bRe],         8                       \n\t"
+      "addiu      %[bIm],     %[bIm],         8                       \n\t"
+      "add.s      %[f3],      %[f3],          %[f1]                   \n\t"
+      "add.s      %[f6],      %[f6],          %[f9]                   \n\t"
+      "add.s      %[f5],      %[f5],          %[f4]                   \n\t"
+      "swc1       %[f2],      0(%[yf0])                               \n\t"
+      "swc1       %[f3],      0(%[yf1])                               \n\t"
+      "swc1       %[f6],      4(%[yf0])                               \n\t"
+      "swc1       %[f5],      4(%[yf1])                               \n\t"
+      "addiu      %[yf0],     %[yf0],         8                       \n\t"
+      "bgtz       %[len],     1b                                      \n\t"
+      // Branch delay slot: executed on every pass, taken or not.
+      " addiu     %[yf1],     %[yf1],         8                       \n\t"
+      // Tail: the single bin left over after the two-at-a-time loop.
+      "lwc1       %[f0],      0(%[aRe])                               \n\t"
+      "lwc1       %[f1],      0(%[bRe])                               \n\t"
+      "lwc1       %[f2],      0(%[bIm])                               \n\t"
+      "lwc1       %[f3],      0(%[aIm])                               \n\t"
+      "mul.s      %[f8],      %[f0],          %[f1]                   \n\t"
+      "mul.s      %[f0],      %[f0],          %[f2]                   \n\t"
+#if !defined(MIPS32_R2_LE)
+      "mul.s      %[f12],     %[f2],          %[f3]                   \n\t"
+      "mul.s      %[f1],      %[f3],          %[f1]                   \n\t"
+      "sub.s      %[f8],      %[f8],          %[f12]                  \n\t"
+      "lwc1       %[f2],      0(%[yf0])                               \n\t"
+      "add.s      %[f1],      %[f0],          %[f1]                   \n\t"
+      "lwc1       %[f3],      0(%[yf1])                               \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "nmsub.s    %[f8],      %[f8],          %[f2],      %[f3]       \n\t"
+      "lwc1       %[f2],      0(%[yf0])                               \n\t"
+      "madd.s     %[f1],      %[f0],          %[f3],      %[f1]       \n\t"
+      "lwc1       %[f3],      0(%[yf1])                               \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "add.s      %[f2],      %[f2],          %[f8]                   \n\t"
+      "add.s      %[f3],      %[f3],          %[f1]                   \n\t"
+      "swc1       %[f2],      0(%[yf0])                               \n\t"
+      "swc1       %[f3],      0(%[yf1])                               \n\t"
+      ".set       pop                                                 \n\t"
+      : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+        [f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
+        [f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
+        [f9] "=&f" (f9), [f11] "=&f" (f11), [f12] "=&f" (f12),
+        [aRe] "+r" (aRe), [aIm] "+r" (aIm), [bRe] "+r" (bRe),
+        [bIm] "+r" (bIm), [yf0] "+r" (yf0), [yf1] "+r" (yf1),
+        [len] "+r" (len)
+      :
+      : "memory"
+    );
+  }
+}
+
+// Updates the filter coefficients in aec->wfBuf, one partition at a time.
+// For each partition the routine:
+//   1. writes conj(xfBuf) * ef into fft in interleaved form
+//      (fft[2k] = real part, fft[2k + 1] = imaginary part, with the real
+//      part of the last bin stored in fft[1]);
+//   2. calls aec_rdft_inverse_128(), zeroes fft[PART_LEN .. 2*PART_LEN - 1]
+//      and multiplies the first 64 floats by 2.0f / PART_LEN2;
+//   3. calls aec_rdft_forward_128() and adds the result to wfBuf.
+// aec: supplies num_partitions, xfBufBlockPos, xfBuf and wfBuf (updated).
+// fft: caller-provided scratch buffer, overwritten on every partition.
+// ef:  error spectrum (ef[0] real part, ef[1] imaginary part), read only.
+// NOTE(review): the scaling and accumulation asm blocks hard-code 8 and 31
+// iterations and a 256-byte offset, which presumably encodes PART_LEN == 64;
+// confirm against the PART_LEN definition.
+void WebRtcAec_FilterAdaptation_mips(AecCore* aec,
+                                     float* fft,
+                                     float ef[2][PART_LEN1]) {
+  int i;
+  for (i = 0; i < aec->num_partitions; i++) {
+    int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
+    int pos;
+    // Check for wrap
+    if (i + aec->xfBufBlockPos >= aec->num_partitions) {
+      xPos -= aec->num_partitions * PART_LEN1;
+    }
+
+    pos = i * PART_LEN1;
+    float* aRe = aec->xfBuf[0] + xPos;
+    float* aIm = aec->xfBuf[1] + xPos;
+    float* bRe = ef[0];
+    float* bIm = ef[1];
+    float* fft_tmp;
+
+    // Scratch FP registers. f10 and f11 are referenced only by the
+    // non-MIPS32_R2_LE instruction sequence of the first asm block. No other
+    // scratch registers are bound, so none is reserved needlessly.
+    float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11;
+    int len = PART_LEN >> 1;
+
+    // Step 1: fft[2k]     = aRe[k] * bRe[k] + aIm[k] * bIm[k]
+    //         fft[2k + 1] = aRe[k] * bIm[k] - aIm[k] * bRe[k]
+    // Two bins per iteration; the bin after the loop contributes only its
+    // real part, which is stored in fft[1].
+    __asm __volatile (
+      ".set       push                                                \n\t"
+      ".set       noreorder                                           \n\t"
+      "addiu      %[fft_tmp], %[fft],         0                       \n\t"
+     "1:                                                              \n\t"
+      "lwc1       %[f0],      0(%[aRe])                               \n\t"
+      "lwc1       %[f1],      0(%[bRe])                               \n\t"
+      "lwc1       %[f2],      0(%[bIm])                               \n\t"
+      "lwc1       %[f4],      4(%[aRe])                               \n\t"
+      "lwc1       %[f5],      4(%[bRe])                               \n\t"
+      "lwc1       %[f6],      4(%[bIm])                               \n\t"
+      "addiu      %[aRe],     %[aRe],         8                       \n\t"
+      "addiu      %[bRe],     %[bRe],         8                       \n\t"
+      "mul.s      %[f8],      %[f0],          %[f1]                   \n\t"
+      "mul.s      %[f0],      %[f0],          %[f2]                   \n\t"
+      "lwc1       %[f3],      0(%[aIm])                               \n\t"
+      "mul.s      %[f9],      %[f4],          %[f5]                   \n\t"
+      "lwc1       %[f7],      4(%[aIm])                               \n\t"
+      "mul.s      %[f4],      %[f4],          %[f6]                   \n\t"
+#if !defined(MIPS32_R2_LE)
+      "mul.s      %[f10],     %[f3],          %[f2]                   \n\t"
+      "mul.s      %[f1],      %[f3],          %[f1]                   \n\t"
+      "mul.s      %[f11],     %[f7],          %[f6]                   \n\t"
+      "mul.s      %[f5],      %[f7],          %[f5]                   \n\t"
+      "addiu      %[aIm],     %[aIm],         8                       \n\t"
+      "addiu      %[bIm],     %[bIm],         8                       \n\t"
+      "addiu      %[len],     %[len],         -1                      \n\t"
+      "add.s      %[f8],      %[f8],          %[f10]                  \n\t"
+      "sub.s      %[f1],      %[f0],          %[f1]                   \n\t"
+      "add.s      %[f9],      %[f9],          %[f11]                  \n\t"
+      "sub.s      %[f5],      %[f4],          %[f5]                   \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "addiu      %[aIm],     %[aIm],         8                       \n\t"
+      "addiu      %[bIm],     %[bIm],         8                       \n\t"
+      "addiu      %[len],     %[len],         -1                      \n\t"
+      "madd.s     %[f8],      %[f8],          %[f3],      %[f2]       \n\t"
+      "nmsub.s    %[f1],      %[f0],          %[f3],      %[f1]       \n\t"
+      "madd.s     %[f9],      %[f9],          %[f7],      %[f6]       \n\t"
+      "nmsub.s    %[f5],      %[f4],          %[f7],      %[f5]       \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "swc1       %[f8],      0(%[fft_tmp])                           \n\t"
+      "swc1       %[f1],      4(%[fft_tmp])                           \n\t"
+      "swc1       %[f9],      8(%[fft_tmp])                           \n\t"
+      "swc1       %[f5],      12(%[fft_tmp])                          \n\t"
+      "bgtz       %[len],     1b                                      \n\t"
+      // Branch delay slot: executed on every pass, taken or not.
+      " addiu     %[fft_tmp], %[fft_tmp],     16                      \n\t"
+      // Last bin: real part only, written to fft[1].
+      "lwc1       %[f0],      0(%[aRe])                               \n\t"
+      "lwc1       %[f1],      0(%[bRe])                               \n\t"
+      "lwc1       %[f2],      0(%[bIm])                               \n\t"
+      "lwc1       %[f3],      0(%[aIm])                               \n\t"
+      "mul.s      %[f8],      %[f0],          %[f1]                   \n\t"
+#if !defined(MIPS32_R2_LE)
+      "mul.s      %[f10],     %[f3],          %[f2]                   \n\t"
+      "add.s      %[f8],      %[f8],          %[f10]                  \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "madd.s     %[f8],      %[f8],          %[f3],      %[f2]       \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "swc1       %[f8],      4(%[fft])                               \n\t"
+      ".set       pop                                                 \n\t"
+      : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+        [f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
+        [f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
+        [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
+        [aRe] "+r" (aRe), [aIm] "+r" (aIm), [bRe] "+r" (bRe),
+        [bIm] "+r" (bIm), [fft_tmp] "=&r" (fft_tmp), [len] "+r" (len)
+      : [fft] "r" (fft)
+      : "memory"
+    );
+
+    aec_rdft_inverse_128(fft);
+    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
+
+    // fft scaling
+    {
+      float scale = 2.0f / PART_LEN2;
+      // Step 2: multiply fft[0 .. 63] by scale, eight floats per iteration
+      // for eight iterations.
+      __asm __volatile (
+        ".set     push                                    \n\t"
+        ".set     noreorder                               \n\t"
+        "addiu    %[fft_tmp], %[fft],        0            \n\t"
+        "addiu    %[len],     $zero,         8            \n\t"
+       "1:                                                \n\t"
+        "addiu    %[len],     %[len],        -1           \n\t"
+        "lwc1     %[f0],      0(%[fft_tmp])               \n\t"
+        "lwc1     %[f1],      4(%[fft_tmp])               \n\t"
+        "lwc1     %[f2],      8(%[fft_tmp])               \n\t"
+        "lwc1     %[f3],      12(%[fft_tmp])              \n\t"
+        "mul.s    %[f0],      %[f0],         %[scale]     \n\t"
+        "mul.s    %[f1],      %[f1],         %[scale]     \n\t"
+        "mul.s    %[f2],      %[f2],         %[scale]     \n\t"
+        "mul.s    %[f3],      %[f3],         %[scale]     \n\t"
+        "lwc1     %[f4],      16(%[fft_tmp])              \n\t"
+        "lwc1     %[f5],      20(%[fft_tmp])              \n\t"
+        "lwc1     %[f6],      24(%[fft_tmp])              \n\t"
+        "lwc1     %[f7],      28(%[fft_tmp])              \n\t"
+        "mul.s    %[f4],      %[f4],         %[scale]     \n\t"
+        "mul.s    %[f5],      %[f5],         %[scale]     \n\t"
+        "mul.s    %[f6],      %[f6],         %[scale]     \n\t"
+        "mul.s    %[f7],      %[f7],         %[scale]     \n\t"
+        "swc1     %[f0],      0(%[fft_tmp])               \n\t"
+        "swc1     %[f1],      4(%[fft_tmp])               \n\t"
+        "swc1     %[f2],      8(%[fft_tmp])               \n\t"
+        "swc1     %[f3],      12(%[fft_tmp])              \n\t"
+        "swc1     %[f4],      16(%[fft_tmp])              \n\t"
+        "swc1     %[f5],      20(%[fft_tmp])              \n\t"
+        "swc1     %[f6],      24(%[fft_tmp])              \n\t"
+        "swc1     %[f7],      28(%[fft_tmp])              \n\t"
+        "bgtz     %[len],     1b                          \n\t"
+        " addiu   %[fft_tmp], %[fft_tmp],    32           \n\t"
+        ".set     pop                                     \n\t"
+        : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+          [f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
+          [f6] "=&f" (f6), [f7] "=&f" (f7), [len] "=&r" (len),
+          [fft_tmp] "=&r" (fft_tmp)
+        : [scale] "f" (scale), [fft] "r" (fft)
+        : "memory"
+      );
+    }
+    aec_rdft_forward_128(fft);
+    aRe = aec->wfBuf[0] + pos;
+    aIm = aec->wfBuf[1] + pos;
+    // Step 3: accumulate the transformed block into wfBuf.
+    // Prologue: wfRe[0] += fft[0]; wfRe at byte offset 256 += fft[1];
+    //           wfRe[1] += fft[2]; wfIm[1] += fft[3].
+    // Loop (31 iterations, two bins each):
+    //           wfRe[k] += fft[2k]; wfIm[k] += fft[2k + 1].
+    __asm __volatile (
+      ".set     push                                    \n\t"
+      ".set     noreorder                               \n\t"
+      "addiu    %[fft_tmp], %[fft],        0            \n\t"
+      "addiu    %[len],     $zero,         31           \n\t"
+      "lwc1     %[f0],      0(%[aRe])                   \n\t"
+      "lwc1     %[f1],      0(%[fft_tmp])               \n\t"
+      "lwc1     %[f2],      256(%[aRe])                 \n\t"
+      "lwc1     %[f3],      4(%[fft_tmp])               \n\t"
+      "lwc1     %[f4],      4(%[aRe])                   \n\t"
+      "lwc1     %[f5],      8(%[fft_tmp])               \n\t"
+      "lwc1     %[f6],      4(%[aIm])                   \n\t"
+      "lwc1     %[f7],      12(%[fft_tmp])              \n\t"
+      "add.s    %[f0],      %[f0],         %[f1]        \n\t"
+      "add.s    %[f2],      %[f2],         %[f3]        \n\t"
+      "add.s    %[f4],      %[f4],         %[f5]        \n\t"
+      "add.s    %[f6],      %[f6],         %[f7]        \n\t"
+      "addiu    %[fft_tmp], %[fft_tmp],    16           \n\t"
+      "swc1     %[f0],      0(%[aRe])                   \n\t"
+      "swc1     %[f2],      256(%[aRe])                 \n\t"
+      "swc1     %[f4],      4(%[aRe])                   \n\t"
+      "addiu    %[aRe],     %[aRe],        8            \n\t"
+      "swc1     %[f6],      4(%[aIm])                   \n\t"
+      "addiu    %[aIm],     %[aIm],        8            \n\t"
+     "1:                                                \n\t"
+      "lwc1     %[f0],      0(%[aRe])                   \n\t"
+      "lwc1     %[f1],      0(%[fft_tmp])               \n\t"
+      "lwc1     %[f2],      0(%[aIm])                   \n\t"
+      "lwc1     %[f3],      4(%[fft_tmp])               \n\t"
+      "lwc1     %[f4],      4(%[aRe])                   \n\t"
+      "lwc1     %[f5],      8(%[fft_tmp])               \n\t"
+      "lwc1     %[f6],      4(%[aIm])                   \n\t"
+      "lwc1     %[f7],      12(%[fft_tmp])              \n\t"
+      "add.s    %[f0],      %[f0],         %[f1]        \n\t"
+      "add.s    %[f2],      %[f2],         %[f3]        \n\t"
+      "add.s    %[f4],      %[f4],         %[f5]        \n\t"
+      "add.s    %[f6],      %[f6],         %[f7]        \n\t"
+      "addiu    %[len],     %[len],        -1           \n\t"
+      "addiu    %[fft_tmp], %[fft_tmp],    16           \n\t"
+      "swc1     %[f0],      0(%[aRe])                   \n\t"
+      "swc1     %[f2],      0(%[aIm])                   \n\t"
+      "swc1     %[f4],      4(%[aRe])                   \n\t"
+      "addiu    %[aRe],     %[aRe],        8            \n\t"
+      "swc1     %[f6],      4(%[aIm])                   \n\t"
+      "bgtz     %[len],     1b                          \n\t"
+      " addiu   %[aIm],     %[aIm],        8            \n\t"
+      ".set     pop                                     \n\t"
+      : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+        [f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
+        [f6] "=&f" (f6), [f7] "=&f" (f7), [len] "=&r" (len),
+        [fft_tmp] "=&r" (fft_tmp), [aRe] "+r" (aRe), [aIm] "+r" (aIm)
+      : [fft] "r" (fft)
+      : "memory"
+    );
+  }
+}
+
+void WebRtcAec_OverdriveAndSuppress_mips(AecCore* aec,
+                                         float hNl[PART_LEN1],
+                                         const float hNlFb,
+                                         float efw[2][PART_LEN1]) {  /*
+   * Applies three steps, in order, to each of the PART_LEN1 bins i:
+   *   1. if hNl[i] > hNlFb:
+   *        hNl[i] = wC[i] * hNlFb + (1 - wC[i]) * hNl[i],
+   *      where wC walks WebRtcAec_weightCurve;
+   *   2. hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
+   *   3. efw[0][i] is multiplied by hNl[i]; efw[1][i] is multiplied by
+   *      hNl[i] and then negated.
+   * aec:   read for overDriveSm only.
+   * hNl:   per-bin gains, updated in place.
+   * hNlFb: threshold and blend target used in step 1.
+   * efw:   spectrum scaled in place (efw[0] real part, efw[1] imaginary part).
+   */
+  int i;
+  const float one = 1.0;
+  float* p_hNl;
+  float* p_efw0;
+  float* p_efw1;
+  float* p_WebRtcAec_wC;
+  float temp1, temp2, temp3, temp4;
+
+  p_hNl = &hNl[0];
+  p_efw0 = &efw[0][0];
+  p_efw1 = &efw[1][0];
+  p_WebRtcAec_wC = (float*)&WebRtcAec_weightCurve[0];  // the asm below only loads through this pointer
+
+  for (i = 0; i < PART_LEN1; i++) {
+    // Weight subbands
+    __asm __volatile (
+      ".set      push                                              \n\t"
+      ".set      noreorder                                         \n\t"
+      "lwc1      %[temp1],    0(%[p_hNl])                          \n\t"
+      "lwc1      %[temp2],    0(%[p_wC])                           \n\t"
+      "c.lt.s    %[hNlFb],    %[temp1]                             \n\t"  // hNlFb < hNl[i] ?
+      "bc1f      1f                                                \n\t"  // no: leave hNl[i] untouched
+      " mul.s    %[temp3],    %[temp2],     %[hNlFb]               \n\t"  // delay slot: wC * hNlFb, runs either way
+      "sub.s     %[temp4],    %[one],       %[temp2]               \n\t"  // 1 - wC
+#if !defined(MIPS32_R2_LE)
+      "mul.s     %[temp1],    %[temp1],     %[temp4]               \n\t"
+      "add.s     %[temp1],    %[temp3],     %[temp1]               \n\t"
+#else // #if !defined(MIPS32_R2_LE)
+      "madd.s    %[temp1],    %[temp3],     %[temp1],   %[temp4]   \n\t"
+#endif // #if !defined(MIPS32_R2_LE)
+      "swc1      %[temp1],    0(%[p_hNl])                          \n\t"
+     "1:                                                           \n\t"
+      "addiu     %[p_wC],     %[p_wC],      4                      \n\t"  // next weight-curve entry
+      ".set      pop                                               \n\t"
+      : [temp1] "=&f" (temp1), [temp2] "=&f" (temp2), [temp3] "=&f" (temp3),
+        [temp4] "=&f" (temp4), [p_wC] "+r" (p_WebRtcAec_wC)
+      : [hNlFb] "f" (hNlFb), [one] "f" (one), [p_hNl] "r" (p_hNl)
+      : "memory"
+    );
+
+    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
+
+    __asm __volatile (
+      "lwc1      %[temp1],    0(%[p_hNl])              \n\t"  // reloads hNl[i] as written by powf above
+      "lwc1      %[temp3],    0(%[p_efw1])             \n\t"
+      "lwc1      %[temp2],    0(%[p_efw0])             \n\t"
+      "addiu     %[p_hNl],    %[p_hNl],     4          \n\t"
+      "mul.s     %[temp3],    %[temp3],     %[temp1]   \n\t"
+      "mul.s     %[temp2],    %[temp2],     %[temp1]   \n\t"
+      "addiu     %[p_efw0],   %[p_efw0],    4          \n\t"
+      "addiu     %[p_efw1],   %[p_efw1],    4          \n\t"
+      "neg.s     %[temp4],    %[temp3]                 \n\t"  // sign flip of the efw[1] term
+      "swc1      %[temp2],    -4(%[p_efw0])            \n\t"  // -4: pointers were already advanced
+      "swc1      %[temp4],    -4(%[p_efw1])            \n\t"
+      : [temp1] "=&f" (temp1), [temp2] "=&f" (temp2), [temp3] "=&f" (temp3),
+        [temp4] "=&f" (temp4), [p_efw0] "+r" (p_efw0), [p_efw1] "+r" (p_efw1),
+        [p_hNl] "+r" (p_hNl)
+      :
+      : "memory"
+    );
+  }
+}
+
+void WebRtcAec_ScaleErrorSignal_mips(AecCore* aec, float ef[2][PART_LEN1]) {  /*
+   * Normalizes, limits and scales the error spectrum ef in place.
+   * For each of the PART_LEN1 bins i:
+   *   1. ef[0][i] and ef[1][i] are divided by (aec->xPow[i] + 1e-10f);
+   *   2. if ef[0][i]^2 + ef[1][i]^2 exceeds error_threshold^2, both parts
+   *      are multiplied by error_threshold / (magnitude + 1e-10f), where
+   *      magnitude is the square root of that sum;
+   *   3. both parts are multiplied by mu.
+   * mu and error_threshold come from the extended-filter constants when
+   * aec->extended_filter_enabled is set, otherwise from aec->normal_mu and
+   * aec->normal_error_threshold.
+   */
+  const float mu = aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
+  const float error_threshold = aec->extended_filter_enabled
+                                    ? kExtendedErrorThreshold
+                                    : aec->normal_error_threshold;
+  int len = (PART_LEN1);
+  float* ef0 = ef[0];
+  float* ef1 = ef[1];
+  float* xPow = aec->xPow;
+  float fac1 = 1e-10f;  // small offset added to both denominators
+  float err_th2 = error_threshold * error_threshold;  // compared against the squared magnitude
+  float f0, f1, f2;
+#if !defined(MIPS32_R2_LE)
+  float f3;  // extra scratch register, needed only when madd.s is not used
+#endif
+
+  __asm __volatile (
+    ".set       push                                   \n\t"
+    ".set       noreorder                              \n\t"
+   "1:                                                 \n\t"
+    "lwc1       %[f0],     0(%[xPow])                  \n\t"
+    "lwc1       %[f1],     0(%[ef0])                   \n\t"
+    "lwc1       %[f2],     0(%[ef1])                   \n\t"
+    "add.s      %[f0],     %[f0],       %[fac1]        \n\t"  // xPow[i] + 1e-10f
+    "div.s      %[f1],     %[f1],       %[f0]          \n\t"
+    "div.s      %[f2],     %[f2],       %[f0]          \n\t"
+    "mul.s      %[f0],     %[f1],       %[f1]          \n\t"
+#if defined(MIPS32_R2_LE)
+    "madd.s     %[f0],     %[f0],       %[f2],   %[f2] \n\t"
+#else
+    "mul.s      %[f3],     %[f2],       %[f2]          \n\t"
+    "add.s      %[f0],     %[f0],       %[f3]          \n\t"
+#endif
+    "c.le.s     %[f0],     %[err_th2]                  \n\t"  // squared magnitude <= threshold^2 ?
+    "nop                                               \n\t"  // NOTE(review): presumably spacing between the FP compare and bc1t - confirm
+    "bc1t       2f                                     \n\t"  // yes: skip the limiting step
+    " nop                                              \n\t"  // branch delay slot
+    "sqrt.s     %[f0],     %[f0]                       \n\t"
+    "add.s      %[f0],     %[f0],       %[fac1]        \n\t"
+    "div.s      %[f0],     %[err_th],   %[f0]          \n\t"  // error_threshold / (magnitude + 1e-10f)
+    "mul.s      %[f1],     %[f1],       %[f0]          \n\t"
+    "mul.s      %[f2],     %[f2],       %[f0]          \n\t"
+   "2:                                                 \n\t"
+    "mul.s      %[f1],     %[f1],       %[mu]          \n\t"
+    "mul.s      %[f2],     %[f2],       %[mu]          \n\t"
+    "swc1       %[f1],     0(%[ef0])                   \n\t"
+    "swc1       %[f2],     0(%[ef1])                   \n\t"
+    "addiu      %[len],    %[len],      -1             \n\t"
+    "addiu      %[xPow],   %[xPow],     4              \n\t"
+    "addiu      %[ef0],    %[ef0],      4              \n\t"
+    "bgtz       %[len],    1b                          \n\t"
+    " addiu     %[ef1],    %[ef1],      4              \n\t"  // branch delay slot
+    ".set       pop                                    \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
+#if !defined(MIPS32_R2_LE)
+      [f3] "=&f" (f3),
+#endif
+      [xPow] "+r" (xPow), [ef0] "+r" (ef0), [ef1] "+r" (ef1),
+      [len] "+r" (len)
+    : [fac1] "f" (fac1), [err_th2] "f" (err_th2), [mu] "f" (mu),
+      [err_th] "f" (error_threshold)
+    : "memory"
+  );
+}
+
+// Points the AEC routine hooks at the MIPS implementations defined in this
+// file. Each assignment is independent of the others.
+void WebRtcAec_InitAec_mips(void) {
+  // Noise generation and suppression.
+  WebRtcAec_ComfortNoise = WebRtcAec_ComfortNoise_mips;
+  WebRtcAec_OverdriveAndSuppress = WebRtcAec_OverdriveAndSuppress_mips;
+
+  // Error scaling and filter update.
+  WebRtcAec_ScaleErrorSignal = WebRtcAec_ScaleErrorSignal_mips;
+  WebRtcAec_FilterAdaptation = WebRtcAec_FilterAdaptation_mips;
+
+  // Filter output.
+  WebRtcAec_FilterFar = WebRtcAec_FilterFar_mips;
+}
+
diff --git a/webrtc/modules/audio_processing/aec/aec_core_neon.c b/webrtc/modules/audio_processing/aec/aec_core_neon.c
new file mode 100644
index 0000000000..9a677aaa67
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_core_neon.c
@@ -0,0 +1,736 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * The core AEC algorithm, neon version of speed-critical functions.
+ *
+ * Based on aec_core_sse2.c.
+ */
+
+#include <arm_neon.h>
+#include <math.h>
+#include <string.h>  // memset
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/modules/audio_processing/aec/aec_common.h"
+#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
+
+enum { kShiftExponentIntoTopMantissa = 8 };
+enum { kFloatExponentShift = 23 };
+
+__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
+  return (aRe * bRe) - (aIm * bIm);  // Re{(aRe + j*aIm) * (bRe + j*bIm)}
+}
+
+__inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
+  return (aRe * bIm) + (aIm * bRe);  // Im{(aRe + j*aIm) * (bRe + j*bIm)}
+}
+
+static void FilterFarNEON(AecCore* aec, float yf[2][PART_LEN1]) {
+  // For every partition, adds the complex product xfBuf * wfBuf into yf.
+  // Index 0 of each buffer holds real parts, index 1 imaginary parts.
+  const int partition_count = aec->num_partitions;
+  int p;
+  for (p = 0; p < partition_count; p++) {
+    const int w_base = p * PART_LEN1;
+    int x_block = p + aec->xfBufBlockPos;
+    int x_base;
+    int k;
+    // xfBuf blocks are addressed modulo the partition count.
+    if (x_block >= partition_count) x_block -= partition_count;
+    x_base = x_block * PART_LEN1;
+
+    // NEON path: four bins per iteration.
+    for (k = 0; k + 3 < PART_LEN1; k += 4) {
+      const float32x4_t x_re = vld1q_f32(&aec->xfBuf[0][x_base + k]);
+      const float32x4_t x_im = vld1q_f32(&aec->xfBuf[1][x_base + k]);
+      const float32x4_t w_re = vld1q_f32(&aec->wfBuf[0][w_base + k]);
+      const float32x4_t w_im = vld1q_f32(&aec->wfBuf[1][w_base + k]);
+      const float32x4_t acc_re = vld1q_f32(&yf[0][k]);
+      const float32x4_t acc_im = vld1q_f32(&yf[1][k]);
+      // re = x_re * w_re - x_im * w_im
+      const float32x4_t prod_re = vmlsq_f32(vmulq_f32(x_re, w_re), x_im, w_im);
+      // im = x_re * w_im + x_im * w_re
+      const float32x4_t prod_im = vmlaq_f32(vmulq_f32(x_re, w_im), x_im, w_re);
+      vst1q_f32(&yf[0][k], vaddq_f32(acc_re, prod_re));
+      vst1q_f32(&yf[1][k], vaddq_f32(acc_im, prod_im));
+    }
+    // Scalar path for the bins the vector loop did not reach.
+    for (; k < PART_LEN1; k++) {
+      yf[0][k] += MulRe(aec->xfBuf[0][x_base + k],
+                        aec->xfBuf[1][x_base + k],
+                        aec->wfBuf[0][w_base + k],
+                        aec->wfBuf[1][w_base + k]);
+      yf[1][k] += MulIm(aec->xfBuf[0][x_base + k],
+                        aec->xfBuf[1][x_base + k],
+                        aec->wfBuf[0][w_base + k],
+                        aec->wfBuf[1][w_base + k]);
+    }
+  }
+}
+
+// ARM64's arm_neon.h already defines vdivq_f32 and vsqrtq_f32.
+#if !defined (WEBRTC_ARCH_ARM64)
+static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b) {
+  // Approximates a / b as a * (1 / b); the reciprocal is refined from the
+  // VRECPE estimate with the Newton-Raphson step given in the ARM
+  // documentation:
+  //     r[n+1] = r[n] * (2 - b * r[n])
+  // vrecpsq_f32(b, r) supplies the (2 - b * r) factor of that step.
+  const float32x4_t r0 = vrecpeq_f32(b);
+  // Exactly two refinement steps: the original authors observed no
+  // precision gain beyond the second one.
+  const float32x4_t r1 = vmulq_f32(vrecpsq_f32(b, r0), r0);
+  const float32x4_t r2 = vmulq_f32(vrecpsq_f32(b, r1), r1);
+
+  // Scale the refined reciprocal by the numerator.
+  return vmulq_f32(a, r2);
+}
+
+static float32x4_t vsqrtq_f32(float32x4_t s) {
+  int i;
+  float32x4_t x = vrsqrteq_f32(s);
+
+  // Approximates sqrt(s) per lane as s * (1 / sqrt(s)).
+  // sqrt(0) needs care: sqrtf(0) is 0, but vrsqrteq_f32(0) returns positive
+  // infinity, so lanes with an infinite estimate are forced to zero below.
+  const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
+  // Lanes whose estimate has the +infinity bit pattern (0x7F800000).
+  const uint32x4_t div_by_zero = vceqq_u32(vec_p_inf, vreinterpretq_u32_f32(x));
+  // Zero out the positive infinity results so that sqrt(0) yields 0.
+  x = vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(div_by_zero),
+                                      vreinterpretq_u32_f32(x)));
+  // from arm documentation
+  // The Newton-Raphson iteration:
+  //     x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2
+  // converges to (1/sqrt(d)) if x0 is the result of VRSQRTE applied to d.
+  //
+  // Note: The precision did not improve after 2 iterations.
+  for (i = 0; i < 2; i++) {
+    x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x);
+  }
+  // sqrt(s) = s * 1/sqrt(s)
+  return vmulq_f32(s, x);
+}
+#endif  // WEBRTC_ARCH_ARM64
+
+static void ScaleErrorSignalNEON(AecCore* aec, float ef[2][PART_LEN1]) {
+  // Per bin: divide ef by (xPow + 1e-10), clamp the magnitude of the result to
+  // the error threshold, then scale by the step size mu. ef is updated in
+  // place; ef[0] / ef[1] are treated as the real / imaginary parts.
+  const float mu = aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
+  const float error_threshold = aec->extended_filter_enabled
+                                    ? kExtendedErrorThreshold
+                                    : aec->normal_error_threshold;
+  const float32x4_t vec_eps = vdupq_n_f32(1e-10f);
+  const float32x4_t vec_mu = vmovq_n_f32(mu);
+  const float32x4_t vec_threshold = vmovq_n_f32(error_threshold);
+  int k;
+
+  // NEON path: four bins per iteration.
+  for (k = 0; k + 3 < PART_LEN1; k += 4) {
+    const float32x4_t raw_re = vld1q_f32(&ef[0][k]);
+    const float32x4_t raw_im = vld1q_f32(&ef[1][k]);
+    const float32x4_t power = vaddq_f32(vld1q_f32(&aec->xPow[k]), vec_eps);
+    // Normalized error.
+    const float32x4_t norm_re = vdivq_f32(raw_re, power);
+    const float32x4_t norm_im = vdivq_f32(raw_im, power);
+    // |norm| = sqrt(re^2 + im^2)
+    const float32x4_t magnitude =
+        vsqrtq_f32(vmlaq_f32(vmulq_f32(norm_re, norm_re), norm_im, norm_im));
+    // Lanes whose magnitude exceeds the threshold are rescaled by
+    // threshold / (|norm| + 1e-10); the other lanes pass through unchanged.
+    const uint32x4_t over_limit = vcgtq_f32(magnitude, vec_threshold);
+    const float32x4_t limiter =
+        vdivq_f32(vec_threshold, vaddq_f32(magnitude, vec_eps));
+    const float32x4_t clamped_re =
+        vbslq_f32(over_limit, vmulq_f32(norm_re, limiter), norm_re);
+    const float32x4_t clamped_im =
+        vbslq_f32(over_limit, vmulq_f32(norm_im, limiter), norm_im);
+    // Step size.
+    vst1q_f32(&ef[0][k], vmulq_f32(clamped_re, vec_mu));
+    vst1q_f32(&ef[1][k], vmulq_f32(clamped_im, vec_mu));
+  }
+  // Scalar path for the bins the vector loop did not reach.
+  for (; k < PART_LEN1; k++) {
+    float magnitude;
+    ef[0][k] /= (aec->xPow[k] + 1e-10f);
+    ef[1][k] /= (aec->xPow[k] + 1e-10f);
+    magnitude = sqrtf(ef[0][k] * ef[0][k] + ef[1][k] * ef[1][k]);
+
+    if (magnitude > error_threshold) {
+      const float limiter = error_threshold / (magnitude + 1e-10f);
+      ef[0][k] *= limiter;
+      ef[1][k] *= limiter;
+    }
+
+    // Step size.
+    ef[0][k] *= mu;
+    ef[1][k] *= mu;
+  }
+}
+
+static void FilterAdaptationNEON(AecCore* aec,  // wfBuf is updated in place
+                                 float* fft,  // work buffer, overwritten
+                                 float ef[2][PART_LEN1]) {  // [0]=re, [1]=im
+  int i;
+  const int num_partitions = aec->num_partitions;
+  for (i = 0; i < num_partitions; i++) {
+    int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
+    int pos = i * PART_LEN1;
+    int j;
+    // xfBuf blocks are addressed modulo num_partitions: wrap xPos if needed.
+    if (i + aec->xfBufBlockPos >= num_partitions) {
+      xPos -= num_partitions * PART_LEN1;
+    }
+
+    // Fill fft with conj(xfBuf) * ef for bins 0..PART_LEN-1, four at a time...
+    for (j = 0; j < PART_LEN; j += 4) {
+      // Load xfBuf and ef.
+      const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]);
+      const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]);
+      const float32x4_t ef_re = vld1q_f32(&ef[0][j]);
+      const float32x4_t ef_im = vld1q_f32(&ef[1][j]);
+      // Calculate the product of conjugate(xfBuf) by ef.
+      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
+      //   im(conjugate(a) * b) = aRe * bIm - aIm * bRe
+      const float32x4_t a = vmulq_f32(xfBuf_re, ef_re);
+      const float32x4_t e = vmlaq_f32(a, xfBuf_im, ef_im);
+      const float32x4_t c = vmulq_f32(xfBuf_re, ef_im);
+      const float32x4_t f = vmlsq_f32(c, xfBuf_im, ef_re);
+      // Interleave real and imaginary parts.
+      const float32x4x2_t g_n_h = vzipq_f32(e, f);
+      // Store: fft[2 * j .. 2 * j + 7] = re0 im0 re1 im1 re2 im2 re3 im3.
+      vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]);
+      vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);
+    }
+    // ... then overwrite fft[1] with the real part of bin PART_LEN's product.
+    fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
+                   -aec->xfBuf[1][xPos + PART_LEN],
+                   ef[0][PART_LEN],
+                   ef[1][PART_LEN]);
+
+    aec_rdft_inverse_128(fft);
+    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);  // clear upper part
+
+    // Scale the first PART_LEN entries of fft by 2 / PART_LEN2.
+    {
+      const float scale = 2.0f / PART_LEN2;
+      const float32x4_t scale_ps = vmovq_n_f32(scale);
+      for (j = 0; j < PART_LEN; j += 4) {
+        const float32x4_t fft_ps = vld1q_f32(&fft[j]);
+        const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps);
+        vst1q_f32(&fft[j], fft_scale);
+      }
+    }
+    aec_rdft_forward_128(fft);
+
+    {
+      const float wt1 = aec->wfBuf[1][pos];  // saved: the loop below adds to it
+      aec->wfBuf[0][pos + PART_LEN] += fft[1];
+      for (j = 0; j < PART_LEN; j += 4) {
+        float32x4_t wtBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]);
+        float32x4_t wtBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]);
+        const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]);
+        const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]);
+        const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4);  // split re / im
+        wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]);
+        wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]);
+
+        vst1q_f32(&aec->wfBuf[0][pos + j], wtBuf_re);
+        vst1q_f32(&aec->wfBuf[1][pos + j], wtBuf_im);
+      }
+      aec->wfBuf[1][pos] = wt1;  // undo the loop's fft[1] addition
+    }
+  }
+}
+
+static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
+  // Per lane: a^b = exp2(b * log2(a))
+  //   exp2(x) and log2(x) are calculated using polynomial approximations.
+  float32x4_t log2_a, b_log2_a, a_exp_b;
+
+  // Calculate log2(x), x = a.
+  {
+    // To calculate log2(x), we decompose x like this:
+    //   x = y * 2^n
+    //     n is an integer
+    //     y is in the [1.0, 2.0) range
+    //
+    //   log2(x) = log2(y) + n
+    //     n       can be evaluated by playing with float representation.
+    //     log2(y) in a small range can be approximated, this code uses an order
+    //             five polynomial approximation. The coefficients have been
+    //             estimated with the Remez algorithm and the resulting
+    //             polynomial has a maximum relative error of 0.00086%.
+
+    // Compute n.
+    //    This is done by masking the exponent, shifting it into the top of
+    //    the mantissa, putting eight into the biased exponent (0x43800000 is
+    //    256.0f, which compensates for the exponent now sitting in the
+    //    fractional part), and finally subtracting 0x43BF8000 (383.0f = 256 +
+    //    127) to remove the implicit leading one and the exponent bias.
+    const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
+    const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
+    const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);
+    const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a),
+                                       vec_float_exponent_mask);
+    const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
+    const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
+    const float32x4_t n =
+        vsubq_f32(vreinterpretq_f32_u32(n_0),
+                  vreinterpretq_f32_u32(vec_implicit_leading_one));
+    // Compute y: keep the mantissa bits of a and force the exponent of 1.0f.
+    const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
+    const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
+    const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a),
+                                          vec_mantissa_mask);
+    const float32x4_t y =
+        vreinterpretq_f32_u32(vorrq_u32(mantissa,
+                                        vec_zero_biased_exponent_is_one));
+    // Approximate log2(y) ~= (y - 1) * pol5(y).
+    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
+    const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
+    const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
+    const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
+    const float32x4_t C2 = vdupq_n_f32(2.5988452f);
+    const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
+    const float32x4_t C0 = vdupq_n_f32(3.1157899f);
+    float32x4_t pol5_y = C5;  // Horner evaluation, highest order first.
+    pol5_y = vmlaq_f32(C4, y, pol5_y);
+    pol5_y = vmlaq_f32(C3, y, pol5_y);
+    pol5_y = vmlaq_f32(C2, y, pol5_y);
+    pol5_y = vmlaq_f32(C1, y, pol5_y);
+    pol5_y = vmlaq_f32(C0, y, pol5_y);
+    const float32x4_t y_minus_one =
+        vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
+    const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);
+
+    // Combine parts.
+    log2_a = vaddq_f32(n, log2_y);
+  }
+
+  // b * log2(a)
+  b_log2_a = vmulq_f32(b, log2_a);
+
+  // Calculate exp2(x), x = b * log2(a).
+  {
+    // To calculate 2^x, we decompose x like this:
+    //   x = n + y
+    //     n is an integer, the value of x - 0.5 rounded down, therefore
+    //     y is in the [0.5, 1.5) range
+    //
+    //   2^x = 2^n * 2^y
+    //     2^n can be evaluated by playing with float representation.
+    //     2^y in a small range can be approximated, this code uses an order two
+    //         polynomial approximation. The coefficients have been estimated
+    //         with the Remez algorithm and the resulting polynomial has a
+    //         maximum relative error of 0.17%.
+    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
+    const float32x4_t max_input = vdupq_n_f32(129.f);
+    const float32x4_t min_input = vdupq_n_f32(-126.99999f);
+    const float32x4_t x_min = vminq_f32(b_log2_a, max_input);  // upper clamp
+    const float32x4_t x_max = vmaxq_f32(x_min, min_input);  // fully clamped x
+    // Compute n. NOTE(review): vcvtq_s32_f32 rounds toward zero, not down.
+    const float32x4_t half = vdupq_n_f32(0.5f);
+    const float32x4_t x_minus_half = vsubq_f32(x_max, half);
+    const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);
+
+    // Compute 2^n: write n + 127 into the exponent field of a float.
+    const int32x4_t float_exponent_bias = vdupq_n_s32(127);
+    const int32x4_t two_n_exponent =
+        vaddq_s32(x_minus_half_floor, float_exponent_bias);
+    const float32x4_t two_n =
+        vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));
+    // Compute y.
+    const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));
+
+    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
+    const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
+    const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
+    const float32x4_t C0 = vdupq_n_f32(1.0017247f);
+    float32x4_t exp2_y = C2;  // Horner evaluation, highest order first.
+    exp2_y = vmlaq_f32(C1, y, exp2_y);
+    exp2_y = vmlaq_f32(C0, y, exp2_y);
+
+    // Combine parts.
+    a_exp_b = vmulq_f32(exp2_y, two_n);
+  }
+
+  return a_exp_b;
+}
+
+static void OverdriveAndSuppressNEON(AecCore* aec,
+                                     float hNl[PART_LEN1],
+                                     const float hNlFb,
+                                     float efw[2][PART_LEN1]) {
+  // Post-processes the per-bin values hNl and applies them to efw.
+  // For every bin:
+  //   1. where hNl exceeds hNlFb it is replaced by the blend
+  //      weightCurve * hNlFb + (1 - weightCurve) * hNl;
+  //   2. hNl is raised to the power aec->overDriveSm * overDriveCurve;
+  //   3. efw is multiplied by the new hNl and efw[1] changes sign.
+  // hNl and efw are both updated in place. The vector loop evaluates the
+  // power with vpowq_f32, the scalar tail with powf.
+  //   aec   - only aec->overDriveSm is read.
+  //   hNl   - PART_LEN1 per-bin values (in/out).
+  //   hNlFb - scalar that hNl is compared against and blended toward.
+  //   efw   - efw[0] real parts, efw[1] imaginary parts (in/out).
+  const float32x4_t vec_feedback = vmovq_n_f32(hNlFb);
+  const float32x4_t vec_unity = vdupq_n_f32(1.0f);
+  const float32x4_t vec_negate = vdupq_n_f32(-1.0f);
+  const float32x4_t vec_overdrive = vmovq_n_f32(aec->overDriveSm);
+  int k;
+
+  // NEON path: four bins per iteration.
+  for (k = 0; k + 3 < PART_LEN1; k += 4) {
+    // Weight subbands:
+    //   hNl > hNlFb ? weight * hNlFb + (1 - weight) * hNl : hNl
+    const float32x4_t gain_in = vld1q_f32(&hNl[k]);
+    const float32x4_t weight = vld1q_f32(&WebRtcAec_weightCurve[k]);
+    const uint32x4_t above_feedback = vcgtq_f32(gain_in, vec_feedback);
+    const float32x4_t feedback_term = vmulq_f32(weight, vec_feedback);
+    const float32x4_t one_minus_weight = vsubq_f32(vec_unity, weight);
+    const float32x4_t gain_term = vmulq_f32(one_minus_weight, gain_in);
+    const float32x4_t blended = vaddq_f32(feedback_term, gain_term);
+    // Take the blended value only in the lanes flagged by the comparison.
+    const float32x4_t weighted = vbslq_f32(above_feedback, blended, gain_in);
+
+    // Overdrive: gain = weighted ^ (overDriveSm * overDriveCurve).
+    const float32x4_t curve = vld1q_f32(&WebRtcAec_overDriveCurve[k]);
+    const float32x4_t exponent = vmulq_f32(vec_overdrive, curve);
+    const float32x4_t gain = vpowq_f32(weighted, exponent);
+    // Write the new hNl back before efw is touched.
+    vst1q_f32(&hNl[k], gain);
+
+    // Suppress error signal.
+    const float32x4_t out_re = vmulq_f32(vld1q_f32(&efw[0][k]), gain);
+    const float32x4_t scaled_im = vmulq_f32(vld1q_f32(&efw[1][k]), gain);
+
+    // Ooura fft returns incorrect sign on imaginary component. It matters
+    // here because we are making an additive change with comfort noise.
+    const float32x4_t out_im = vmulq_f32(scaled_im, vec_negate);
+    vst1q_f32(&efw[0][k], out_re);
+    vst1q_f32(&efw[1][k], out_im);
+  }
+
+  // Scalar path for the bins the vector loop did not reach.
+  for (; k < PART_LEN1; k++) {
+    // Weight subbands
+    if (hNl[k] > hNlFb) {
+      hNl[k] = WebRtcAec_weightCurve[k] * hNlFb +
+               (1 - WebRtcAec_weightCurve[k]) * hNl[k];
+    }
+
+    hNl[k] = powf(hNl[k], aec->overDriveSm * WebRtcAec_overDriveCurve[k]);
+
+    // Suppress error signal
+    efw[0][k] *= hNl[k];
+    efw[1][k] *= hNl[k];
+
+    // Ooura fft returns incorrect sign on imaginary component. It matters
+    // here because we are making an additive change with comfort noise.
+    efw[1][k] *= -1;
+  }
+}
+
+static int PartitionDelay(const AecCore* aec) {
+  // Returns the index of the wfBuf partition with the largest energy, i.e.
+  // the largest sum of re^2 + im^2 over its PART_LEN1 bins. Ties keep the
+  // lowest index; 0 is returned when no partition has positive energy.
+  // TODO(bjornv): Spread computational cost by computing one partition per
+  // block?
+  int best_partition = 0;
+  float best_energy = 0;
+  int p;
+
+  for (p = 0; p < aec->num_partitions; p++) {
+    const int base = p * PART_LEN1;
+    float32x4_t lane_energy = vdupq_n_f32(0.0f);
+    float32x2_t pair_energy;
+    float energy;
+    int k;
+    // NEON path: four bins per iteration, one partial sum per lane.
+    for (k = 0; k + 3 < PART_LEN1; k += 4) {
+      const float32x4_t re = vld1q_f32(&aec->wfBuf[0][base + k]);
+      const float32x4_t im = vld1q_f32(&aec->wfBuf[1][base + k]);
+      lane_energy = vmlaq_f32(lane_energy, re, re);
+      lane_energy = vmlaq_f32(lane_energy, im, im);
+    }
+    // Horizontal add of the four lanes.
+    // (A, B, C, D) -> (A+B, C+D)
+    pair_energy =
+        vpadd_f32(vget_low_f32(lane_energy), vget_high_f32(lane_energy));
+    // (A+B, C+D) -> (A+B+C+D, A+B+C+D)
+    pair_energy = vpadd_f32(pair_energy, pair_energy);
+    energy = vget_lane_f32(pair_energy, 0);
+
+    // Scalar path for the bins the vector loop did not reach.
+    for (; k < PART_LEN1; k++) {
+      energy += aec->wfBuf[0][base + k] * aec->wfBuf[0][base + k] +
+                aec->wfBuf[1][base + k] * aec->wfBuf[1][base + k];
+    }
+
+    if (energy > best_energy) {
+      best_energy = energy;
+      best_partition = p;
+    }
+  }
+  return best_partition;
+}
+
+// Updates the following smoothed Power Spectral Densities (PSD):
+//  - sd  : near-end
+//  - se  : residual echo
+//  - sx  : far-end
+//  - sde : cross-PSD of near-end and residual echo
+//  - sxd : cross-PSD of near-end and far-end
+//
+// In addition to updating the PSDs, the filter divergence state is also
+// determined, and actions are taken based on it.
+static void SmoothedPSD(AecCore* aec,
+                        float efw[2][PART_LEN1],  // may be replaced by dfw
+                        float dfw[2][PART_LEN1],
+                        float xfw[2][PART_LEN1]) {
+  // Smoothing coefficients: ptrGCoh[0] scales the old value, [1] the new one.
+  const float* ptrGCoh = aec->extended_filter_enabled
+      ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
+      : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
+  int i;
+  float sdSum = 0, seSum = 0;
+  const float32x4_t vec_15 =  vdupq_n_f32(WebRtcAec_kMinFarendPSD);
+  float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
+  float32x4_t vec_seSum = vdupq_n_f32(0.0f);
+
+  for (i = 0; i + 3 < PART_LEN1; i += 4) {  // vectorized code (four at once)
+    const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
+    const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
+    const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
+    const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
+    const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
+    const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
+    float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
+    float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
+    float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
+    float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
+    float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
+    float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);
+
+    vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
+    vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
+    vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
+    vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);  // floor: kMinFarendPSD
+    vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
+    vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
+    vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);
+
+    vst1q_f32(&aec->sd[i], vec_sd);
+    vst1q_f32(&aec->se[i], vec_se);
+    vst1q_f32(&aec->sx[i], vec_sx);
+
+    {
+      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);  // de-interleaved
+      float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
+      float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
+      vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
+      vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
+      vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
+      vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
+      vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
+      vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
+      vst2q_f32(&aec->sde[i][0], vec_sde);
+    }
+
+    {
+      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);  // de-interleaved
+      float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
+      float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
+      vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
+      vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
+      vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
+      vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
+      vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
+      vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
+      vst2q_f32(&aec->sxd[i][0], vec_sxd);
+    }
+
+    vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
+    vec_seSum = vaddq_f32(vec_seSum, vec_se);
+  }
+  {
+    float32x2_t vec_sdSum_total;
+    float32x2_t vec_seSum_total;
+    // A B C D
+    vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum),
+                                vget_high_f32(vec_sdSum));
+    vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum),
+                                vget_high_f32(vec_seSum));
+    // A+B C+D
+    vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
+    vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
+    // A+B+C+D A+B+C+D
+    sdSum = vget_lane_f32(vec_sdSum_total, 0);
+    seSum = vget_lane_f32(vec_seSum_total, 0);
+  }
+
+  // scalar code for the remaining items.
+  for (; i < PART_LEN1; i++) {
+    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
+                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
+    aec->se[i] = ptrGCoh[0] * aec->se[i] +
+                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
+    // We threshold here to protect against the ill-effects of a zero farend.
+    // The threshold is not arbitrarily chosen, but balances protection and
+    // adverse interaction with the algorithm's tuning.
+    // TODO(bjornv): investigate further why this is so sensitive.
+    aec->sx[i] =
+        ptrGCoh[0] * aec->sx[i] +
+        ptrGCoh[1] * WEBRTC_SPL_MAX(
+            xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
+            WebRtcAec_kMinFarendPSD);
+
+    aec->sde[i][0] =
+        ptrGCoh[0] * aec->sde[i][0] +
+        ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
+    aec->sde[i][1] =
+        ptrGCoh[0] * aec->sde[i][1] +
+        ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
+
+    aec->sxd[i][0] =
+        ptrGCoh[0] * aec->sxd[i][0] +
+        ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
+    aec->sxd[i][1] =
+        ptrGCoh[0] * aec->sxd[i][1] +
+        ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);
+
+    sdSum += aec->sd[i];
+    seSum += aec->se[i];
+  }
+
+  // Divergent filter safeguard (the 1.05 factor applies once already set).
+  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;
+
+  if (aec->divergeState)
+    memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1);  // efw := dfw
+
+  // Reset if error is significantly larger than nearend (13 dB).
+  if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum))
+    memset(aec->wfBuf, 0, sizeof(aec->wfBuf));  // clears all of wfBuf
+}
+
+// Window time domain data to be used by the fft.
+// x holds 2 * PART_LEN samples. The first PART_LEN are multiplied by
+// WebRtcAec_sqrtHanning[0 .. PART_LEN - 1], the last PART_LEN by
+// WebRtcAec_sqrtHanning[PART_LEN .. 1] (the window read backwards); the
+// products are written to x_windowed.
+__inline static void WindowData(float* x_windowed, const float* x) {
+  int k;
+  for (k = 0; k < PART_LEN; k += 4) {
+    const float32x4_t first_half = vld1q_f32(&x[k]);
+    const float32x4_t second_half = vld1q_f32(&x[PART_LEN + k]);
+    const float32x4_t window = vld1q_f32(&WebRtcAec_sqrtHanning[k]);
+    // Load (A B C D), swap within pairs (B A D C), swap the pairs (D C B A).
+    const float32x4_t tail =
+        vrev64q_f32(vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - k - 3]));
+    const float32x4_t window_rev =
+        vcombine_f32(vget_high_f32(tail), vget_low_f32(tail));
+    vst1q_f32(&x_windowed[k], vmulq_f32(first_half, window));
+    vst1q_f32(&x_windowed[PART_LEN + k], vmulq_f32(second_half, window_rev));
+  }
+}
+
+// De-interleaves fft output data into a complex valued array:
+// data_complex[0] receives the even entries, data_complex[1] the odd ones.
+__inline static void StoreAsComplex(const float* data,
+                                    float data_complex[2][PART_LEN1]) {
+  int k;
+  for (k = 0; k < PART_LEN; k += 4) {
+    const float32x4x2_t pairs = vld2q_f32(&data[2 * k]);
+    vst1q_f32(&data_complex[0][k], pairs.val[0]);
+    vst1q_f32(&data_complex[1][k], pairs.val[1]);
+  }
+  // Patch the end bins: data[1] is stored as row 0 of bin PART_LEN.
+  data_complex[0][0] = data[0];
+  data_complex[0][PART_LEN] = data[1];
+  data_complex[1][0] = data_complex[1][PART_LEN] = 0;
+}
+
+static void SubbandCoherenceNEON(AecCore* aec,
+                                 float efw[2][PART_LEN1],  // out: overwritten
+                                 float xfw[2][PART_LEN1],  // out: overwritten
+                                 float* fft,  // work buffer, overwritten
+                                 float* cohde,  // out: PART_LEN1 values
+                                 float* cohxd) {  // out: PART_LEN1 values
+  float dfw[2][PART_LEN1];
+  int i;
+
+  if (aec->delayEstCtr == 0)
+    aec->delayIdx = PartitionDelay(aec);
+
+  // Use delayed far: copy the xfwBuf block selected by delayIdx into xfw.
+  memcpy(xfw,
+         aec->xfwBuf + aec->delayIdx * PART_LEN1,
+         sizeof(xfw[0][0]) * 2 * PART_LEN1);
+
+  // Windowed near fft (dBuf -> dfw)
+  WindowData(fft, aec->dBuf);
+  aec_rdft_forward_128(fft);
+  StoreAsComplex(fft, dfw);
+
+  // Windowed error fft (eBuf -> efw)
+  WindowData(fft, aec->eBuf);
+  aec_rdft_forward_128(fft);
+  StoreAsComplex(fft, efw);
+
+  SmoothedPSD(aec, efw, dfw, xfw);
+
+  {
+    const float32x4_t vec_1eminus10 =  vdupq_n_f32(1e-10f);
+
+    // Subband coherence: |cross-PSD|^2 / (product of the two PSDs + 1e-10).
+    for (i = 0; i + 3 < PART_LEN1; i += 4) {
+      const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]);
+      const float32x4_t vec_se = vld1q_f32(&aec->se[i]);
+      const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]);
+      const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se);
+      const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx);
+      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
+      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
+      float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]);
+      float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]);
+      vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]);
+      vec_cohde = vdivq_f32(vec_cohde, vec_sdse);
+      vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]);
+      vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx);
+
+      vst1q_f32(&cohde[i], vec_cohde);
+      vst1q_f32(&cohxd[i], vec_cohxd);
+    }
+  }
+  // scalar code for the remaining items.
+  for (; i < PART_LEN1; i++) {
+    cohde[i] =
+        (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
+        (aec->sd[i] * aec->se[i] + 1e-10f);
+    cohxd[i] =
+        (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
+        (aec->sx[i] * aec->sd[i] + 1e-10f);
+  }
+}
+
+void WebRtcAec_InitAec_neon(void) {  // Selects the NEON AEC routines.
+  WebRtcAec_SubbandCoherence = SubbandCoherenceNEON;
+  WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON;
+  WebRtcAec_FilterAdaptation = FilterAdaptationNEON;
+  WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON;
+  WebRtcAec_FilterFar = FilterFarNEON;
+}
+
diff --git a/webrtc/modules/audio_processing/aec/aec_core_sse2.c b/webrtc/modules/audio_processing/aec/aec_core_sse2.c
new file mode 100644
index 0000000000..b1bffcbb9f
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_core_sse2.c
@@ -0,0 +1,731 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * The core AEC algorithm, SSE2 version of speed-critical functions.
+ */
+
+#include <emmintrin.h>
+#include <math.h>
+#include <string.h>  // memset
+
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/modules/audio_processing/aec/aec_common.h"
+#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
+
+// Real part of the complex product (aRe + j*aIm) * (bRe + j*bIm).
+__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
+  const float product_re = aRe * bRe - aIm * bIm;
+  return product_re;
+}
+
+// Imaginary part of the complex product (aRe + j*aIm) * (bRe + j*bIm).
+__inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
+  const float product_im = aRe * bIm + aIm * bRe;
+  return product_im;
+}
+
+// Accumulates into yf, for every filter partition, the per-bin complex
+// product of the far-end spectrum in aec->xfBuf and the filter coefficients
+// in aec->wfBuf. Index 0 of each buffer holds real parts and index 1 holds
+// imaginary parts.
+static void FilterFarSSE2(AecCore* aec, float yf[2][PART_LEN1]) {
+  const int num_partitions = aec->num_partitions;
+  int part;
+  for (part = 0; part < num_partitions; part++) {
+    const int w_off = part * PART_LEN1;
+    int x_block = part + aec->xfBufBlockPos;
+    int x_off;
+    int k;
+    // The block index into xfBuf wraps around after num_partitions blocks.
+    if (x_block >= num_partitions) {
+      x_block -= num_partitions;
+    }
+    x_off = x_block * PART_LEN1;
+
+    // Four bins per iteration.
+    for (k = 0; k + 3 < PART_LEN1; k += 4) {
+      const __m128 x_re = _mm_loadu_ps(&aec->xfBuf[0][x_off + k]);
+      const __m128 x_im = _mm_loadu_ps(&aec->xfBuf[1][x_off + k]);
+      const __m128 w_re = _mm_loadu_ps(&aec->wfBuf[0][w_off + k]);
+      const __m128 w_im = _mm_loadu_ps(&aec->wfBuf[1][w_off + k]);
+      const __m128 acc_re = _mm_loadu_ps(&yf[0][k]);
+      const __m128 acc_im = _mm_loadu_ps(&yf[1][k]);
+      // (x_re + j*x_im) * (w_re + j*w_im)
+      const __m128 prod_re =
+          _mm_sub_ps(_mm_mul_ps(x_re, w_re), _mm_mul_ps(x_im, w_im));
+      const __m128 prod_im =
+          _mm_add_ps(_mm_mul_ps(x_re, w_im), _mm_mul_ps(x_im, w_re));
+      _mm_storeu_ps(&yf[0][k], _mm_add_ps(acc_re, prod_re));
+      _mm_storeu_ps(&yf[1][k], _mm_add_ps(acc_im, prod_im));
+    }
+    // Bins that do not fill a whole vector are handled one at a time.
+    for (; k < PART_LEN1; k++) {
+      yf[0][k] += MulRe(aec->xfBuf[0][x_off + k],
+                        aec->xfBuf[1][x_off + k],
+                        aec->wfBuf[0][w_off + k],
+                        aec->wfBuf[1][w_off + k]);
+      yf[1][k] += MulIm(aec->xfBuf[0][x_off + k],
+                        aec->xfBuf[1][x_off + k],
+                        aec->wfBuf[0][w_off + k],
+                        aec->wfBuf[1][w_off + k]);
+    }
+  }
+}
+
+// Scales the error spectrum ef in place. For every bin:
+//   1. divide by (aec->xPow[i] + 1e-10),
+//   2. if the resulting magnitude exceeds the error threshold, rescale the
+//      bin by threshold / (magnitude + 1e-10),
+//   3. multiply by the step size mu.
+// ef[0] holds real parts and ef[1] imaginary parts. When the extended filter
+// is enabled, kExtendedMu and kExtendedErrorThreshold are used; otherwise
+// aec->normal_mu and aec->normal_error_threshold.
+static void ScaleErrorSignalSSE2(AecCore* aec, float ef[2][PART_LEN1]) {
+  const __m128 k1e_10f = _mm_set1_ps(1e-10f);
+  const __m128 kMu = aec->extended_filter_enabled ? _mm_set1_ps(kExtendedMu)
+                                                  : _mm_set1_ps(aec->normal_mu);
+  const __m128 kThresh = aec->extended_filter_enabled
+                             ? _mm_set1_ps(kExtendedErrorThreshold)
+                             : _mm_set1_ps(aec->normal_error_threshold);
+
+  int i;
+  // vectorized code (four at once)
+  for (i = 0; i + 3 < PART_LEN1; i += 4) {
+    const __m128 xPow = _mm_loadu_ps(&aec->xPow[i]);
+    const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]);
+    const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]);
+
+    // The small constant keeps the divisor non-zero when xPow is zero.
+    const __m128 xPowPlus = _mm_add_ps(xPow, k1e_10f);
+    __m128 ef_re = _mm_div_ps(ef_re_base, xPowPlus);
+    __m128 ef_im = _mm_div_ps(ef_im_base, xPowPlus);
+    // Magnitude of each normalized bin.
+    const __m128 ef_re2 = _mm_mul_ps(ef_re, ef_re);
+    const __m128 ef_im2 = _mm_mul_ps(ef_im, ef_im);
+    const __m128 ef_sum2 = _mm_add_ps(ef_re2, ef_im2);
+    const __m128 absEf = _mm_sqrt_ps(ef_sum2);
+    // Per-lane mask: all ones where the magnitude exceeds the threshold.
+    const __m128 bigger = _mm_cmpgt_ps(absEf, kThresh);
+    // Rescaled candidate: ef * threshold / (|ef| + 1e-10).
+    __m128 absEfPlus = _mm_add_ps(absEf, k1e_10f);
+    const __m128 absEfInv = _mm_div_ps(kThresh, absEfPlus);
+    __m128 ef_re_if = _mm_mul_ps(ef_re, absEfInv);
+    __m128 ef_im_if = _mm_mul_ps(ef_im, absEfInv);
+    // Branchless select: take the rescaled value where the mask is set and
+    // the normalized value everywhere else.
+    ef_re_if = _mm_and_ps(bigger, ef_re_if);
+    ef_im_if = _mm_and_ps(bigger, ef_im_if);
+    ef_re = _mm_andnot_ps(bigger, ef_re);
+    ef_im = _mm_andnot_ps(bigger, ef_im);
+    ef_re = _mm_or_ps(ef_re, ef_re_if);
+    ef_im = _mm_or_ps(ef_im, ef_im_if);
+    // Stepsize factor
+    ef_re = _mm_mul_ps(ef_re, kMu);
+    ef_im = _mm_mul_ps(ef_im, kMu);
+
+    _mm_storeu_ps(&ef[0][i], ef_re);
+    _mm_storeu_ps(&ef[1][i], ef_im);
+  }
+  // scalar code for the remaining items.
+  {
+    // Scalar copies of the same step size and threshold used above.
+    const float mu =
+        aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
+    const float error_threshold = aec->extended_filter_enabled
+                                      ? kExtendedErrorThreshold
+                                      : aec->normal_error_threshold;
+    for (; i < (PART_LEN1); i++) {
+      float abs_ef;
+      ef[0][i] /= (aec->xPow[i] + 1e-10f);
+      ef[1][i] /= (aec->xPow[i] + 1e-10f);
+      abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
+
+      if (abs_ef > error_threshold) {
+        // abs_ef is reused here to hold the rescaling factor.
+        abs_ef = error_threshold / (abs_ef + 1e-10f);
+        ef[0][i] *= abs_ef;
+        ef[1][i] *= abs_ef;
+      }
+
+      // Stepsize factor
+      ef[0][i] *= mu;
+      ef[1][i] *= mu;
+    }
+  }
+}
+
+// Updates every filter partition in aec->wfBuf from the scaled error
+// spectrum ef. For each partition:
+//   1. conjugate(xfBuf) * ef is written to fft as interleaved re/im pairs,
+//      with fft[1] carrying the real part of bin PART_LEN,
+//   2. aec_rdft_inverse_128() is applied, the second PART_LEN floats are
+//      zeroed and the first PART_LEN floats are scaled by 2 / PART_LEN2,
+//   3. aec_rdft_forward_128() is applied and the result is added to wfBuf.
+// fft is scratch space; indices 0 .. 2 * PART_LEN - 1 are written.
+static void FilterAdaptationSSE2(AecCore* aec,
+                                 float* fft,
+                                 float ef[2][PART_LEN1]) {
+  int i, j;
+  const int num_partitions = aec->num_partitions;
+  for (i = 0; i < num_partitions; i++) {
+    int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1);
+    int pos = i * PART_LEN1;
+    // Check for wrap
+    if (i + aec->xfBufBlockPos >= num_partitions) {
+      xPos -= num_partitions * PART_LEN1;
+    }
+
+    // Process the whole array...
+    for (j = 0; j < PART_LEN; j += 4) {
+      // Load xfBuf and ef.
+      const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]);
+      const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]);
+      const __m128 ef_re = _mm_loadu_ps(&ef[0][j]);
+      const __m128 ef_im = _mm_loadu_ps(&ef[1][j]);
+      // Calculate the product of conjugate(xfBuf) by ef.
+      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
+      //   im(conjugate(a) * b) = aRe * bIm - aIm * bRe
+      const __m128 a = _mm_mul_ps(xfBuf_re, ef_re);
+      const __m128 b = _mm_mul_ps(xfBuf_im, ef_im);
+      const __m128 c = _mm_mul_ps(xfBuf_re, ef_im);
+      const __m128 d = _mm_mul_ps(xfBuf_im, ef_re);
+      const __m128 e = _mm_add_ps(a, b);
+      const __m128 f = _mm_sub_ps(c, d);
+      // Interleave real and imaginary parts.
+      const __m128 g = _mm_unpacklo_ps(e, f);
+      const __m128 h = _mm_unpackhi_ps(e, f);
+      // Store
+      _mm_storeu_ps(&fft[2 * j + 0], g);
+      _mm_storeu_ps(&fft[2 * j + 4], h);
+    }
+    // ... and fixup the first imaginary entry.
+    // The loop above stored the imaginary part of bin 0 in fft[1]; it is
+    // overwritten with the real part of conjugate(xfBuf) * ef at bin PART_LEN.
+    fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
+                   -aec->xfBuf[1][xPos + PART_LEN],
+                   ef[0][PART_LEN],
+                   ef[1][PART_LEN]);
+
+    aec_rdft_inverse_128(fft);
+    // Zero the second PART_LEN floats of the buffer before transforming back.
+    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
+
+    // fft scaling
+    {
+      float scale = 2.0f / PART_LEN2;
+      const __m128 scale_ps = _mm_load_ps1(&scale);
+      // Only the first PART_LEN floats need scaling; the rest are zero.
+      for (j = 0; j < PART_LEN; j += 4) {
+        const __m128 fft_ps = _mm_loadu_ps(&fft[j]);
+        const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps);
+        _mm_storeu_ps(&fft[j], fft_scale);
+      }
+    }
+    aec_rdft_forward_128(fft);
+
+    {
+      // The vector loop below adds fft[1] to wfBuf[1][pos] as though it were
+      // the imaginary part of bin 0. fft[1] is instead added to the real part
+      // of bin PART_LEN on the next line, so wfBuf[1][pos] is saved here and
+      // restored after the loop.
+      float wt1 = aec->wfBuf[1][pos];
+      aec->wfBuf[0][pos + PART_LEN] += fft[1];
+      for (j = 0; j < PART_LEN; j += 4) {
+        __m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
+        __m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
+        const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]);
+        const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]);
+        // De-interleave: even lanes are real parts, odd lanes imaginary.
+        const __m128 fft_re =
+            _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0));
+        const __m128 fft_im =
+            _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1));
+        wtBuf_re = _mm_add_ps(wtBuf_re, fft_re);
+        wtBuf_im = _mm_add_ps(wtBuf_im, fft_im);
+        _mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re);
+        _mm_storeu_ps(&aec->wfBuf[1][pos + j], wtBuf_im);
+      }
+      aec->wfBuf[1][pos] = wt1;
+    }
+  }
+}
+
+// Approximates a^b for four packed floats as exp2(b * log2(a)).
+// The sign bit of a is discarded by the exponent and mantissa masks below, and
+// b * log2(a) is clamped to [-126.99999, 129] before exponentiation.
+// NOTE(review): no special handling of zero, denormal, infinite or NaN inputs
+// is visible here; confirm that callers only pass values where this matters
+// little.
+static __m128 mm_pow_ps(__m128 a, __m128 b) {
+  // a^b = exp2(b * log2(a))
+  //   exp2(x) and log2(x) are calculated using polynomial approximations.
+  __m128 log2_a, b_log2_a, a_exp_b;
+
+  // Calculate log2(x), x = a.
+  {
+    // To calculate log2(x), we decompose x like this:
+    //   x = y * 2^n
+    //     n is an integer
+    //     y is in the [1.0, 2.0) range
+    //
+    //   log2(x) = log2(y) + n
+    //     n       can be evaluated by playing with float representation.
+    //     log2(y) in a small range can be approximated, this code uses an order
+    //             five polynomial approximation. The coefficients have been
+    //             estimated with the Remez algorithm and the resulting
+    //             polynomial has a maximum relative error of 0.00086%.
+
+    // Compute n.
+    //    This is done by masking the exponent, shifting it into the top bit of
+    //    the mantissa, putting eight into the biased exponent (to shift/
+    //    compensate the fact that the exponent has been shifted in the top/
+    //    fractional part) and finally getting rid of the implicit leading one
+    //    from the mantissa by subtracting it out.
+    static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = {
+        0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
+    static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = {
+        0x43800000, 0x43800000, 0x43800000, 0x43800000};
+    static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = {
+        0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
+    static const int shift_exponent_into_top_mantissa = 8;
+    const __m128 two_n = _mm_and_ps(a, *((__m128*)float_exponent_mask));
+    const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(
+        _mm_castps_si128(two_n), shift_exponent_into_top_mantissa));
+    const __m128 n_0 = _mm_or_ps(n_1, *((__m128*)eight_biased_exponent));
+    const __m128 n = _mm_sub_ps(n_0, *((__m128*)implicit_leading_one));
+
+    // Compute y.
+    //    Keep the mantissa bits of a and force the exponent field to that of
+    //    1.0f (0x3F800000).
+    static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = {
+        0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
+    static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = {
+        0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
+    const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask));
+    const __m128 y =
+        _mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one));
+
+    // Approximate log2(y) ~= (y - 1) * pol5(y).
+    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
+    //    The polynomial is evaluated with Horner's scheme below.
+    static const ALIGN16_BEG float ALIGN16_END C5[4] = {
+        -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
+    static const ALIGN16_BEG float ALIGN16_END
+        C4[4] = {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
+    static const ALIGN16_BEG float ALIGN16_END
+        C3[4] = {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
+    static const ALIGN16_BEG float ALIGN16_END
+        C2[4] = {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
+    static const ALIGN16_BEG float ALIGN16_END
+        C1[4] = {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
+    static const ALIGN16_BEG float ALIGN16_END
+        C0[4] = {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
+    const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5));
+    const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4));
+    const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
+    const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3));
+    const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
+    const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2));
+    const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
+    const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1));
+    const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
+    const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0));
+    const __m128 y_minus_one =
+        _mm_sub_ps(y, *((__m128*)zero_biased_exponent_is_one));
+    const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y);
+
+    // Combine parts.
+    log2_a = _mm_add_ps(n, log2_y);
+  }
+
+  // b * log2(a)
+  b_log2_a = _mm_mul_ps(b, log2_a);
+
+  // Calculate exp2(x), x = b * log2(a).
+  {
+    // To calculate 2^x, we decompose x like this:
+    //   x = n + y
+    //     n is an integer, the value of x - 0.5 rounded down, therefore
+    //     y is in the [0.5, 1.5) range
+    //
+    //   2^x = 2^n * 2^y
+    //     2^n can be evaluated by playing with float representation.
+    //     2^y in a small range can be approximated, this code uses an order two
+    //         polynomial approximation. The coefficients have been estimated
+    //         with the Remez algorithm and the resulting polynomial has a
+    //         maximum relative error of 0.17%.
+    //
+    // NOTE(review): _mm_cvtps_epi32 below converts using the current MXCSR
+    // rounding mode, which is round-to-nearest unless the caller changed it.
+    // In that mode n = round(x - 0.5) and y falls in [0, 1] rather than
+    // [0.5, 1.5) -- confirm which range the coefficients were fitted for.
+
+    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
+    static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f,
+                                                               129.f, 129.f};
+    static const ALIGN16_BEG float min_input[4] ALIGN16_END = {
+        -126.99999f, -126.99999f, -126.99999f, -126.99999f};
+    // x_min is the input clamped from above; x_max is x_min additionally
+    // clamped from below, i.e. the fully clamped input used from here on.
+    const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input));
+    const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input));
+    // Compute n.
+    static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f,
+                                                          0.5f, 0.5f};
+    const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half));
+    const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half);
+    // Compute 2^n.
+    //    Add the exponent bias (127) to n and shift it into the exponent
+    //    field (bits 23 and up) of a float with a zero mantissa.
+    static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = {
+        127, 127, 127, 127};
+    static const int float_exponent_shift = 23;
+    const __m128i two_n_exponent =
+        _mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias));
+    const __m128 two_n =
+        _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift));
+    // Compute y.
+    const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
+    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
+    static const ALIGN16_BEG float C2[4] ALIGN16_END = {
+        3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
+    static const ALIGN16_BEG float C1[4] ALIGN16_END = {
+        6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
+    static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f,
+                                                        1.0017247f, 1.0017247f};
+    const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128*)C2));
+    const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128*)C1));
+    const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
+    const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128*)C0));
+
+    // Combine parts.
+    a_exp_b = _mm_mul_ps(exp2_y, two_n);
+  }
+  return a_exp_b;
+}
+
+// Shapes the suppression gains hNl and applies them to the error spectrum
+// efw, both in place. For every bin i:
+//   1. if hNl[i] > hNlFb, hNl[i] is replaced by the blend
+//      WebRtcAec_weightCurve[i] * hNlFb +
+//      (1 - WebRtcAec_weightCurve[i]) * hNl[i],
+//   2. hNl[i] is raised to the power
+//      aec->overDriveSm * WebRtcAec_overDriveCurve[i],
+//   3. efw[0][i] and efw[1][i] are multiplied by hNl[i] and the sign of
+//      efw[1][i] is flipped.
+// The vector path uses the approximation mm_pow_ps() for step 2, whereas the
+// scalar tail uses powf().
+static void OverdriveAndSuppressSSE2(AecCore* aec,
+                                     float hNl[PART_LEN1],
+                                     const float hNlFb,
+                                     float efw[2][PART_LEN1]) {
+  int i;
+  const __m128 vec_hNlFb = _mm_set1_ps(hNlFb);
+  const __m128 vec_one = _mm_set1_ps(1.0f);
+  const __m128 vec_minus_one = _mm_set1_ps(-1.0f);
+  const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm);
+  // vectorized code (four at once)
+  for (i = 0; i + 3 < PART_LEN1; i += 4) {
+    // Weight subbands
+    __m128 vec_hNl = _mm_loadu_ps(&hNl[i]);
+    const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]);
+    // Per-lane mask: all ones where hNl > hNlFb.
+    const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb);
+    const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(vec_weightCurve, vec_hNlFb);
+    const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve);
+    const __m128 vec_one_weightCurve_hNl =
+        _mm_mul_ps(vec_one_weightCurve, vec_hNl);
+    // Branchless select: unchanged hNl where the mask is clear (vec_if0) and
+    // the blended value where it is set (vec_if1).
+    const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl);
+    const __m128 vec_if1 = _mm_and_ps(
+        bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl));
+    vec_hNl = _mm_or_ps(vec_if0, vec_if1);
+
+    {
+      // hNl = hNl ^ (overDriveSm * overDriveCurve)
+      const __m128 vec_overDriveCurve =
+          _mm_loadu_ps(&WebRtcAec_overDriveCurve[i]);
+      const __m128 vec_overDriveSm_overDriveCurve =
+          _mm_mul_ps(vec_overDriveSm, vec_overDriveCurve);
+      vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve);
+      _mm_storeu_ps(&hNl[i], vec_hNl);
+    }
+
+    // Suppress error signal
+    {
+      __m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]);
+      __m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]);
+      vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl);
+      vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl);
+
+      // Ooura fft returns incorrect sign on imaginary component. It matters
+      // here because we are making an additive change with comfort noise.
+      vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one);
+      _mm_storeu_ps(&efw[0][i], vec_efw_re);
+      _mm_storeu_ps(&efw[1][i], vec_efw_im);
+    }
+  }
+  // scalar code for the remaining items.
+  for (; i < PART_LEN1; i++) {
+    // Weight subbands
+    if (hNl[i] > hNlFb) {
+      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
+               (1 - WebRtcAec_weightCurve[i]) * hNl[i];
+    }
+    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
+
+    // Suppress error signal
+    efw[0][i] *= hNl[i];
+    efw[1][i] *= hNl[i];
+
+    // Ooura fft returns incorrect sign on imaginary component. It matters
+    // here because we are making an additive change with comfort noise.
+    efw[1][i] *= -1;
+  }
+}
+
+// Writes the horizontal sum of the four lanes of |sum| to *dst, computed as
+// (lane0 + lane2) + (lane1 + lane3).
+__inline static void _mm_add_ps_4x1(__m128 sum, float *dst) {
+  // Lane 0 becomes lane0 + lane2 and lane 1 becomes lane1 + lane3.
+  const __m128 pairs = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+  // Add lane 1 onto lane 0 to obtain the full sum in lane 0.
+  const __m128 total = _mm_add_ss(
+      pairs, _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1)));
+  _mm_store_ss(dst, total);
+}
+// Returns the index of the partition of aec->wfBuf with the largest energy,
+// where the energy is the sum of the squared real and imaginary coefficients
+// of the partition. On ties the lowest index wins, and 0 is returned when no
+// partition has an energy above zero.
+static int PartitionDelay(const AecCore* aec) {
+  // TODO(bjornv): Spread computational cost by computing one partition per
+  // block?
+  int best_partition = 0;
+  float best_energy = 0;
+  int part;
+
+  for (part = 0; part < aec->num_partitions; part++) {
+    const int base = part * PART_LEN1;
+    __m128 lane_energy = _mm_setzero_ps();
+    float energy = 0;
+    int k;
+    // Four bins per iteration, accumulated per lane.
+    for (k = 0; k + 3 < PART_LEN1; k += 4) {
+      const __m128 re = _mm_loadu_ps(&aec->wfBuf[0][base + k]);
+      const __m128 im = _mm_loadu_ps(&aec->wfBuf[1][base + k]);
+      lane_energy = _mm_add_ps(lane_energy, _mm_mul_ps(re, re));
+      lane_energy = _mm_add_ps(lane_energy, _mm_mul_ps(im, im));
+    }
+    // Collapse the four lanes into a single scalar.
+    _mm_add_ps_4x1(lane_energy, &energy);
+
+    // Bins that did not fill a whole vector.
+    for (; k < PART_LEN1; k++) {
+      energy += aec->wfBuf[0][base + k] * aec->wfBuf[0][base + k] +
+                aec->wfBuf[1][base + k] * aec->wfBuf[1][base + k];
+    }
+
+    if (energy > best_energy) {
+      best_energy = energy;
+      best_partition = part;
+    }
+  }
+  return best_partition;
+}
+
+// Updates the following smoothed Power Spectral Densities (PSD):
+//  - sd  : near-end
+//  - se  : residual echo
+//  - sx  : far-end
+//  - sde : cross-PSD of near-end and residual echo
+//  - sxd : cross-PSD of near-end and far-end
+//
+// Each PSD is updated as new = ptrGCoh[0] * old + ptrGCoh[1] * current, where
+// the coefficient pair is selected by aec->extended_filter_enabled and
+// aec->mult.
+//
+// In addition to updating the PSDs, the filter divergence state is also
+// determined, and the following actions are taken based on it:
+//  - if the filter is considered diverged, efw is overwritten with dfw;
+//  - if the extended filter is disabled and the summed se exceeds 19.95 times
+//    the summed sd, aec->wfBuf is zeroed.
+static void SmoothedPSD(AecCore* aec,
+                        float efw[2][PART_LEN1],
+                        float dfw[2][PART_LEN1],
+                        float xfw[2][PART_LEN1]) {
+  // Power estimate smoothing coefficients.
+  const float* ptrGCoh = aec->extended_filter_enabled
+      ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
+      : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
+  int i;
+  float sdSum = 0, seSum = 0;
+  // Lower bound applied to the instantaneous far-end power (see the scalar
+  // tail below for the rationale).
+  const __m128 vec_15 =  _mm_set1_ps(WebRtcAec_kMinFarendPSD);
+  const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]);
+  const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]);
+  __m128 vec_sdSum = _mm_set1_ps(0.0f);
+  __m128 vec_seSum = _mm_set1_ps(0.0f);
+
+  // Vectorized code (four bins at once).
+  for (i = 0; i + 3 < PART_LEN1; i += 4) {
+    const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]);
+    const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]);
+    const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]);
+    const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]);
+    const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]);
+    const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]);
+    // Old PSD values weighted by the first smoothing coefficient.
+    __m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0);
+    __m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0);
+    __m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0);
+    // Instantaneous power re^2 + im^2 of each spectrum.
+    __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0);
+    __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0);
+    __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0);
+    vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1));
+    vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1));
+    vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1));
+    vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15);
+    vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1));
+    vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1));
+    vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1));
+    _mm_storeu_ps(&aec->sd[i], vec_sd);
+    _mm_storeu_ps(&aec->se[i], vec_se);
+    _mm_storeu_ps(&aec->sx[i], vec_sx);
+
+    {
+      // sde stores [0]/[1] component pairs contiguously per bin. Two loads
+      // fetch bins i..i+1 and i+2..i+3; the shuffles split them into the four
+      // [0] components (vec_a) and the four [1] components (vec_b).
+      const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]);
+      const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]);
+      __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654,
+                                    _MM_SHUFFLE(2, 0, 2, 0));
+      __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654,
+                                    _MM_SHUFFLE(3, 1, 3, 1));
+      // vec_dfwefw0011 = dfw0 * efw0 + dfw1 * efw1
+      // vec_dfwefw0110 = dfw0 * efw1 - dfw1 * efw0
+      __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0);
+      __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1);
+      vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
+      vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
+      vec_dfwefw0011 = _mm_add_ps(vec_dfwefw0011,
+                                  _mm_mul_ps(vec_dfw1, vec_efw1));
+      vec_dfwefw0110 = _mm_sub_ps(vec_dfwefw0110,
+                                  _mm_mul_ps(vec_dfw1, vec_efw0));
+      vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1));
+      vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1));
+      // Re-interleave the components and store them back.
+      _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b));
+      _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b));
+    }
+
+    {
+      // Same scheme as for sde above, applied to sxd with dfw and xfw.
+      const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]);
+      const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]);
+      __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654,
+                                    _MM_SHUFFLE(2, 0, 2, 0));
+      __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654,
+                                    _MM_SHUFFLE(3, 1, 3, 1));
+      __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0);
+      __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1);
+      vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
+      vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
+      vec_dfwxfw0011 = _mm_add_ps(vec_dfwxfw0011,
+                                  _mm_mul_ps(vec_dfw1, vec_xfw1));
+      vec_dfwxfw0110 = _mm_sub_ps(vec_dfwxfw0110,
+                                  _mm_mul_ps(vec_dfw1, vec_xfw0));
+      vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1));
+      vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1));
+      _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b));
+      _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b));
+    }
+
+    vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd);
+    vec_seSum = _mm_add_ps(vec_seSum, vec_se);
+  }
+
+  // Reduce the per-lane sums to scalars; the scalar tail keeps adding to them.
+  _mm_add_ps_4x1(vec_sdSum, &sdSum);
+  _mm_add_ps_4x1(vec_seSum, &seSum);
+
+  // Scalar code for the remaining items.
+  for (; i < PART_LEN1; i++) {
+    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
+                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
+    aec->se[i] = ptrGCoh[0] * aec->se[i] +
+                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
+    // We threshold here to protect against the ill-effects of a zero farend.
+    // The threshold is not arbitrarily chosen, but balances protection and
+    // adverse interaction with the algorithm's tuning.
+    // TODO(bjornv): investigate further why this is so sensitive.
+    aec->sx[i] =
+        ptrGCoh[0] * aec->sx[i] +
+        ptrGCoh[1] * WEBRTC_SPL_MAX(
+            xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
+            WebRtcAec_kMinFarendPSD);
+
+    aec->sde[i][0] =
+        ptrGCoh[0] * aec->sde[i][0] +
+        ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
+    aec->sde[i][1] =
+        ptrGCoh[0] * aec->sde[i][1] +
+        ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
+
+    aec->sxd[i][0] =
+        ptrGCoh[0] * aec->sxd[i][0] +
+        ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
+    aec->sxd[i][1] =
+        ptrGCoh[0] * aec->sxd[i][1] +
+        ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);
+
+    sdSum += aec->sd[i];
+    seSum += aec->se[i];
+  }
+
+  // Divergent filter safeguard.
+  // Hysteresis: once diverged, seSum is weighted by 1.05 so that the state is
+  // only left when seSum drops clearly below sdSum.
+  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;
+
+  // While diverged, the error spectrum is replaced by the near-end spectrum.
+  if (aec->divergeState)
+    memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1);
+
+  // Reset if error is significantly larger than nearend (13 dB).
+  if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum))
+    memset(aec->wfBuf, 0, sizeof(aec->wfBuf));
+}
+
+// Windows 2 * PART_LEN time-domain samples of x for the fft and writes them
+// to x_windowed. The first PART_LEN samples are multiplied by the table
+// WebRtcAec_sqrtHanning read forwards from index 0, the second PART_LEN
+// samples by the same table read backwards from index PART_LEN.
+__inline static void WindowData(float* x_windowed, const float* x) {
+  int k;
+  for (k = 0; k < PART_LEN; k += 4) {
+    const __m128 first_half = _mm_loadu_ps(&x[k]);
+    const __m128 second_half = _mm_loadu_ps(&x[PART_LEN + k]);
+    // Coefficients k .. k + 3. This is an aligned load, so the table has to
+    // be 16-byte aligned.
+    const __m128 win_fwd = _mm_load_ps(&WebRtcAec_sqrtHanning[k]);
+    // Coefficients PART_LEN - k - 3 .. PART_LEN - k in memory order ...
+    const __m128 win_tail =
+        _mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - k - 3]);
+    // ... with the lanes reversed, so that lane 0 holds PART_LEN - k.
+    const __m128 win_rev =
+        _mm_shuffle_ps(win_tail, win_tail, _MM_SHUFFLE(0, 1, 2, 3));
+    _mm_storeu_ps(&x_windowed[k], _mm_mul_ps(first_half, win_fwd));
+    _mm_storeu_ps(&x_windowed[PART_LEN + k],
+                  _mm_mul_ps(second_half, win_rev));
+  }
+}
+
+// Puts fft output data into a complex valued array: the even-indexed entries
+// of |data| go to data_complex[0] and the odd-indexed entries to
+// data_complex[1]. Bins 0 and PART_LEN are special: their data_complex[0]
+// values come from data[0] and data[1] respectively, and their
+// data_complex[1] values are set to zero.
+__inline static void StoreAsComplex(const float* data,
+                                    float data_complex[2][PART_LEN1]) {
+  int k;
+  for (k = 0; k < PART_LEN; k += 4) {
+    // Eight consecutive floats, i.e. four interleaved pairs.
+    const __m128 pairs_lo = _mm_loadu_ps(&data[2 * k]);
+    const __m128 pairs_hi = _mm_loadu_ps(&data[2 * k + 4]);
+    // Even lanes to data_complex[0], odd lanes to data_complex[1].
+    _mm_storeu_ps(&data_complex[0][k],
+                  _mm_shuffle_ps(pairs_lo, pairs_hi, _MM_SHUFFLE(2, 0, 2, 0)));
+    _mm_storeu_ps(&data_complex[1][k],
+                  _mm_shuffle_ps(pairs_lo, pairs_hi, _MM_SHUFFLE(3, 1, 3, 1)));
+  }
+  // fix beginning/end values
+  data_complex[0][0] = data[0];
+  data_complex[1][0] = 0;
+  data_complex[0][PART_LEN] = data[1];
+  data_complex[1][PART_LEN] = 0;
+}
+
+// Computes the per-bin coherence measures
+//   cohde[i] = |sde[i]|^2 / (sd[i] * se[i] + 1e-10)
+//   cohxd[i] = |sxd[i]|^2 / (sx[i] * sd[i] + 1e-10)
+// from the smoothed PSDs stored in aec, after updating those PSDs via
+// SmoothedPSD().
+//
+// Outputs and side effects:
+//  - xfw is overwritten with data copied from aec->xfwBuf at the partition
+//    given by aec->delayIdx,
+//  - efw is overwritten with the spectrum of the windowed aec->eBuf (and may
+//    be modified further by SmoothedPSD()),
+//  - fft is used as scratch space,
+//  - aec->delayIdx is recomputed when aec->delayEstCtr is zero.
+static void SubbandCoherenceSSE2(AecCore* aec,
+                                 float efw[2][PART_LEN1],
+                                 float xfw[2][PART_LEN1],
+                                 float* fft,
+                                 float* cohde,
+                                 float* cohxd) {
+  float dfw[2][PART_LEN1];
+  int i;
+
+  // Re-estimate the delay only when the counter is zero.
+  if (aec->delayEstCtr == 0)
+    aec->delayIdx = PartitionDelay(aec);
+
+  // Use delayed far.
+  memcpy(xfw,
+         aec->xfwBuf + aec->delayIdx * PART_LEN1,
+         sizeof(xfw[0][0]) * 2 * PART_LEN1);
+
+  // Windowed near fft
+  WindowData(fft, aec->dBuf);
+  aec_rdft_forward_128(fft);
+  StoreAsComplex(fft, dfw);
+
+  // Windowed error fft
+  WindowData(fft, aec->eBuf);
+  aec_rdft_forward_128(fft);
+  StoreAsComplex(fft, efw);
+
+  SmoothedPSD(aec, efw, dfw, xfw);
+
+  {
+    const __m128 vec_1eminus10 =  _mm_set1_ps(1e-10f);
+
+    // Subband coherence
+    for (i = 0; i + 3 < PART_LEN1; i += 4) {
+      const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]);
+      const __m128 vec_se = _mm_loadu_ps(&aec->se[i]);
+      const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]);
+      // Denominators; the small constant keeps them non-zero when the
+      // product is zero.
+      const __m128 vec_sdse = _mm_add_ps(vec_1eminus10,
+                                         _mm_mul_ps(vec_sd, vec_se));
+      const __m128 vec_sdsx = _mm_add_ps(vec_1eminus10,
+                                         _mm_mul_ps(vec_sd, vec_sx));
+      // sde and sxd store [0]/[1] component pairs contiguously per bin, so
+      // two loads cover bins i..i+1 and i+2..i+3.
+      const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]);
+      const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]);
+      const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]);
+      const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]);
+      // De-interleave into the four [0] components (_0) and the four [1]
+      // components (_1).
+      const __m128 vec_sde_0 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654,
+                                              _MM_SHUFFLE(2, 0, 2, 0));
+      const __m128 vec_sde_1 = _mm_shuffle_ps(vec_sde_3210, vec_sde_7654,
+                                              _MM_SHUFFLE(3, 1, 3, 1));
+      const __m128 vec_sxd_0 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654,
+                                              _MM_SHUFFLE(2, 0, 2, 0));
+      const __m128 vec_sxd_1 = _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654,
+                                              _MM_SHUFFLE(3, 1, 3, 1));
+      // Squared magnitude divided by the product of the auto-PSDs.
+      __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0);
+      __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0);
+      vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1));
+      vec_cohde = _mm_div_ps(vec_cohde, vec_sdse);
+      vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1));
+      vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx);
+      _mm_storeu_ps(&cohde[i], vec_cohde);
+      _mm_storeu_ps(&cohxd[i], vec_cohxd);
+    }
+
+    // scalar code for the remaining items.
+    for (; i < PART_LEN1; i++) {
+      cohde[i] =
+          (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
+          (aec->sd[i] * aec->se[i] + 1e-10f);
+      cohxd[i] =
+          (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
+          (aec->sx[i] * aec->sd[i] + 1e-10f);
+    }
+  }
+}
+
+// Points the AEC kernel hooks at the SSE2 implementations defined in this
+// file.
+void WebRtcAec_InitAec_SSE2(void) {
+  // The five hooks are independent of each other, so the assignment order
+  // carries no meaning.
+  WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2;
+  WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
+  WebRtcAec_FilterAdaptation = FilterAdaptationSSE2;
+  WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2;
+  WebRtcAec_FilterFar = FilterFarSSE2;
+}
diff --git a/webrtc/modules/audio_processing/aec/aec_rdft.c b/webrtc/modules/audio_processing/aec/aec_rdft.c
new file mode 100644
index 0000000000..03efc103ea
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_rdft.c
@@ -0,0 +1,589 @@
+/*
+ * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
+ * Copyright Takuya OOURA, 1996-2001
+ *
+ * You may use, copy, modify and distribute this code for any purpose (include
+ * commercial use) and without fee. Please refer to this package when you modify
+ * this code.
+ *
+ * Changes by the WebRTC authors:
+ *    - Trivial type modifications.
+ *    - Minimal code subset to do rdft of length 128.
+ *    - Optimizations because of known length.
+ *
+ *  All changes are covered by the WebRTC license and IP grant:
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
+
+#include <math.h>
+
+#include "webrtc/system_wrappers/include/cpu_features_wrapper.h"
+#include "webrtc/typedefs.h"
+/*
+ * Precomputed constant tables for the 128-point transform in this file.
+ *  - rdft_w: cft1st_128_C and cftmdl_128_C read entries below index 32 as
+ *    twiddle factors; rftfsub_128_C and rftbsub_128_C read the upper half
+ *    through the pointer rdft_w + 32.
+ *  - rdft_wk3ri_first / rdft_wk3ri_second: read by cft1st_128_C and
+ *    cftmdl_128_C as consecutive (wk3r, wk3i) pairs.
+ *  - rdft_wk1r .. rdft_wk3i and cftmdl_wk1r: not referenced by the C routines
+ *    in this file; aec_rdft.h describes them as used by the SSE2 and NEON
+ *    paths. NOTE(review): ALIGN16_BEG/ALIGN16_END are defined elsewhere and
+ *    presumably request 16-byte alignment -- confirm.
+ */
+// These tables used to be computed at run-time. For example, refer to:
+// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/aec/aec_rdft.c?r=6564
+// to see the initialization code.
+const float rdft_w[64] = {
+    1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f,
+    0.9238795638f, 0.3826834559f, 0.3826834559f, 0.9238795638f,
+    0.9807852507f, 0.1950903237f, 0.5555702448f, 0.8314695954f,
+    0.8314695954f, 0.5555702448f, 0.1950903237f, 0.9807852507f,
+    0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f,
+    0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f,
+    0.9569403529f, 0.2902846634f, 0.4713967443f, 0.8819212914f,
+    0.7730104327f, 0.6343933344f, 0.0980171412f, 0.9951847196f,
+    0.7071067691f, 0.4993977249f, 0.4975923598f, 0.4945882559f,
+    0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f,
+    0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f,
+    0.4157347977f, 0.4016037583f, 0.3865052164f, 0.3704755902f,
+    0.3535533845f, 0.3357794881f, 0.3171966672f, 0.2978496552f,
+    0.2777851224f, 0.2570513785f, 0.2356983721f, 0.2137775421f,
+    0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f,
+    0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f,
+};
+const float rdft_wk3ri_first[16] = {
+    1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f,
+    0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f,
+    0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f,
+    0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f,
+};
+const float rdft_wk3ri_second[16] = {
+    -0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f,
+    -0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f,
+    -0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f,
+    -0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
+    1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f,
+    0.923879564f, 0.923879564f, 0.382683456f, 0.382683456f,
+    0.980785251f, 0.980785251f, 0.555570245f, 0.555570245f,
+    0.831469595f, 0.831469595f, 0.195090324f, 0.195090324f,
+    0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
+    0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f,
+    0.956940353f, 0.956940353f, 0.471396744f, 0.471396744f,
+    0.773010433f, 0.773010433f, 0.098017141f, 0.098017141f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
+    1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f,
+    0.707106769f, 0.707106769f, -0.707106769f, -0.707106769f,
+    0.923879564f, 0.923879564f, -0.382683456f, -0.382683456f,
+    0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
+    0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f,
+    0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f,
+    0.831469595f, 0.831469595f, -0.555570245f, -0.555570245f,
+    0.195090324f, 0.195090324f, -0.980785251f, -0.980785251f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
+    1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f,
+    0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
+    0.831469536f, 0.831469536f, -0.980785251f, -0.980785251f,
+    -0.195090353f, -0.195090353f, -0.555570245f, -0.555570245f,
+    0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f,
+    0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f,
+    0.634393334f, 0.634393334f, -0.995184720f, -0.995184720f,
+    -0.471396863f, -0.471396863f, -0.290284693f, -0.290284693f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
+    -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
+    -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
+    -0.195090324f, 0.195090324f, -0.831469595f, 0.831469595f,
+    -0.555570245f, 0.555570245f, -0.980785251f, 0.980785251f,
+    -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f,
+    -0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f,
+    -0.290284663f, 0.290284663f, -0.881921291f, 0.881921291f,
+    -0.634393334f, 0.634393334f, -0.995184720f, 0.995184720f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
+    -0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f,
+    -0.707106769f, 0.707106769f, -0.707106769f, 0.707106769f,
+    -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
+    -0.923879564f, 0.923879564f, -0.382683456f, 0.382683456f,
+    -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f,
+    -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f,
+    -0.555570245f, 0.555570245f, -0.831469595f, 0.831469595f,
+    -0.980785251f, 0.980785251f, -0.195090324f, 0.195090324f,
+};
+ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
+    -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
+    -0.923879564f, 0.923879564f, 0.382683456f, -0.382683456f,
+    -0.555570245f, 0.555570245f, -0.195090353f, 0.195090353f,
+    -0.980785251f, 0.980785251f, 0.831469536f, -0.831469536f,
+    -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f,
+    -0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f,
+    -0.773010492f, 0.773010492f, 0.098017156f, -0.098017156f,
+    -0.881921172f, 0.881921172f, 0.956940353f, -0.956940353f,
+};
+ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
+    0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f,
+};
+
+// Exchanges the two consecutive floats starting at a[p] with the two
+// consecutive floats starting at a[q].
+static void swap_pair_C(float* a, unsigned int p, unsigned int q) {
+  const float p0 = a[p + 0];
+  const float p1 = a[p + 1];
+  const float q0 = a[q + 0];
+  const float q1 = a[q + 1];
+  a[p + 0] = q0;
+  a[p + 1] = q1;
+  a[q + 0] = p0;
+  a[q + 1] = p1;
+}
+
+// Reorders the 128-float buffer |a| in place by exchanging fixed pairs of
+// two-float elements (Ooura's bitrv2 step, specialised for length 128). It is
+// installed behind the bitrv2_128 pointer by aec_rdft_init().
+static void bitrv2_128_C(float* a) {
+  /*
+      Following things have been attempted but are no faster:
+      (a) Storing the swap indexes in a LUT (index calculations are done
+          for 'free' while waiting on memory/L1).
+      (b) Consolidate the load/store of two consecutive floats by a 64 bit
+          integer (execution is memory/L1 bound).
+      (c) Do a mix of floats and 64 bit integer to maximize register
+          utilization (execution is memory/L1 bound).
+      (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
+      (e) Hard-coding of the offsets to completely eliminate index
+          calculations.
+  */
+  static const int ip[4] = {0, 64, 32, 96};
+  unsigned int k;
+
+  for (k = 0; k < 4; k++) {
+    unsigned int j;
+    unsigned int diag;
+
+    for (j = 0; j < k; j++) {
+      const unsigned int j1 = 2 * j + ip[k];
+      const unsigned int k1 = 2 * k + ip[j];
+      // Four exchanges per (j, k); the second-side offsets go 0, 16, 8, 24.
+      swap_pair_C(a, j1, k1);
+      swap_pair_C(a, j1 + 8, k1 + 16);
+      swap_pair_C(a, j1 + 16, k1 + 8);
+      swap_pair_C(a, j1 + 24, k1 + 24);
+    }
+    // One extra exchange per k, between elements 8 floats apart.
+    diag = 2 * k + 8 + ip[k];
+    swap_pair_C(a, diag, diag + 8);
+  }
+}
+/*
+ * First butterfly pass over the 128-float buffer |a|, performed in place.
+ * The buffer is handled in groups of 16 consecutive floats; each group is
+ * processed as two four-element butterflies (floats 0..7 and 8..15 of the
+ * group). The group a[0..15] is special-cased before the loop, and the loop
+ * covers a[16..127] with twiddles read from rdft_w, rdft_wk3ri_first and
+ * rdft_wk3ri_second. Installed behind the cft1st_128 pointer by
+ * aec_rdft_init().
+ */
+static void cft1st_128_C(float* a) {
+  const int n = 128;
+  int j, k1, k2;
+  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  // The processing of the first set of elements was simplified in C to avoid
+  // some operations (multiplication by zero or one, addition of two elements
+  // multiplied by the same weight, ...).
+  x0r = a[0] + a[2];
+  x0i = a[1] + a[3];
+  x1r = a[0] - a[2];
+  x1i = a[1] - a[3];
+  x2r = a[4] + a[6];
+  x2i = a[5] + a[7];
+  x3r = a[4] - a[6];
+  x3i = a[5] - a[7];
+  a[0] = x0r + x2r;
+  a[1] = x0i + x2i;
+  a[4] = x0r - x2r;
+  a[5] = x0i - x2i;
+  a[2] = x1r - x3i;
+  a[3] = x1i + x3r;
+  a[6] = x1r + x3i;
+  a[7] = x1i - x3r;
+  wk1r = rdft_w[2];  // The only twiddle used for a[8..15].
+  x0r = a[8] + a[10];
+  x0i = a[9] + a[11];
+  x1r = a[8] - a[10];
+  x1i = a[9] - a[11];
+  x2r = a[12] + a[14];
+  x2i = a[13] + a[15];
+  x3r = a[12] - a[14];
+  x3i = a[13] - a[15];
+  a[8] = x0r + x2r;
+  a[9] = x0i + x2i;
+  a[12] = x2i - x0i;
+  a[13] = x0r - x2r;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  a[10] = wk1r * (x0r - x0i);
+  a[11] = wk1r * (x0r + x0i);
+  x0r = x3i + x1r;
+  x0i = x3r - x1i;
+  a[14] = wk1r * (x0i - x0r);
+  a[15] = wk1r * (x0i + x0r);
+  k1 = 0;
+  for (j = 16; j < n; j += 16) {  // Seven remaining 16-float groups.
+    k1 += 2;  // k1 = 2, 4, ..., 14
+    k2 = 2 * k1;
+    wk2r = rdft_w[k1 + 0];
+    wk2i = rdft_w[k1 + 1];
+    wk1r = rdft_w[k2 + 0];
+    wk1i = rdft_w[k2 + 1];
+    wk3r = rdft_wk3ri_first[k1 + 0];
+    wk3i = rdft_wk3ri_first[k1 + 1];
+    x0r = a[j + 0] + a[j + 2];
+    x0i = a[j + 1] + a[j + 3];
+    x1r = a[j + 0] - a[j + 2];
+    x1i = a[j + 1] - a[j + 3];
+    x2r = a[j + 4] + a[j + 6];
+    x2i = a[j + 5] + a[j + 7];
+    x3r = a[j + 4] - a[j + 6];
+    x3i = a[j + 5] - a[j + 7];
+    a[j + 0] = x0r + x2r;
+    a[j + 1] = x0i + x2i;
+    x0r -= x2r;
+    x0i -= x2i;
+    a[j + 4] = wk2r * x0r - wk2i * x0i;
+    a[j + 5] = wk2r * x0i + wk2i * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j + 2] = wk1r * x0r - wk1i * x0i;
+    a[j + 3] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j + 6] = wk3r * x0r - wk3i * x0i;
+    a[j + 7] = wk3r * x0i + wk3i * x0r;
+    wk1r = rdft_w[k2 + 2];  // Twiddles for floats 8..15 of the group.
+    wk1i = rdft_w[k2 + 3];
+    wk3r = rdft_wk3ri_second[k1 + 0];
+    wk3i = rdft_wk3ri_second[k1 + 1];
+    x0r = a[j + 8] + a[j + 10];
+    x0i = a[j + 9] + a[j + 11];
+    x1r = a[j + 8] - a[j + 10];
+    x1i = a[j + 9] - a[j + 11];
+    x2r = a[j + 12] + a[j + 14];
+    x2i = a[j + 13] + a[j + 15];
+    x3r = a[j + 12] - a[j + 14];
+    x3i = a[j + 13] - a[j + 15];
+    a[j + 8] = x0r + x2r;
+    a[j + 9] = x0i + x2i;
+    x0r -= x2r;
+    x0i -= x2i;
+    a[j + 12] = -wk2i * x0r - wk2r * x0i;
+    a[j + 13] = -wk2i * x0i + wk2r * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j + 10] = wk1r * x0r - wk1i * x0i;
+    a[j + 11] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j + 14] = wk3r * x0r - wk3i * x0i;
+    a[j + 15] = wk3r * x0i + wk3i * x0r;
+  }
+}
+/*
+ * Second butterfly pass over the 128-float buffer |a|, performed in place.
+ * Each butterfly combines the float pairs at j0, j0 + 8, j0 + 16 and j0 + 24,
+ * so every loop below stays inside one 32-float section of |a|:
+ *   a[0..31]    no twiddle multiplications,
+ *   a[32..63]   only rdft_w[2],
+ *   a[64..95]   twiddles from rdft_w and rdft_wk3ri_first,
+ *   a[96..127]  twiddles from rdft_w and rdft_wk3ri_second.
+ * Installed behind the cftmdl_128 pointer by aec_rdft_init().
+ */
+static void cftmdl_128_C(float* a) {
+  const int l = 8;
+  const int n = 128;
+  const int m = 32;
+  int j0, j1, j2, j3, k, k1, k2, m2;
+  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  for (j0 = 0; j0 < l; j0 += 2) {  // Section a[0..31].
+    j1 = j0 + 8;
+    j2 = j0 + 16;
+    j3 = j0 + 24;
+    x0r = a[j0 + 0] + a[j1 + 0];
+    x0i = a[j0 + 1] + a[j1 + 1];
+    x1r = a[j0 + 0] - a[j1 + 0];
+    x1i = a[j0 + 1] - a[j1 + 1];
+    x2r = a[j2 + 0] + a[j3 + 0];
+    x2i = a[j2 + 1] + a[j3 + 1];
+    x3r = a[j2 + 0] - a[j3 + 0];
+    x3i = a[j2 + 1] - a[j3 + 1];
+    a[j0 + 0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j2 + 0] = x0r - x2r;
+    a[j2 + 1] = x0i - x2i;
+    a[j1 + 0] = x1r - x3i;
+    a[j1 + 1] = x1i + x3r;
+    a[j3 + 0] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+  }
+  wk1r = rdft_w[2];
+  for (j0 = m; j0 < l + m; j0 += 2) {  // Section a[32..63].
+    j1 = j0 + 8;
+    j2 = j0 + 16;
+    j3 = j0 + 24;
+    x0r = a[j0 + 0] + a[j1 + 0];
+    x0i = a[j0 + 1] + a[j1 + 1];
+    x1r = a[j0 + 0] - a[j1 + 0];
+    x1i = a[j0 + 1] - a[j1 + 1];
+    x2r = a[j2 + 0] + a[j3 + 0];
+    x2i = a[j2 + 1] + a[j3 + 1];
+    x3r = a[j2 + 0] - a[j3 + 0];
+    x3i = a[j2 + 1] - a[j3 + 1];
+    a[j0 + 0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j2 + 0] = x2i - x0i;
+    a[j2 + 1] = x0r - x2r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j1 + 0] = wk1r * (x0r - x0i);
+    a[j1 + 1] = wk1r * (x0r + x0i);
+    x0r = x3i + x1r;
+    x0i = x3r - x1i;
+    a[j3 + 0] = wk1r * (x0i - x0r);
+    a[j3 + 1] = wk1r * (x0i + x0r);
+  }
+  k1 = 0;
+  m2 = 2 * m;
+  for (k = m2; k < n; k += m2) {  // Runs once, with k = 64.
+    k1 += 2;
+    k2 = 2 * k1;
+    wk2r = rdft_w[k1 + 0];
+    wk2i = rdft_w[k1 + 1];
+    wk1r = rdft_w[k2 + 0];
+    wk1i = rdft_w[k2 + 1];
+    wk3r = rdft_wk3ri_first[k1 + 0];
+    wk3i = rdft_wk3ri_first[k1 + 1];
+    for (j0 = k; j0 < l + k; j0 += 2) {  // Section a[64..95].
+      j1 = j0 + 8;
+      j2 = j0 + 16;
+      j3 = j0 + 24;
+      x0r = a[j0 + 0] + a[j1 + 0];
+      x0i = a[j0 + 1] + a[j1 + 1];
+      x1r = a[j0 + 0] - a[j1 + 0];
+      x1i = a[j0 + 1] - a[j1 + 1];
+      x2r = a[j2 + 0] + a[j3 + 0];
+      x2i = a[j2 + 1] + a[j3 + 1];
+      x3r = a[j2 + 0] - a[j3 + 0];
+      x3i = a[j2 + 1] - a[j3 + 1];
+      a[j0 + 0] = x0r + x2r;
+      a[j0 + 1] = x0i + x2i;
+      x0r -= x2r;
+      x0i -= x2i;
+      a[j2 + 0] = wk2r * x0r - wk2i * x0i;
+      a[j2 + 1] = wk2r * x0i + wk2i * x0r;
+      x0r = x1r - x3i;
+      x0i = x1i + x3r;
+      a[j1 + 0] = wk1r * x0r - wk1i * x0i;
+      a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+      x0r = x1r + x3i;
+      x0i = x1i - x3r;
+      a[j3 + 0] = wk3r * x0r - wk3i * x0i;
+      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+    }
+    wk1r = rdft_w[k2 + 2];
+    wk1i = rdft_w[k2 + 3];
+    wk3r = rdft_wk3ri_second[k1 + 0];
+    wk3i = rdft_wk3ri_second[k1 + 1];
+    for (j0 = k + m; j0 < l + (k + m); j0 += 2) {  // Section a[96..127].
+      j1 = j0 + 8;
+      j2 = j0 + 16;
+      j3 = j0 + 24;
+      x0r = a[j0 + 0] + a[j1 + 0];
+      x0i = a[j0 + 1] + a[j1 + 1];
+      x1r = a[j0 + 0] - a[j1 + 0];
+      x1i = a[j0 + 1] - a[j1 + 1];
+      x2r = a[j2 + 0] + a[j3 + 0];
+      x2i = a[j2 + 1] + a[j3 + 1];
+      x3r = a[j2 + 0] - a[j3 + 0];
+      x3i = a[j2 + 1] - a[j3 + 1];
+      a[j0 + 0] = x0r + x2r;
+      a[j0 + 1] = x0i + x2i;
+      x0r -= x2r;
+      x0i -= x2i;
+      a[j2 + 0] = -wk2i * x0r - wk2r * x0i;
+      a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
+      x0r = x1r - x3i;
+      x0i = x1i + x3r;
+      a[j1 + 0] = wk1r * x0r - wk1i * x0i;
+      a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+      x0r = x1r + x3i;
+      x0i = x1i - x3r;
+      a[j3 + 0] = wk3r * x0r - wk3i * x0i;
+      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+    }
+  }
+}
+
+// Forward-direction core used by aec_rdft_forward_128(): runs the cft1st_128
+// and cftmdl_128 passes through their function pointers, then a final
+// twiddle-free butterfly pass that combines the four 32-float quarters of the
+// 128-float buffer |a| in place.
+static void cftfsub_128_C(float* a) {
+  const int quarter = 32;
+  int i;
+
+  cft1st_128(a);
+  cftmdl_128(a);
+  for (i = 0; i < quarter; i += 2) {
+    // One two-float element from each quarter.
+    float* const q0 = a + i;
+    float* const q1 = q0 + quarter;
+    float* const q2 = q1 + quarter;
+    float* const q3 = q2 + quarter;
+    // All inputs are read before any output is written.
+    const float sum01_r = q0[0] + q1[0];
+    const float sum01_i = q0[1] + q1[1];
+    const float dif01_r = q0[0] - q1[0];
+    const float dif01_i = q0[1] - q1[1];
+    const float sum23_r = q2[0] + q3[0];
+    const float sum23_i = q2[1] + q3[1];
+    const float dif23_r = q2[0] - q3[0];
+    const float dif23_i = q2[1] - q3[1];
+
+    q0[0] = sum01_r + sum23_r;
+    q0[1] = sum01_i + sum23_i;
+    q2[0] = sum01_r - sum23_r;
+    q2[1] = sum01_i - sum23_i;
+    q1[0] = dif01_r - dif23_i;
+    q1[1] = dif01_i + dif23_r;
+    q3[0] = dif01_r + dif23_i;
+    q3[1] = dif01_i - dif23_r;
+  }
+}
+
+// Backward-direction core used by aec_rdft_inverse_128(): runs the cft1st_128
+// and cftmdl_128 passes through their function pointers, then a final
+// butterfly pass over the four 32-float quarters of |a|. Compared with
+// cftfsub_128_C, the odd-indexed (second) float of each element in the first
+// two quarters enters with its sign flipped, and the signs of the
+// corresponding output terms change accordingly.
+static void cftbsub_128_C(float* a) {
+  const int quarter = 32;
+  int i;
+
+  cft1st_128(a);
+  cftmdl_128(a);
+
+  for (i = 0; i < quarter; i += 2) {
+    // One two-float element from each quarter.
+    float* const q0 = a + i;
+    float* const q1 = q0 + quarter;
+    float* const q2 = q1 + quarter;
+    float* const q3 = q2 + quarter;
+    // All inputs are read before any output is written.
+    const float sum01_r = q0[0] + q1[0];
+    const float sum01_i = -q0[1] - q1[1];
+    const float dif01_r = q0[0] - q1[0];
+    const float dif01_i = -q0[1] + q1[1];
+    const float sum23_r = q2[0] + q3[0];
+    const float sum23_i = q2[1] + q3[1];
+    const float dif23_r = q2[0] - q3[0];
+    const float dif23_i = q2[1] - q3[1];
+
+    q0[0] = sum01_r + sum23_r;
+    q0[1] = sum01_i - sum23_i;
+    q2[0] = sum01_r - sum23_r;
+    q2[1] = sum01_i + sum23_i;
+    q1[0] = dif01_r - dif23_i;
+    q1[1] = dif01_i - dif23_r;
+    q3[0] = dif01_r + dif23_i;
+    q3[1] = dif01_i + dif23_r;
+  }
+}
+
+// Post-processing step of aec_rdft_forward_128(). For idx = 1..31 it combines
+// the two-float element at a[2 * idx] with its mirror at a[128 - 2 * idx],
+// using weights taken from the upper half of rdft_w. a[0], a[1], a[64] and
+// a[65] are not touched here.
+static void rftfsub_128_C(float* a) {
+  const float* const c = rdft_w + 32;
+  int idx;
+
+  for (idx = 1; idx < 32; ++idx) {
+    float* const lo = a + 2 * idx;
+    float* const hi = a + (128 - 2 * idx);
+    const float wkr = 0.5f - c[32 - idx];
+    const float wki = c[idx];
+    const float xr = lo[0] - hi[0];
+    const float xi = lo[1] + hi[1];
+    const float yr = wkr * xr - wki * xi;
+    const float yi = wkr * xi + wki * xr;
+
+    lo[0] -= yr;
+    lo[1] -= yi;
+    hi[0] += yr;
+    hi[1] -= yi;
+  }
+}
+
+// Pre-processing step of aec_rdft_inverse_128(). Negates a[1] and a[65], and
+// for idx = 1..31 combines the two-float element at a[2 * idx] with its
+// mirror at a[128 - 2 * idx], using weights taken from the upper half of
+// rdft_w.
+static void rftbsub_128_C(float* a) {
+  const float* const c = rdft_w + 32;
+  int idx;
+
+  a[1] = -a[1];
+  for (idx = 1; idx < 32; ++idx) {
+    float* const lo = a + 2 * idx;
+    float* const hi = a + (128 - 2 * idx);
+    const float wkr = 0.5f - c[32 - idx];
+    const float wki = c[idx];
+    const float xr = lo[0] - hi[0];
+    const float xi = lo[1] + hi[1];
+    const float yr = wkr * xr + wki * xi;
+    const float yi = wkr * xi - wki * xr;
+
+    lo[0] = lo[0] - yr;
+    lo[1] = yi - lo[1];
+    hi[0] = yr + hi[0];
+    hi[1] = yi - hi[1];
+  }
+  a[65] = -a[65];
+}
+
+// Forward transform of the 128 floats in |a|, computed in place. The three
+// stages are called through the function pointers that aec_rdft_init() sets,
+// so aec_rdft_init() must have run first.
+void aec_rdft_forward_128(float* a) {
+  float sum;
+  float diff;
+
+  bitrv2_128(a);
+  cftfsub_128(a);
+  rftfsub_128(a);
+  // Replace the first two entries by their sum and their difference.
+  diff = a[0] - a[1];
+  sum = a[0] + a[1];
+  a[0] = sum;
+  a[1] = diff;
+}
+
+// Inverse-direction transform of the 128 floats in |a|, computed in place.
+// The three stages are called through the function pointers that
+// aec_rdft_init() sets, so aec_rdft_init() must have run first.
+void aec_rdft_inverse_128(float* a) {
+  float* const first = &a[0];
+  float* const second = &a[1];
+
+  // a[1] becomes half the difference of the first two entries; a[0] then
+  // drops by that new a[1].
+  *second = 0.5f * (*first - *second);
+  *first = *first - *second;
+  rftbsub_128(a);
+  bitrv2_128(a);
+  cftbsub_128(a);
+}
+
+// Code path selection: these pointers are assigned in aec_rdft_init().
+RftSub128 cft1st_128;
+RftSub128 cftmdl_128;
+RftSub128 rftfsub_128;
+RftSub128 rftbsub_128;
+RftSub128 cftfsub_128;
+RftSub128 cftbsub_128;
+RftSub128 bitrv2_128;
+
+// Installs an implementation behind every RftSub128 function pointer. The
+// portable C routines are assigned first; the platform init hooks selected by
+// the preprocessor run afterwards, in the order x86, MIPS, NEON. Those hooks
+// are defined outside this file and presumably replace some of the pointers
+// with optimized versions.
+void aec_rdft_init(void) {
+  bitrv2_128 = bitrv2_128_C;
+  cftbsub_128 = cftbsub_128_C;
+  cftfsub_128 = cftfsub_128_C;
+  rftbsub_128 = rftbsub_128_C;
+  rftfsub_128 = rftfsub_128_C;
+  cftmdl_128 = cftmdl_128_C;
+  cft1st_128 = cft1st_128_C;
+
+#if defined(WEBRTC_ARCH_X86_FAMILY)
+  // The SSE2 hook is called only when WebRtc_GetCPUInfo() reports kSSE2.
+  if (WebRtc_GetCPUInfo(kSSE2) != 0) {
+    aec_rdft_init_sse2();
+  }
+#endif
+
+#if defined(MIPS_FPU_LE)
+  aec_rdft_init_mips();
+#endif
+
+#if defined(WEBRTC_HAS_NEON)
+  // With WEBRTC_HAS_NEON the NEON hook is called unconditionally.
+  aec_rdft_init_neon();
+#elif defined(WEBRTC_DETECT_NEON)
+  // Otherwise it is called only if the CPU feature flags include NEON.
+  if (WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) {
+    aec_rdft_init_neon();
+  }
+#endif
+}
diff --git a/webrtc/modules/audio_processing/aec/aec_rdft.h b/webrtc/modules/audio_processing/aec/aec_rdft.h
new file mode 100644
index 0000000000..18eb7a5c3f
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_rdft.h
@@ -0,0 +1,61 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
+
+#include "webrtc/modules/audio_processing/aec/aec_common.h"
+
+// These intrinsics were unavailable before VS 2008.
+// TODO(andrew): move to a common file.
+#if defined(_MSC_VER) && _MSC_VER < 1500
+#include <emmintrin.h>
+static __inline __m128 _mm_castsi128_ps(__m128i a) { return *(__m128*)&a; }
+static __inline __m128i _mm_castps_si128(__m128 a) { return *(__m128i*)&a; }
+#endif
+
+// Constants shared by all paths (C, SSE2, NEON, MIPS).
+extern const float rdft_w[64];
+// Constants used by the C and MIPS paths.
+extern const float rdft_wk3ri_first[16];
+extern const float rdft_wk3ri_second[16];
+// Constants used by SSE2 and NEON but defined in aec_rdft.c (the C path).
+extern ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32];
+extern ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32];
+extern ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32];
+extern ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32];
+extern ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32];
+extern ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32];
+extern ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4];
+
+// Code path selection function pointers; assigned by aec_rdft_init().
+typedef void (*RftSub128)(float* a);
+extern RftSub128 rftfsub_128;
+extern RftSub128 rftbsub_128;
+extern RftSub128 cft1st_128;
+extern RftSub128 cftmdl_128;
+extern RftSub128 cftfsub_128;
+extern RftSub128 cftbsub_128;
+extern RftSub128 bitrv2_128;
+
+// Entry points. Call aec_rdft_init() before either transform.
+void aec_rdft_init(void);
+void aec_rdft_init_sse2(void);
+void aec_rdft_forward_128(float* a);  // Transforms a[0..127] in place.
+void aec_rdft_inverse_128(float* a);  // Transforms a[0..127] in place.
+
+#if defined(MIPS_FPU_LE)
+void aec_rdft_init_mips(void);
+#endif
+#if defined(WEBRTC_DETECT_NEON) || defined(WEBRTC_HAS_NEON)
+void aec_rdft_init_neon(void);
+#endif
+
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
diff --git a/webrtc/modules/audio_processing/aec/aec_rdft_mips.c b/webrtc/modules/audio_processing/aec/aec_rdft_mips.c
new file mode 100644
index 0000000000..7e64e65716
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_rdft_mips.c
@@ -0,0 +1,1187 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
+#include "webrtc/typedefs.h"
+
+// Exchanges the two consecutive floats starting at a[p] with the two
+// consecutive floats starting at a[q].
+static void swap_pair_mips(float* a, int p, int q) {
+  const float p0 = a[p];
+  const float p1 = a[p + 1];
+  const float q0 = a[q];
+  const float q1 = a[q + 1];
+  a[p] = q0;
+  a[p + 1] = q1;
+  a[q] = p0;
+  a[q + 1] = p1;
+}
+
+// Fixed in-place reordering of the 128-float buffer |a| (n is 128), written
+// as 28 exchanges with constant indices. The index pairs and their order are
+// the ones the loops of bitrv2_128_C() in aec_rdft.c generate; the grouping
+// below follows that function's outer loop variable k.
+static void bitrv2_128_mips(float* a) {
+  // k = 0
+  swap_pair_mips(a, 8, 16);
+
+  // k = 1
+  swap_pair_mips(a, 64, 2);
+  swap_pair_mips(a, 72, 18);
+  swap_pair_mips(a, 80, 10);
+  swap_pair_mips(a, 88, 26);
+  swap_pair_mips(a, 74, 82);
+
+  // k = 2
+  swap_pair_mips(a, 32, 4);
+  swap_pair_mips(a, 40, 20);
+  swap_pair_mips(a, 48, 12);
+  swap_pair_mips(a, 56, 28);
+  swap_pair_mips(a, 34, 68);
+  swap_pair_mips(a, 42, 84);
+  swap_pair_mips(a, 50, 76);
+  swap_pair_mips(a, 58, 92);
+  swap_pair_mips(a, 44, 52);
+
+  // k = 3
+  swap_pair_mips(a, 96, 6);
+  swap_pair_mips(a, 104, 22);
+  swap_pair_mips(a, 112, 14);
+  swap_pair_mips(a, 120, 30);
+  swap_pair_mips(a, 98, 70);
+  swap_pair_mips(a, 106, 86);
+  swap_pair_mips(a, 114, 78);
+  swap_pair_mips(a, 122, 94);
+  swap_pair_mips(a, 100, 38);
+  swap_pair_mips(a, 108, 54);
+  swap_pair_mips(a, 116, 46);
+  swap_pair_mips(a, 124, 62);
+  swap_pair_mips(a, 110, 118);
+}
+
+static void cft1st_128_mips(float* a) {
+  float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14;
+  int a_ptr, p1_rdft, p2_rdft, count;
+  const float* first = rdft_wk3ri_first;
+  const float* second = rdft_wk3ri_second;
+
+  __asm __volatile (
+    ".set       push                                                    \n\t"
+    ".set       noreorder                                               \n\t"
+    // first 8
+    "lwc1       %[f0],        0(%[a])                                   \n\t"
+    "lwc1       %[f1],        4(%[a])                                   \n\t"
+    "lwc1       %[f2],        8(%[a])                                   \n\t"
+    "lwc1       %[f3],        12(%[a])                                  \n\t"
+    "lwc1       %[f4],        16(%[a])                                  \n\t"
+    "lwc1       %[f5],        20(%[a])                                  \n\t"
+    "lwc1       %[f6],        24(%[a])                                  \n\t"
+    "lwc1       %[f7],        28(%[a])                                  \n\t"
+    "add.s      %[f8],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "add.s      %[f7],        %[f8],        %[f2]                       \n\t"
+    "sub.s      %[f8],        %[f8],        %[f2]                       \n\t"
+    "sub.s      %[f2],        %[f1],        %[f4]                       \n\t"
+    "add.s      %[f1],        %[f1],        %[f4]                       \n\t"
+    "add.s      %[f4],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f6],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f3],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "swc1       %[f7],        0(%[a])                                   \n\t"
+    "swc1       %[f8],        16(%[a])                                  \n\t"
+    "swc1       %[f2],        28(%[a])                                  \n\t"
+    "swc1       %[f1],        12(%[a])                                  \n\t"
+    "swc1       %[f4],        4(%[a])                                   \n\t"
+    "swc1       %[f6],        20(%[a])                                  \n\t"
+    "swc1       %[f3],        8(%[a])                                   \n\t"
+    "swc1       %[f0],        24(%[a])                                  \n\t"
+    // second 8
+    "lwc1       %[f0],        32(%[a])                                  \n\t"
+    "lwc1       %[f1],        36(%[a])                                  \n\t"
+    "lwc1       %[f2],        40(%[a])                                  \n\t"
+    "lwc1       %[f3],        44(%[a])                                  \n\t"
+    "lwc1       %[f4],        48(%[a])                                  \n\t"
+    "lwc1       %[f5],        52(%[a])                                  \n\t"
+    "lwc1       %[f6],        56(%[a])                                  \n\t"
+    "lwc1       %[f7],        60(%[a])                                  \n\t"
+    "add.s      %[f8],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "add.s      %[f7],        %[f4],        %[f1]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f1]                       \n\t"
+    "add.s      %[f1],        %[f3],        %[f8]                       \n\t"
+    "sub.s      %[f3],        %[f3],        %[f8]                       \n\t"
+    "sub.s      %[f8],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f5],        %[f6],        %[f2]                       \n\t"
+    "sub.s      %[f6],        %[f2],        %[f6]                       \n\t"
+    "lwc1       %[f9],        8(%[rdft_w])                              \n\t"
+    "sub.s      %[f2],        %[f8],        %[f7]                       \n\t"
+    "add.s      %[f8],        %[f8],        %[f7]                       \n\t"
+    "sub.s      %[f7],        %[f4],        %[f0]                       \n\t"
+    "add.s      %[f4],        %[f4],        %[f0]                       \n\t"
+    // prepare for loop
+    "addiu      %[a_ptr],     %[a],         64                          \n\t"
+    "addiu      %[p1_rdft],   %[rdft_w],    8                           \n\t"
+    "addiu      %[p2_rdft],   %[rdft_w],    16                          \n\t"
+    "addiu      %[count],     $zero,        7                           \n\t"
+    // finish second 8
+    "mul.s      %[f2],        %[f9],        %[f2]                       \n\t"
+    "mul.s      %[f8],        %[f9],        %[f8]                       \n\t"
+    "mul.s      %[f7],        %[f9],        %[f7]                       \n\t"
+    "mul.s      %[f4],        %[f9],        %[f4]                       \n\t"
+    "swc1       %[f1],        32(%[a])                                  \n\t"
+    "swc1       %[f3],        52(%[a])                                  \n\t"
+    "swc1       %[f5],        36(%[a])                                  \n\t"
+    "swc1       %[f6],        48(%[a])                                  \n\t"
+    "swc1       %[f2],        40(%[a])                                  \n\t"
+    "swc1       %[f8],        44(%[a])                                  \n\t"
+    "swc1       %[f7],        56(%[a])                                  \n\t"
+    "swc1       %[f4],        60(%[a])                                  \n\t"
+    // loop
+   "1:                                                                  \n\t"
+    "lwc1       %[f0],        0(%[a_ptr])                               \n\t"
+    "lwc1       %[f1],        4(%[a_ptr])                               \n\t"
+    "lwc1       %[f2],        8(%[a_ptr])                               \n\t"
+    "lwc1       %[f3],        12(%[a_ptr])                              \n\t"
+    "lwc1       %[f4],        16(%[a_ptr])                              \n\t"
+    "lwc1       %[f5],        20(%[a_ptr])                              \n\t"
+    "lwc1       %[f6],        24(%[a_ptr])                              \n\t"
+    "lwc1       %[f7],        28(%[a_ptr])                              \n\t"
+    "add.s      %[f8],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "lwc1       %[f10],       4(%[p1_rdft])                             \n\t"
+    "lwc1       %[f11],       0(%[p2_rdft])                             \n\t"
+    "lwc1       %[f12],       4(%[p2_rdft])                             \n\t"
+    "lwc1       %[f13],       8(%[first])                               \n\t"
+    "lwc1       %[f14],       12(%[first])                              \n\t"
+    "add.s      %[f7],        %[f8],        %[f2]                       \n\t"
+    "sub.s      %[f8],        %[f8],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f6],        %[f6],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f0],        %[f5]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f5],        %[f1],        %[f4]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f4]                       \n\t"
+    "swc1       %[f7],        0(%[a_ptr])                               \n\t"
+    "swc1       %[f2],        4(%[a_ptr])                               \n\t"
+    "mul.s      %[f4],        %[f9],        %[f8]                       \n\t"
+#if defined(MIPS32_R2_LE)
+    "mul.s      %[f8],        %[f10],       %[f8]                       \n\t"
+    "mul.s      %[f7],        %[f11],       %[f0]                       \n\t"
+    "mul.s      %[f0],        %[f12],       %[f0]                       \n\t"
+    "mul.s      %[f2],        %[f13],       %[f3]                       \n\t"
+    "mul.s      %[f3],        %[f14],       %[f3]                       \n\t"
+    "nmsub.s    %[f4],        %[f4],        %[f10],       %[f6]         \n\t"
+    "madd.s     %[f8],        %[f8],        %[f9],        %[f6]         \n\t"
+    "nmsub.s    %[f7],        %[f7],        %[f12],       %[f5]         \n\t"
+    "madd.s     %[f0],        %[f0],        %[f11],       %[f5]         \n\t"
+    "nmsub.s    %[f2],        %[f2],        %[f14],       %[f1]         \n\t"
+    "madd.s     %[f3],        %[f3],        %[f13],       %[f1]         \n\t"
+#else
+    "mul.s      %[f7],        %[f10],       %[f6]                       \n\t"
+    "mul.s      %[f6],        %[f9],        %[f6]                       \n\t"
+    "mul.s      %[f8],        %[f10],       %[f8]                       \n\t"
+    "mul.s      %[f2],        %[f11],       %[f0]                       \n\t"
+    "mul.s      %[f11],       %[f11],       %[f5]                       \n\t"
+    "mul.s      %[f5],        %[f12],       %[f5]                       \n\t"
+    "mul.s      %[f0],        %[f12],       %[f0]                       \n\t"
+    "mul.s      %[f12],       %[f13],       %[f3]                       \n\t"
+    "mul.s      %[f13],       %[f13],       %[f1]                       \n\t"
+    "mul.s      %[f1],        %[f14],       %[f1]                       \n\t"
+    "mul.s      %[f3],        %[f14],       %[f3]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f7]                       \n\t"
+    "add.s      %[f8],        %[f6],        %[f8]                       \n\t"
+    "sub.s      %[f7],        %[f2],        %[f5]                       \n\t"
+    "add.s      %[f0],        %[f11],       %[f0]                       \n\t"
+    "sub.s      %[f2],        %[f12],       %[f1]                       \n\t"
+    "add.s      %[f3],        %[f13],       %[f3]                       \n\t"
+#endif
+    "swc1       %[f4],        16(%[a_ptr])                              \n\t"
+    "swc1       %[f8],        20(%[a_ptr])                              \n\t"
+    "swc1       %[f7],        8(%[a_ptr])                               \n\t"
+    "swc1       %[f0],        12(%[a_ptr])                              \n\t"
+    "swc1       %[f2],        24(%[a_ptr])                              \n\t"
+    "swc1       %[f3],        28(%[a_ptr])                              \n\t"
+    "lwc1       %[f0],        32(%[a_ptr])                              \n\t"
+    "lwc1       %[f1],        36(%[a_ptr])                              \n\t"
+    "lwc1       %[f2],        40(%[a_ptr])                              \n\t"
+    "lwc1       %[f3],        44(%[a_ptr])                              \n\t"
+    "lwc1       %[f4],        48(%[a_ptr])                              \n\t"
+    "lwc1       %[f5],        52(%[a_ptr])                              \n\t"
+    "lwc1       %[f6],        56(%[a_ptr])                              \n\t"
+    "lwc1       %[f7],        60(%[a_ptr])                              \n\t"
+    "add.s      %[f8],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "lwc1       %[f11],       8(%[p2_rdft])                             \n\t"
+    "lwc1       %[f12],       12(%[p2_rdft])                            \n\t"
+    "lwc1       %[f13],       8(%[second])                              \n\t"
+    "lwc1       %[f14],       12(%[second])                             \n\t"
+    "add.s      %[f7],        %[f8],        %[f2]                       \n\t"
+    "sub.s      %[f8],        %[f2],        %[f8]                       \n\t"
+    "add.s      %[f2],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f6],        %[f3],        %[f6]                       \n\t"
+    "add.s      %[f3],        %[f0],        %[f5]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f5],        %[f1],        %[f4]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f4]                       \n\t"
+    "swc1       %[f7],        32(%[a_ptr])                              \n\t"
+    "swc1       %[f2],        36(%[a_ptr])                              \n\t"
+    "mul.s      %[f4],        %[f10],       %[f8]                       \n\t"
+#if defined(MIPS32_R2_LE)
+    "mul.s      %[f10],       %[f10],       %[f6]                       \n\t"
+    "mul.s      %[f7],        %[f11],       %[f0]                       \n\t"
+    "mul.s      %[f11],       %[f11],       %[f5]                       \n\t"
+    "mul.s      %[f2],        %[f13],       %[f3]                       \n\t"
+    "mul.s      %[f13],       %[f13],       %[f1]                       \n\t"
+    "madd.s     %[f4],        %[f4],        %[f9],        %[f6]         \n\t"
+    "nmsub.s    %[f10],       %[f10],       %[f9],        %[f8]         \n\t"
+    "nmsub.s    %[f7],        %[f7],        %[f12],       %[f5]         \n\t"
+    "madd.s     %[f11],       %[f11],       %[f12],       %[f0]         \n\t"
+    "nmsub.s    %[f2],        %[f2],        %[f14],       %[f1]         \n\t"
+    "madd.s     %[f13],       %[f13],       %[f14],       %[f3]         \n\t"
+#else
+    "mul.s      %[f2],        %[f9],        %[f6]                       \n\t"
+    "mul.s      %[f10],       %[f10],       %[f6]                       \n\t"
+    "mul.s      %[f9],        %[f9],        %[f8]                       \n\t"
+    "mul.s      %[f7],        %[f11],       %[f0]                       \n\t"
+    "mul.s      %[f8],        %[f12],       %[f5]                       \n\t"
+    "mul.s      %[f11],       %[f11],       %[f5]                       \n\t"
+    "mul.s      %[f12],       %[f12],       %[f0]                       \n\t"
+    "mul.s      %[f5],        %[f13],       %[f3]                       \n\t"
+    "mul.s      %[f0],        %[f14],       %[f1]                       \n\t"
+    "mul.s      %[f13],       %[f13],       %[f1]                       \n\t"
+    "mul.s      %[f14],       %[f14],       %[f3]                       \n\t"
+    "add.s      %[f4],        %[f4],        %[f2]                       \n\t"
+    "sub.s      %[f10],       %[f10],       %[f9]                       \n\t"
+    "sub.s      %[f7],        %[f7],        %[f8]                       \n\t"
+    "add.s      %[f11],       %[f11],       %[f12]                      \n\t"
+    "sub.s      %[f2],        %[f5],        %[f0]                       \n\t"
+    "add.s      %[f13],       %[f13],       %[f14]                      \n\t"
+#endif
+    "swc1       %[f4],        48(%[a_ptr])                              \n\t"
+    "swc1       %[f10],       52(%[a_ptr])                              \n\t"
+    "swc1       %[f7],        40(%[a_ptr])                              \n\t"
+    "swc1       %[f11],       44(%[a_ptr])                              \n\t"
+    "swc1       %[f2],        56(%[a_ptr])                              \n\t"
+    "swc1       %[f13],       60(%[a_ptr])                              \n\t"
+    "addiu      %[count],     %[count],     -1                          \n\t"
+    "lwc1       %[f9],        8(%[p1_rdft])                             \n\t"
+    "addiu      %[a_ptr],     %[a_ptr],     64                          \n\t"
+    "addiu      %[p1_rdft],   %[p1_rdft],   8                           \n\t"
+    "addiu      %[p2_rdft],   %[p2_rdft],   16                          \n\t"
+    "addiu      %[first],     %[first],     8                           \n\t"
+    "bgtz       %[count],     1b                                        \n\t"
+    " addiu     %[second],    %[second],    8                           \n\t"
+    ".set       pop                                                     \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
+      [f12] "=&f" (f12), [f13] "=&f" (f13), [f14] "=&f" (f14),
+      [a_ptr] "=&r" (a_ptr), [p1_rdft] "=&r" (p1_rdft), [first] "+r" (first),
+      [p2_rdft] "=&r" (p2_rdft), [count] "=&r" (count), [second] "+r" (second)
+    : [a] "r" (a), [rdft_w] "r" (rdft_w)
+    : "memory"
+  );
+}
+
+static void cftmdl_128_mips(float* a) {  // In-place add/sub/multiply pass over a[0..127] using MIPS FPU inline asm; each of the four asm blocks below handles one 32-float quarter of a.
+  float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14;  // f0-f8: scratch FP values inside the asm; f9-f14: coefficients read from rdft_w / rdft_wk3ri_* (tables defined outside this view).
+  int tmp_a, count;  // tmp_a: address cursor into a, kept in an int (NOTE(review): relies on pointers fitting in int); count: per-block loop counter.
+  __asm __volatile (  // Quarter 0, a[0..31]: sums and differences only, no coefficient multiplies.
+    ".set       push                                      \n\t"
+    ".set       noreorder                                 \n\t"  // keep the instruction order as written; the slot after each bgtz is filled by hand
+    "addiu      %[tmp_a],   %[a],         0               \n\t"  // tmp_a = a
+    "addiu      %[count],   $zero,        4               \n\t"  // 4 iterations, each consuming floats at tmp_a + {0,1,8,9,16,17,24,25}
+   "1:                                                    \n\t"
+    "addiu      %[count],   %[count],     -1              \n\t"
+    "lwc1       %[f0],      0(%[tmp_a])                   \n\t"
+    "lwc1       %[f2],      32(%[tmp_a])                  \n\t"
+    "lwc1       %[f4],      64(%[tmp_a])                  \n\t"
+    "lwc1       %[f6],      96(%[tmp_a])                  \n\t"
+    "lwc1       %[f1],      4(%[tmp_a])                   \n\t"
+    "lwc1       %[f3],      36(%[tmp_a])                  \n\t"
+    "lwc1       %[f5],      68(%[tmp_a])                  \n\t"
+    "lwc1       %[f7],      100(%[tmp_a])                 \n\t"
+    "add.s      %[f8],      %[f0],        %[f2]           \n\t"  // first stage: pairwise sums/differences of the loaded values
+    "sub.s      %[f0],      %[f0],        %[f2]           \n\t"
+    "add.s      %[f2],      %[f4],        %[f6]           \n\t"
+    "sub.s      %[f4],      %[f4],        %[f6]           \n\t"
+    "add.s      %[f6],      %[f1],        %[f3]           \n\t"
+    "sub.s      %[f1],      %[f1],        %[f3]           \n\t"
+    "add.s      %[f3],      %[f5],        %[f7]           \n\t"
+    "sub.s      %[f5],      %[f5],        %[f7]           \n\t"
+    "add.s      %[f7],      %[f8],        %[f2]           \n\t"  // second stage: combine the first-stage results into the eight outputs
+    "sub.s      %[f8],      %[f8],        %[f2]           \n\t"
+    "add.s      %[f2],      %[f1],        %[f4]           \n\t"
+    "sub.s      %[f1],      %[f1],        %[f4]           \n\t"
+    "add.s      %[f4],      %[f6],        %[f3]           \n\t"
+    "sub.s      %[f6],      %[f6],        %[f3]           \n\t"
+    "sub.s      %[f3],      %[f0],        %[f5]           \n\t"
+    "add.s      %[f0],      %[f0],        %[f5]           \n\t"
+    "swc1       %[f7],      0(%[tmp_a])                   \n\t"  // results overwrite the same eight slots that were loaded
+    "swc1       %[f8],      64(%[tmp_a])                  \n\t"
+    "swc1       %[f2],      36(%[tmp_a])                  \n\t"
+    "swc1       %[f1],      100(%[tmp_a])                 \n\t"
+    "swc1       %[f4],      4(%[tmp_a])                   \n\t"
+    "swc1       %[f6],      68(%[tmp_a])                  \n\t"
+    "swc1       %[f3],      32(%[tmp_a])                  \n\t"
+    "swc1       %[f0],      96(%[tmp_a])                  \n\t"
+    "bgtz       %[count],   1b                            \n\t"  // loop while count > 0
+    " addiu     %[tmp_a],   %[tmp_a],     8               \n\t"  // branch delay slot: advance by two floats; runs on every iteration, taken or not
+    ".set       pop                                       \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),  // "=&": early-clobber outputs, so none may share a register with an input
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a)
+    : "memory"  // the asm reads and writes a[] through tmp_a
+  );
+  f9 = rdft_w[2];  // scale factor for quarter 1; also passed unchanged to quarters 2 and 3
+  __asm __volatile (  // Quarter 1, a[32..63]: sums/differences, with four of the eight outputs scaled by f9.
+    ".set       push                                      \n\t"
+    ".set       noreorder                                 \n\t"
+    "addiu      %[tmp_a],   %[a],         128             \n\t"  // tmp_a = a + 32 floats (128 bytes)
+    "addiu      %[count],   $zero,        4               \n\t"
+   "1:                                                    \n\t"
+    "addiu      %[count],   %[count],     -1              \n\t"
+    "lwc1       %[f0],      0(%[tmp_a])                   \n\t"
+    "lwc1       %[f2],      32(%[tmp_a])                  \n\t"
+    "lwc1       %[f5],      68(%[tmp_a])                  \n\t"
+    "lwc1       %[f7],      100(%[tmp_a])                 \n\t"
+    "lwc1       %[f1],      4(%[tmp_a])                   \n\t"
+    "lwc1       %[f3],      36(%[tmp_a])                  \n\t"
+    "lwc1       %[f4],      64(%[tmp_a])                  \n\t"
+    "lwc1       %[f6],      96(%[tmp_a])                  \n\t"
+    "sub.s      %[f8],      %[f0],        %[f2]           \n\t"
+    "add.s      %[f0],      %[f0],        %[f2]           \n\t"
+    "sub.s      %[f2],      %[f5],        %[f7]           \n\t"
+    "add.s      %[f5],      %[f5],        %[f7]           \n\t"
+    "sub.s      %[f7],      %[f1],        %[f3]           \n\t"
+    "add.s      %[f1],      %[f1],        %[f3]           \n\t"
+    "sub.s      %[f3],      %[f4],        %[f6]           \n\t"
+    "add.s      %[f4],      %[f4],        %[f6]           \n\t"
+    "sub.s      %[f6],      %[f8],        %[f2]           \n\t"
+    "add.s      %[f8],      %[f8],        %[f2]           \n\t"
+    "add.s      %[f2],      %[f5],        %[f1]           \n\t"
+    "sub.s      %[f5],      %[f5],        %[f1]           \n\t"
+    "add.s      %[f1],      %[f3],        %[f7]           \n\t"
+    "sub.s      %[f3],      %[f3],        %[f7]           \n\t"
+    "add.s      %[f7],      %[f0],        %[f4]           \n\t"
+    "sub.s      %[f0],      %[f0],        %[f4]           \n\t"
+    "sub.s      %[f4],      %[f6],        %[f1]           \n\t"
+    "add.s      %[f6],      %[f6],        %[f1]           \n\t"
+    "sub.s      %[f1],      %[f3],        %[f8]           \n\t"
+    "add.s      %[f3],      %[f3],        %[f8]           \n\t"
+    "mul.s      %[f4],      %[f4],        %[f9]           \n\t"  // the values stored at offsets 32, 36, 96 and 100 are scaled by f9 (rdft_w[2])
+    "mul.s      %[f6],      %[f6],        %[f9]           \n\t"
+    "mul.s      %[f1],      %[f1],        %[f9]           \n\t"
+    "mul.s      %[f3],      %[f3],        %[f9]           \n\t"
+    "swc1       %[f7],      0(%[tmp_a])                   \n\t"
+    "swc1       %[f2],      4(%[tmp_a])                   \n\t"
+    "swc1       %[f5],      64(%[tmp_a])                  \n\t"
+    "swc1       %[f0],      68(%[tmp_a])                  \n\t"
+    "swc1       %[f4],      32(%[tmp_a])                  \n\t"
+    "swc1       %[f6],      36(%[tmp_a])                  \n\t"
+    "swc1       %[f1],      96(%[tmp_a])                  \n\t"
+    "swc1       %[f3],      100(%[tmp_a])                 \n\t"
+    "bgtz       %[count],   1b                            \n\t"
+    " addiu     %[tmp_a],   %[tmp_a],     8               \n\t"  // branch delay slot: advance by two floats
+    ".set       pop                                       \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a), [f9] "f" (f9)  // f9 is input-only; the asm never writes it
+    : "memory"
+  );
+  f10 = rdft_w[3];  // f10-f14: further coefficients for quarter 2; f9 keeps rdft_w[2]
+  f11 = rdft_w[4];
+  f12 = rdft_w[5];
+  f13 = rdft_wk3ri_first[2];
+  f14 = rdft_wk3ri_first[3];
+
+  __asm __volatile (  // Quarter 2, a[64..95]: sums/differences, then three coefficient-pair multiplies using (f9,f10), (f11,f12), (f13,f14).
+    ".set       push                                                    \n\t"
+    ".set       noreorder                                               \n\t"
+    "addiu      %[tmp_a],     %[a],         256                         \n\t"  // tmp_a = a + 64 floats (256 bytes)
+    "addiu      %[count],     $zero,        4                           \n\t"
+   "1:                                                                  \n\t"
+    "addiu      %[count],     %[count],     -1                          \n\t"
+    "lwc1       %[f0],        0(%[tmp_a])                               \n\t"
+    "lwc1       %[f2],        32(%[tmp_a])                              \n\t"
+    "lwc1       %[f4],        64(%[tmp_a])                              \n\t"
+    "lwc1       %[f6],        96(%[tmp_a])                              \n\t"
+    "lwc1       %[f1],        4(%[tmp_a])                               \n\t"
+    "lwc1       %[f3],        36(%[tmp_a])                              \n\t"
+    "lwc1       %[f5],        68(%[tmp_a])                              \n\t"
+    "lwc1       %[f7],        100(%[tmp_a])                             \n\t"
+    "add.s      %[f8],        %[f0],        %[f2]                       \n\t"
+    "sub.s      %[f0],        %[f0],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f4],        %[f6]                       \n\t"
+    "sub.s      %[f4],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f6],        %[f1],        %[f3]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f3]                       \n\t"
+    "add.s      %[f3],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f7]                       \n\t"
+    "sub.s      %[f7],        %[f8],        %[f2]                       \n\t"
+    "add.s      %[f8],        %[f8],        %[f2]                       \n\t"
+    "add.s      %[f2],        %[f1],        %[f4]                       \n\t"
+    "sub.s      %[f1],        %[f1],        %[f4]                       \n\t"
+    "sub.s      %[f4],        %[f6],        %[f3]                       \n\t"
+    "add.s      %[f6],        %[f6],        %[f3]                       \n\t"
+    "sub.s      %[f3],        %[f0],        %[f5]                       \n\t"
+    "add.s      %[f0],        %[f0],        %[f5]                       \n\t"
+    "swc1       %[f8],        0(%[tmp_a])                               \n\t"  // the two plain sums are stored unscaled
+    "swc1       %[f6],        4(%[tmp_a])                               \n\t"
+    "mul.s      %[f5],        %[f9],        %[f7]                       \n\t"  // first product, shared by both preprocessor branches below
+#if defined(MIPS32_R2_LE)  // variant using madd.s/nmsub.s; the macro is defined outside this view
+    "mul.s      %[f7],        %[f10],       %[f7]                       \n\t"
+    "mul.s      %[f8],        %[f11],       %[f3]                       \n\t"
+    "mul.s      %[f3],        %[f12],       %[f3]                       \n\t"
+    "mul.s      %[f6],        %[f13],       %[f0]                       \n\t"
+    "mul.s      %[f0],        %[f14],       %[f0]                       \n\t"
+    "nmsub.s    %[f5],        %[f5],        %[f10],       %[f4]         \n\t"
+    "madd.s     %[f7],        %[f7],        %[f9],        %[f4]         \n\t"
+    "nmsub.s    %[f8],        %[f8],        %[f12],       %[f2]         \n\t"
+    "madd.s     %[f3],        %[f3],        %[f11],       %[f2]         \n\t"
+    "nmsub.s    %[f6],        %[f6],        %[f14],       %[f1]         \n\t"
+    "madd.s     %[f0],        %[f0],        %[f13],       %[f1]         \n\t"
+    "swc1       %[f5],        64(%[tmp_a])                              \n\t"
+    "swc1       %[f7],        68(%[tmp_a])                              \n\t"
+#else  // fallback: the same expressions written with separate mul.s and add.s/sub.s
+    "mul.s      %[f8],        %[f10],       %[f4]                       \n\t"
+    "mul.s      %[f4],        %[f9],        %[f4]                       \n\t"
+    "mul.s      %[f7],        %[f10],       %[f7]                       \n\t"
+    "mul.s      %[f6],        %[f11],       %[f3]                       \n\t"
+    "mul.s      %[f3],        %[f12],       %[f3]                       \n\t"
+    "sub.s      %[f5],        %[f5],        %[f8]                       \n\t"
+    "mul.s      %[f8],        %[f12],       %[f2]                       \n\t"
+    "mul.s      %[f2],        %[f11],       %[f2]                       \n\t"
+    "add.s      %[f7],        %[f4],        %[f7]                       \n\t"
+    "mul.s      %[f4],        %[f13],       %[f0]                       \n\t"
+    "mul.s      %[f0],        %[f14],       %[f0]                       \n\t"
+    "sub.s      %[f8],        %[f6],        %[f8]                       \n\t"
+    "mul.s      %[f6],        %[f14],       %[f1]                       \n\t"
+    "mul.s      %[f1],        %[f13],       %[f1]                       \n\t"
+    "add.s      %[f3],        %[f2],        %[f3]                       \n\t"
+    "swc1       %[f5],        64(%[tmp_a])                              \n\t"
+    "swc1       %[f7],        68(%[tmp_a])                              \n\t"
+    "sub.s      %[f6],        %[f4],        %[f6]                       \n\t"
+    "add.s      %[f0],        %[f1],        %[f0]                       \n\t"
+#endif  // MIPS32_R2_LE
+    "swc1       %[f8],        32(%[tmp_a])                              \n\t"  // remaining four stores are common to both branches
+    "swc1       %[f3],        36(%[tmp_a])                              \n\t"
+    "swc1       %[f6],        96(%[tmp_a])                              \n\t"
+    "swc1       %[f0],        100(%[tmp_a])                             \n\t"
+    "bgtz       %[count],     1b                                        \n\t"
+    " addiu     %[tmp_a],     %[tmp_a],     8                           \n\t"  // branch delay slot: advance by two floats
+    ".set       pop                                                     \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a),  [f9] "f" (f9), [f10] "f" (f10), [f11] "f" (f11),  // f9-f14 are input-only in both branches
+      [f12] "f" (f12), [f13] "f" (f13), [f14] "f" (f14)
+    : "memory"
+  );
+  f11 = rdft_w[6];  // quarter 3 replaces f11-f14 only; f9 and f10 keep rdft_w[2] and rdft_w[3]
+  f12 = rdft_w[7];
+  f13 = rdft_wk3ri_second[2];
+  f14 = rdft_wk3ri_second[3];
+  __asm __volatile (  // Quarter 3, a[96..127]: same structure as quarter 2, with different operand pairings and signs in the multiply step.
+    ".set       push                                                       \n\t"
+    ".set       noreorder                                                  \n\t"
+    "addiu      %[tmp_a],       %[a],           384                        \n\t"  // tmp_a = a + 96 floats (384 bytes)
+    "addiu      %[count],       $zero,          4                          \n\t"
+   "1:                                                                     \n\t"
+    "addiu      %[count],       %[count],       -1                         \n\t"
+    "lwc1       %[f0],          0(%[tmp_a])                                \n\t"
+    "lwc1       %[f1],          4(%[tmp_a])                                \n\t"
+    "lwc1       %[f2],          32(%[tmp_a])                               \n\t"
+    "lwc1       %[f3],          36(%[tmp_a])                               \n\t"
+    "lwc1       %[f4],          64(%[tmp_a])                               \n\t"
+    "lwc1       %[f5],          68(%[tmp_a])                               \n\t"
+    "lwc1       %[f6],          96(%[tmp_a])                               \n\t"
+    "lwc1       %[f7],          100(%[tmp_a])                              \n\t"
+    "add.s      %[f8],          %[f0],          %[f2]                      \n\t"
+    "sub.s      %[f0],          %[f0],          %[f2]                      \n\t"
+    "add.s      %[f2],          %[f4],          %[f6]                      \n\t"
+    "sub.s      %[f4],          %[f4],          %[f6]                      \n\t"
+    "add.s      %[f6],          %[f1],          %[f3]                      \n\t"
+    "sub.s      %[f1],          %[f1],          %[f3]                      \n\t"
+    "add.s      %[f3],          %[f5],          %[f7]                      \n\t"
+    "sub.s      %[f5],          %[f5],          %[f7]                      \n\t"
+    "sub.s      %[f7],          %[f2],          %[f8]                      \n\t"
+    "add.s      %[f2],          %[f2],          %[f8]                      \n\t"
+    "add.s      %[f8],          %[f1],          %[f4]                      \n\t"
+    "sub.s      %[f1],          %[f1],          %[f4]                      \n\t"
+    "sub.s      %[f4],          %[f3],          %[f6]                      \n\t"
+    "add.s      %[f3],          %[f3],          %[f6]                      \n\t"
+    "sub.s      %[f6],          %[f0],          %[f5]                      \n\t"
+    "add.s      %[f0],          %[f0],          %[f5]                      \n\t"
+    "swc1       %[f2],          0(%[tmp_a])                                \n\t"  // the two plain sums are stored unscaled
+    "swc1       %[f3],          4(%[tmp_a])                                \n\t"
+    "mul.s      %[f5],          %[f10],         %[f7]                      \n\t"  // first product, shared by both preprocessor branches below
+#if defined(MIPS32_R2_LE)  // variant using madd.s/msub.s; the macro is defined outside this view
+    "mul.s      %[f7],          %[f9],          %[f7]                      \n\t"
+    "mul.s      %[f2],          %[f12],         %[f8]                      \n\t"
+    "mul.s      %[f8],          %[f11],         %[f8]                      \n\t"
+    "mul.s      %[f3],          %[f14],         %[f1]                      \n\t"
+    "mul.s      %[f1],          %[f13],         %[f1]                      \n\t"
+    "madd.s     %[f5],          %[f5],          %[f9],       %[f4]         \n\t"
+    "msub.s     %[f7],          %[f7],          %[f10],      %[f4]         \n\t"
+    "msub.s     %[f2],          %[f2],          %[f11],      %[f6]         \n\t"
+    "madd.s     %[f8],          %[f8],          %[f12],      %[f6]         \n\t"
+    "msub.s     %[f3],          %[f3],          %[f13],      %[f0]         \n\t"
+    "madd.s     %[f1],          %[f1],          %[f14],      %[f0]         \n\t"
+    "swc1       %[f5],          64(%[tmp_a])                               \n\t"
+    "swc1       %[f7],          68(%[tmp_a])                               \n\t"
+#else  // fallback: the same expressions written with separate mul.s and add.s/sub.s
+    "mul.s      %[f2],          %[f9],          %[f4]                      \n\t"
+    "mul.s      %[f4],          %[f10],         %[f4]                      \n\t"
+    "mul.s      %[f7],          %[f9],          %[f7]                      \n\t"
+    "mul.s      %[f3],          %[f11],         %[f6]                      \n\t"
+    "mul.s      %[f6],          %[f12],         %[f6]                      \n\t"
+    "add.s      %[f5],          %[f5],          %[f2]                      \n\t"
+    "sub.s      %[f7],          %[f4],          %[f7]                      \n\t"
+    "mul.s      %[f2],          %[f12],         %[f8]                      \n\t"
+    "mul.s      %[f8],          %[f11],         %[f8]                      \n\t"
+    "mul.s      %[f4],          %[f14],         %[f1]                      \n\t"
+    "mul.s      %[f1],          %[f13],         %[f1]                      \n\t"
+    "sub.s      %[f2],          %[f3],          %[f2]                      \n\t"
+    "mul.s      %[f3],          %[f13],         %[f0]                      \n\t"
+    "mul.s      %[f0],          %[f14],         %[f0]                      \n\t"
+    "add.s      %[f8],          %[f8],          %[f6]                      \n\t"
+    "swc1       %[f5],          64(%[tmp_a])                               \n\t"
+    "swc1       %[f7],          68(%[tmp_a])                               \n\t"
+    "sub.s      %[f3],          %[f3],          %[f4]                      \n\t"
+    "add.s      %[f1],          %[f1],          %[f0]                      \n\t"
+#endif  // MIPS32_R2_LE
+    "swc1       %[f2],          32(%[tmp_a])                               \n\t"  // remaining four stores are common to both branches
+    "swc1       %[f8],          36(%[tmp_a])                               \n\t"
+    "swc1       %[f3],          96(%[tmp_a])                               \n\t"
+    "swc1       %[f1],          100(%[tmp_a])                              \n\t"
+    "bgtz       %[count],       1b                                         \n\t"
+    " addiu     %[tmp_a],       %[tmp_a],       8                          \n\t"  // branch delay slot: advance by two floats
+    ".set       pop                                                        \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a), [f9] "f" (f9), [f10] "f" (f10), [f11] "f" (f11),  // f9-f14 are input-only in both branches
+      [f12] "f" (f12), [f13] "f" (f13), [f14] "f" (f14)
+    : "memory"
+  );
+}
+
+// Runs cft1st_128() and cftmdl_128() on |a| and then applies one more
+// butterfly pass, written in MIPS assembly, over the buffer.
+//
+// The assembly loop executes 16 times, advancing tmp_a by 8 bytes (one pair
+// of adjacent floats, written (r, i) below) per iteration. For
+// j = 0, 2, ..., 30 it reads the pairs
+//   A0 = a[j], A1 = a[j + 32], A2 = a[j + 64], A3 = a[j + 96]
+// (byte offsets 0, 128, 256 and 384 from tmp_a), forms component-wise
+//   x0 = A0 + A1,  x1 = A0 - A1,  x2 = A2 + A3,  x3 = A2 - A3
+// and stores in place:
+//   a[j]      = x0r + x2r      a[j + 1]  = x0i + x2i
+//   a[j + 32] = x1r - x3i      a[j + 33] = x1i + x3r
+//   a[j + 64] = x0r - x2r      a[j + 65] = x0i - x2i
+//   a[j + 96] = x1r + x3i      a[j + 97] = x1i - x3r
+//
+// NOTE(review): tmp_a carries a copy of the pointer |a| in an int, which
+// presumably relies on 32-bit pointers -- confirm for the targeted ABI.
+static void cftfsub_128_mips(float* a) {
+  float f0, f1, f2, f3, f4, f5, f6, f7, f8;
+  int tmp_a, count;
+
+  cft1st_128(a);
+  cftmdl_128(a);
+
+  __asm __volatile (
+    ".set       push                                      \n\t"
+    ".set       noreorder                                 \n\t"
+    // tmp_a = a; count = 16.
+    "addiu      %[tmp_a],       %[a],         0           \n\t"
+    "addiu      %[count],       $zero,        16          \n\t"
+   "1:                                                    \n\t"
+    "addiu      %[count],       %[count],     -1          \n\t"
+    // Load A0..A3: first components into f0/f2/f4/f6, second components into
+    // f1/f3/f5/f7.
+    "lwc1       %[f0],          0(%[tmp_a])               \n\t"
+    "lwc1       %[f2],          128(%[tmp_a])             \n\t"
+    "lwc1       %[f4],          256(%[tmp_a])             \n\t"
+    "lwc1       %[f6],          384(%[tmp_a])             \n\t"
+    "lwc1       %[f1],          4(%[tmp_a])               \n\t"
+    "lwc1       %[f3],          132(%[tmp_a])             \n\t"
+    "lwc1       %[f5],          260(%[tmp_a])             \n\t"
+    "lwc1       %[f7],          388(%[tmp_a])             \n\t"
+    // f8 = x0r, f0 = x1r, f2 = x2r, f4 = x3r,
+    // f6 = x0i, f1 = x1i, f3 = x2i, f5 = x3i.
+    "add.s      %[f8],          %[f0],        %[f2]       \n\t"
+    "sub.s      %[f0],          %[f0],        %[f2]       \n\t"
+    "add.s      %[f2],          %[f4],        %[f6]       \n\t"
+    "sub.s      %[f4],          %[f4],        %[f6]       \n\t"
+    "add.s      %[f6],          %[f1],        %[f3]       \n\t"
+    "sub.s      %[f1],          %[f1],        %[f3]       \n\t"
+    "add.s      %[f3],          %[f5],        %[f7]       \n\t"
+    "sub.s      %[f5],          %[f5],        %[f7]       \n\t"
+    // Combine the first-level terms into the eight output values.
+    "add.s      %[f7],          %[f8],        %[f2]       \n\t"
+    "sub.s      %[f8],          %[f8],        %[f2]       \n\t"
+    "add.s      %[f2],          %[f1],        %[f4]       \n\t"
+    "sub.s      %[f1],          %[f1],        %[f4]       \n\t"
+    "add.s      %[f4],          %[f6],        %[f3]       \n\t"
+    "sub.s      %[f6],          %[f6],        %[f3]       \n\t"
+    "sub.s      %[f3],          %[f0],        %[f5]       \n\t"
+    "add.s      %[f0],          %[f0],        %[f5]       \n\t"
+    // Store the results over the inputs.
+    "swc1       %[f7],          0(%[tmp_a])               \n\t"
+    "swc1       %[f8],          256(%[tmp_a])             \n\t"
+    "swc1       %[f2],          132(%[tmp_a])             \n\t"
+    "swc1       %[f1],          388(%[tmp_a])             \n\t"
+    "swc1       %[f4],          4(%[tmp_a])               \n\t"
+    "swc1       %[f6],          260(%[tmp_a])             \n\t"
+    "swc1       %[f3],          128(%[tmp_a])             \n\t"
+    "swc1       %[f0],          384(%[tmp_a])             \n\t"
+    // Loop while count > 0. With .set noreorder the pointer increment below
+    // occupies the branch delay slot, so it also runs on the final pass.
+    "bgtz       %[count],       1b                        \n\t"
+    " addiu     %[tmp_a],       %[tmp_a],   8             \n\t"
+    ".set       pop                                       \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a),
+      [count] "=&r" (count)
+    : [a] "r" (a)
+    : "memory"
+  );
+}
+
+// Runs cft1st_128() and cftmdl_128() on |a| and then applies one more
+// butterfly pass, written in MIPS assembly, over the buffer. It differs from
+// cftfsub_128_mips() only in the signs applied to the second component of
+// each float pair.
+//
+// The assembly loop executes 16 times, advancing tmp_a by 8 bytes (one pair
+// of adjacent floats, written (r, i) below) per iteration. For
+// j = 0, 2, ..., 30 it reads the pairs
+//   A0 = a[j], A1 = a[j + 32], A2 = a[j + 64], A3 = a[j + 96]
+// (byte offsets 0, 128, 256 and 384 from tmp_a), forms
+//   x0r = A0r + A1r,  x1r = A0r - A1r,  x2r = A2r + A3r,  x3r = A2r - A3r,
+//   s0i = A0i + A1i,  d0i = A1i - A0i,  x2i = A2i + A3i,  x3i = A2i - A3i
+// and stores in place:
+//   a[j]      = x0r + x2r      a[j + 1]  = -(x2i + s0i)
+//   a[j + 32] = x1r - x3i      a[j + 33] = d0i - x3r
+//   a[j + 64] = x0r - x2r      a[j + 65] = x2i - s0i
+//   a[j + 96] = x1r + x3i      a[j + 97] = d0i + x3r
+//
+// NOTE(review): tmp_a carries a copy of the pointer |a| in an int, which
+// presumably relies on 32-bit pointers -- confirm for the targeted ABI.
+static void cftbsub_128_mips(float* a) {
+  float f0, f1, f2, f3, f4, f5, f6, f7, f8;
+  int tmp_a, count;
+
+  cft1st_128(a);
+  cftmdl_128(a);
+
+  __asm __volatile (
+    ".set       push                                        \n\t"
+    ".set       noreorder                                   \n\t"
+    // tmp_a = a; count = 16.
+    "addiu      %[tmp_a],   %[a],           0               \n\t"
+    "addiu      %[count],   $zero,          16              \n\t"
+   "1:                                                      \n\t"
+    "addiu      %[count],   %[count],       -1              \n\t"
+    // Load A0..A3: first components into f0/f2/f4/f6, second components into
+    // f1/f3/f5/f7.
+    "lwc1       %[f0],      0(%[tmp_a])                     \n\t"
+    "lwc1       %[f2],      128(%[tmp_a])                   \n\t"
+    "lwc1       %[f4],      256(%[tmp_a])                   \n\t"
+    "lwc1       %[f6],      384(%[tmp_a])                   \n\t"
+    "lwc1       %[f1],      4(%[tmp_a])                     \n\t"
+    "lwc1       %[f3],      132(%[tmp_a])                   \n\t"
+    "lwc1       %[f5],      260(%[tmp_a])                   \n\t"
+    "lwc1       %[f7],      388(%[tmp_a])                   \n\t"
+    // f8 = x0r, f0 = x1r, f2 = x2r, f4 = x3r,
+    // f6 = s0i, f1 = d0i (operands reversed: A1i - A0i), f3 = x2i, f5 = x3i.
+    "add.s      %[f8],      %[f0],          %[f2]           \n\t"
+    "sub.s      %[f0],      %[f0],          %[f2]           \n\t"
+    "add.s      %[f2],      %[f4],          %[f6]           \n\t"
+    "sub.s      %[f4],      %[f4],          %[f6]           \n\t"
+    "add.s      %[f6],      %[f1],          %[f3]           \n\t"
+    "sub.s      %[f1],      %[f3],          %[f1]           \n\t"
+    "add.s      %[f3],      %[f5],          %[f7]           \n\t"
+    "sub.s      %[f5],      %[f5],          %[f7]           \n\t"
+    // Combine the first-level terms into the eight output values.
+    "add.s      %[f7],      %[f8],          %[f2]           \n\t"
+    "sub.s      %[f8],      %[f8],          %[f2]           \n\t"
+    "sub.s      %[f2],      %[f1],          %[f4]           \n\t"
+    "add.s      %[f1],      %[f1],          %[f4]           \n\t"
+    "add.s      %[f4],      %[f3],          %[f6]           \n\t"
+    "sub.s      %[f6],      %[f3],          %[f6]           \n\t"
+    "sub.s      %[f3],      %[f0],          %[f5]           \n\t"
+    "add.s      %[f0],      %[f0],          %[f5]           \n\t"
+    // a[j + 1] receives the negated sum of all four second components.
+    "neg.s      %[f4],      %[f4]                           \n\t"
+    // Store the results over the inputs.
+    "swc1       %[f7],      0(%[tmp_a])                     \n\t"
+    "swc1       %[f8],      256(%[tmp_a])                   \n\t"
+    "swc1       %[f2],      132(%[tmp_a])                   \n\t"
+    "swc1       %[f1],      388(%[tmp_a])                   \n\t"
+    "swc1       %[f6],      260(%[tmp_a])                   \n\t"
+    "swc1       %[f3],      128(%[tmp_a])                   \n\t"
+    "swc1       %[f0],      384(%[tmp_a])                   \n\t"
+    "swc1       %[f4],       4(%[tmp_a])                     \n\t"
+    // Loop while count > 0. With .set noreorder the pointer increment below
+    // occupies the branch delay slot, so it also runs on the final pass.
+    "bgtz       %[count],   1b                              \n\t"
+    " addiu     %[tmp_a],   %[tmp_a],       8               \n\t"
+    ".set       pop                                         \n\t"
+    : [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3),
+      [f4] "=&f" (f4), [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7),
+      [f8] "=&f" (f8), [tmp_a] "=&r" (tmp_a), [count] "=&r" (count)
+    : [a] "r" (a)
+    : "memory"
+  );
+}
+
+// MIPS assembly routine that aec_rdft_init_mips() installs as rftfsub_128.
+//
+// With c = rdft_w + 32, it walks 31 pairs of positions in |a|: a1 starts at
+// &a[2] and moves up two floats per pair, a2 starts at &a[126] and moves
+// down two floats per pair, c1 starts at &c[1] and moves up, c2 starts at
+// &c[31] and moves down. For every pair it computes
+//   wkr = 0.5f - c2[0];          wki = c1[0];
+//   xr  = a1[0] - a2[0];         xi  = a1[1] + a2[1];
+//   yr  = wkr * xr - wki * xi;   yi  = wkr * xi + wki * xr;
+// and updates in place
+//   a1[0] -= yr;  a1[1] -= yi;  a2[0] += yr;  a2[1] -= yi;
+//
+// The first pair is handled before the loop; the loop then runs 15 times
+// and handles two pairs per iteration, with the loads and arithmetic of the
+// second pair interleaved with those of the first.
+static void rftfsub_128_mips(float* a) {
+  const float* c = rdft_w + 32;
+  const float f0 = 0.5f;
+  float* a1 = &a[2];
+  float* a2 = &a[126];
+  const float* c1 = &c[1];
+  const float* c2 = &c[31];
+  float f1, f2, f3 ,f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
+  int count;
+
+  __asm __volatile (
+    ".set      push                                             \n\t"
+    ".set      noreorder                                        \n\t"
+    // ---- First pair, outside the loop. ----
+    // f6 = c2[0], f1 = a1[0], f2 = a2[0], f3 = a1[1], f4 = a2[1], f5 = wki.
+    "lwc1      %[f6],       0(%[c2])                            \n\t"
+    "lwc1      %[f1],       0(%[a1])                            \n\t"
+    "lwc1      %[f2],       0(%[a2])                            \n\t"
+    "lwc1      %[f3],       4(%[a1])                            \n\t"
+    "lwc1      %[f4],       4(%[a2])                            \n\t"
+    "lwc1      %[f5],       0(%[c1])                            \n\t"
+    // f6 = wkr, f7 = xr, f8 = xi; count = 15 loop iterations.
+    "sub.s     %[f6],       %[f0],        %[f6]                 \n\t"
+    "sub.s     %[f7],       %[f1],        %[f2]                 \n\t"
+    "add.s     %[f8],       %[f3],        %[f4]                 \n\t"
+    "addiu     %[count],    $zero,        15                    \n\t"
+    // f9 = wkr * xr, f6 = wkr * xi; the #if below finishes them into
+    // f9 = yr and f6 = yi.
+    "mul.s     %[f9],       %[f6],        %[f7]                 \n\t"
+    "mul.s     %[f6],       %[f6],        %[f8]                 \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f8],       %[f5],        %[f8]                 \n\t"
+    "mul.s     %[f5],       %[f5],        %[f7]                 \n\t"
+    "sub.s     %[f9],       %[f9],        %[f8]                 \n\t"
+    "add.s     %[f6],       %[f6],        %[f5]                 \n\t"
+#else
+    // Single-instruction forms of the same two expressions:
+    // f9 = f9 - f5 * f8 and f6 = f6 + f5 * f7.
+    "nmsub.s   %[f9],       %[f9],        %[f5],      %[f8]     \n\t"
+    "madd.s    %[f6],       %[f6],        %[f5],      %[f7]     \n\t"
+#endif
+    // a1[0] - yr, a2[0] + yr, a1[1] - yi, a2[1] - yi.
+    "sub.s     %[f1],       %[f1],        %[f9]                 \n\t"
+    "add.s     %[f2],       %[f2],        %[f9]                 \n\t"
+    "sub.s     %[f3],       %[f3],        %[f6]                 \n\t"
+    "sub.s     %[f4],       %[f4],        %[f6]                 \n\t"
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "swc1      %[f3],       4(%[a1])                            \n\t"
+    "swc1      %[f4],       4(%[a2])                            \n\t"
+    // Step all four pointers to the next pair.
+    "addiu     %[a1],       %[a1],        8                     \n\t"
+    "addiu     %[a2],       %[a2],        -8                    \n\t"
+    "addiu     %[c1],       %[c1],        4                     \n\t"
+    "addiu     %[c2],       %[c2],        -4                    \n\t"
+    // ---- Main loop: two pairs (P, then Q) per iteration. ----
+   "1:                                                          \n\t"
+    // Pair P operands, same register roles as above.
+    "lwc1      %[f6],       0(%[c2])                            \n\t"
+    "lwc1      %[f1],       0(%[a1])                            \n\t"
+    "lwc1      %[f2],       0(%[a2])                            \n\t"
+    "lwc1      %[f3],       4(%[a1])                            \n\t"
+    "lwc1      %[f4],       4(%[a2])                            \n\t"
+    "lwc1      %[f5],       0(%[c1])                            \n\t"
+    "sub.s     %[f6],       %[f0],        %[f6]                 \n\t"
+    "sub.s     %[f7],       %[f1],        %[f2]                 \n\t"
+    "add.s     %[f8],       %[f3],        %[f4]                 \n\t"
+    // Pair Q operands: f10 = c2[-1], f11 = a1[2], f12 = a2[-2].
+    "lwc1      %[f10],      -4(%[c2])                           \n\t"
+    "lwc1      %[f11],      8(%[a1])                            \n\t"
+    "lwc1      %[f12],      -8(%[a2])                           \n\t"
+    "mul.s     %[f9],       %[f6],        %[f7]                 \n\t"
+    "mul.s     %[f6],       %[f6],        %[f8]                 \n\t"
+    // Both branches load f13 = a1[3], f14 = a2[-1], f15 = c1[1] for pair Q
+    // and finish f9 = yr, f6 = yi for pair P.
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f8],       %[f5],        %[f8]                 \n\t"
+    "mul.s     %[f5],       %[f5],        %[f7]                 \n\t"
+    "lwc1      %[f13],      12(%[a1])                           \n\t"
+    "lwc1      %[f14],      -4(%[a2])                           \n\t"
+    "lwc1      %[f15],      4(%[c1])                            \n\t"
+    "sub.s     %[f9],       %[f9],        %[f8]                 \n\t"
+    "add.s     %[f6],       %[f6],        %[f5]                 \n\t"
+#else
+    "lwc1      %[f13],      12(%[a1])                           \n\t"
+    "lwc1      %[f14],      -4(%[a2])                           \n\t"
+    "lwc1      %[f15],      4(%[c1])                            \n\t"
+    "nmsub.s   %[f9],       %[f9],        %[f5],      %[f8]     \n\t"
+    "madd.s    %[f6],       %[f6],        %[f5],      %[f7]     \n\t"
+#endif
+    // Pair Q: f10 = wkr, f5 = xr, f7 = xi (f5/f7 are free again here).
+    "sub.s     %[f10],      %[f0],        %[f10]                \n\t"
+    "sub.s     %[f5],       %[f11],       %[f12]                \n\t"
+    "add.s     %[f7],       %[f13],       %[f14]                \n\t"
+    // Pair P: updated a1[0], a2[0], a1[1].
+    "sub.s     %[f1],       %[f1],        %[f9]                 \n\t"
+    "add.s     %[f2],       %[f2],        %[f9]                 \n\t"
+    "sub.s     %[f3],       %[f3],        %[f6]                 \n\t"
+    // Pair Q: f8 = wkr * xr, f10 = wkr * xi; the #if below finishes them
+    // into f8 = yr and f10 = yi while pair P's results are completed and
+    // partly stored.
+    "mul.s     %[f8],       %[f10],       %[f5]                 \n\t"
+    "mul.s     %[f10],      %[f10],       %[f7]                 \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f9],       %[f15],       %[f7]                 \n\t"
+    "mul.s     %[f15],      %[f15],       %[f5]                 \n\t"
+    "sub.s     %[f4],       %[f4],        %[f6]                 \n\t"
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "sub.s     %[f8],       %[f8],        %[f9]                 \n\t"
+    "add.s     %[f10],      %[f10],       %[f15]                \n\t"
+#else
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "sub.s     %[f4],       %[f4],        %[f6]                 \n\t"
+    "nmsub.s   %[f8],       %[f8],        %[f15],     %[f7]     \n\t"
+    "madd.s    %[f10],      %[f10],       %[f15],     %[f5]     \n\t"
+#endif
+    "swc1      %[f3],       4(%[a1])                            \n\t"
+    "swc1      %[f4],       4(%[a2])                            \n\t"
+    // Pair Q: a1[2] - yr, a2[-2] + yr, a1[3] - yi, a2[-1] - yi.
+    "sub.s     %[f11],      %[f11],       %[f8]                 \n\t"
+    "add.s     %[f12],      %[f12],       %[f8]                 \n\t"
+    "sub.s     %[f13],      %[f13],       %[f10]                \n\t"
+    "sub.s     %[f14],      %[f14],       %[f10]                \n\t"
+    "addiu     %[c2],       %[c2],        -8                    \n\t"
+    "addiu     %[c1],       %[c1],        8                     \n\t"
+    "swc1      %[f11],      8(%[a1])                            \n\t"
+    "swc1      %[f12],      -8(%[a2])                           \n\t"
+    "swc1      %[f13],      12(%[a1])                           \n\t"
+    "swc1      %[f14],      -4(%[a2])                           \n\t"
+    "addiu     %[a1],       %[a1],        16                    \n\t"
+    "addiu     %[count],    %[count],     -1                    \n\t"
+    // Loop while count > 0. With .set noreorder the a2 update below occupies
+    // the branch delay slot, so it also runs on the final pass.
+    "bgtz      %[count],    1b                                  \n\t"
+    " addiu    %[a2],       %[a2],        -16                   \n\t"
+    ".set      pop                                              \n\t"
+    : [a1] "+r" (a1), [a2] "+r" (a2), [c1] "+r" (c1), [c2] "+r" (c2),
+      [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), [f4] "=&f" (f4),
+      [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
+      [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11), [f12] "=&f" (f12),
+      [f13] "=&f" (f13), [f14] "=&f" (f14), [f15] "=&f" (f15),
+      [count] "=&r" (count)
+    : [f0] "f" (f0)
+    : "memory"
+  );
+}
+
+// MIPS assembly routine that aec_rdft_init_mips() installs as rftbsub_128.
+//
+// It first negates a[1] and a[65] in C. Then, with c = rdft_w + 32, it walks
+// 31 pairs of positions in |a|: a1 starts at &a[2] and moves up two floats
+// per pair, a2 starts at &a[126] and moves down two floats per pair, c1
+// starts at &c[1] and moves up, c2 starts at &c[31] and moves down. For
+// every pair it computes
+//   wkr = 0.5f - c2[0];          wki = c1[0];
+//   xr  = a1[0] - a2[0];         xi  = a1[1] + a2[1];
+//   yr  = wkr * xr + wki * xi;   yi  = wkr * xi - wki * xr;
+// and updates in place
+//   a1[0] -= yr;  a1[1] = yi - a1[1];  a2[0] += yr;  a2[1] = yi - a2[1];
+//
+// The first pair is handled before the loop; the loop then runs 15 times
+// and handles two pairs per iteration, with the loads and arithmetic of the
+// second pair interleaved with those of the first.
+static void rftbsub_128_mips(float* a) {
+  const float *c = rdft_w + 32;
+  const float f0 = 0.5f;
+  float* a1 = &a[2];
+  float* a2 = &a[126];
+  const float* c1 = &c[1];
+  const float* c2 = &c[31];
+  float f1, f2, f3 ,f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
+  int count;
+
+  // These two elements are not visited by the assembly below.
+  a[1] = -a[1];
+  a[65] = -a[65];
+
+  __asm __volatile (
+    ".set      push                                             \n\t"
+    ".set      noreorder                                        \n\t"
+    // ---- First pair, outside the loop. ----
+    // f6 = c2[0], f1 = a1[0], f2 = a2[0], f3 = a1[1], f4 = a2[1], f5 = wki.
+    "lwc1      %[f6],       0(%[c2])                            \n\t"
+    "lwc1      %[f1],       0(%[a1])                            \n\t"
+    "lwc1      %[f2],       0(%[a2])                            \n\t"
+    "lwc1      %[f3],       4(%[a1])                            \n\t"
+    "lwc1      %[f4],       4(%[a2])                            \n\t"
+    "lwc1      %[f5],       0(%[c1])                            \n\t"
+    // f6 = wkr, f7 = xr, f8 = xi; count = 15 loop iterations.
+    "sub.s     %[f6],       %[f0],        %[f6]                 \n\t"
+    "sub.s     %[f7],       %[f1],        %[f2]                 \n\t"
+    "add.s     %[f8],       %[f3],        %[f4]                 \n\t"
+    "addiu     %[count],    $zero,        15                    \n\t"
+    // f9 = wkr * xr, f6 = wkr * xi; the #if below finishes them into
+    // f9 = yr and f6 = yi.
+    "mul.s     %[f9],       %[f6],        %[f7]                 \n\t"
+    "mul.s     %[f6],       %[f6],        %[f8]                 \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f8],       %[f5],        %[f8]                 \n\t"
+    "mul.s     %[f5],       %[f5],        %[f7]                 \n\t"
+    "add.s     %[f9],       %[f9],        %[f8]                 \n\t"
+    "sub.s     %[f6],       %[f6],        %[f5]                 \n\t"
+#else
+    // Single-instruction forms of the same two expressions:
+    // f9 = f9 + f5 * f8 and f6 = f6 - f5 * f7.
+    "madd.s    %[f9],       %[f9],        %[f5],      %[f8]     \n\t"
+    "nmsub.s   %[f6],       %[f6],        %[f5],      %[f7]     \n\t"
+#endif
+    // a1[0] - yr, a2[0] + yr, yi - a1[1], yi - a2[1].
+    "sub.s     %[f1],       %[f1],        %[f9]                 \n\t"
+    "add.s     %[f2],       %[f2],        %[f9]                 \n\t"
+    "sub.s     %[f3],       %[f6],        %[f3]                 \n\t"
+    "sub.s     %[f4],       %[f6],        %[f4]                 \n\t"
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "swc1      %[f3],       4(%[a1])                            \n\t"
+    "swc1      %[f4],       4(%[a2])                            \n\t"
+    // Step all four pointers to the next pair.
+    "addiu     %[a1],       %[a1],        8                     \n\t"
+    "addiu     %[a2],       %[a2],        -8                    \n\t"
+    "addiu     %[c1],       %[c1],        4                     \n\t"
+    "addiu     %[c2],       %[c2],        -4                    \n\t"
+    // ---- Main loop: two pairs (P, then Q) per iteration. ----
+   "1:                                                          \n\t"
+    // Pair P operands, same register roles as above.
+    "lwc1      %[f6],       0(%[c2])                            \n\t"
+    "lwc1      %[f1],       0(%[a1])                            \n\t"
+    "lwc1      %[f2],       0(%[a2])                            \n\t"
+    "lwc1      %[f3],       4(%[a1])                            \n\t"
+    "lwc1      %[f4],       4(%[a2])                            \n\t"
+    "lwc1      %[f5],       0(%[c1])                            \n\t"
+    "sub.s     %[f6],       %[f0],        %[f6]                 \n\t"
+    "sub.s     %[f7],       %[f1],        %[f2]                 \n\t"
+    "add.s     %[f8],       %[f3],        %[f4]                 \n\t"
+    // Pair Q operands: f10 = c2[-1], f11 = a1[2], f12 = a2[-2].
+    "lwc1      %[f10],      -4(%[c2])                           \n\t"
+    "lwc1      %[f11],      8(%[a1])                            \n\t"
+    "lwc1      %[f12],      -8(%[a2])                           \n\t"
+    "mul.s     %[f9],       %[f6],        %[f7]                 \n\t"
+    "mul.s     %[f6],       %[f6],        %[f8]                 \n\t"
+    // Both branches load f13 = a1[3], f14 = a2[-1], f15 = c1[1] for pair Q
+    // and finish f9 = yr, f6 = yi for pair P.
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f8],       %[f5],        %[f8]                 \n\t"
+    "mul.s     %[f5],       %[f5],        %[f7]                 \n\t"
+    "lwc1      %[f13],      12(%[a1])                           \n\t"
+    "lwc1      %[f14],      -4(%[a2])                           \n\t"
+    "lwc1      %[f15],      4(%[c1])                            \n\t"
+    "add.s     %[f9],       %[f9],        %[f8]                 \n\t"
+    "sub.s     %[f6],       %[f6],        %[f5]                 \n\t"
+#else
+    "lwc1      %[f13],      12(%[a1])                           \n\t"
+    "lwc1      %[f14],      -4(%[a2])                           \n\t"
+    "lwc1      %[f15],      4(%[c1])                            \n\t"
+    "madd.s    %[f9],       %[f9],        %[f5],      %[f8]     \n\t"
+    "nmsub.s   %[f6],       %[f6],        %[f5],      %[f7]     \n\t"
+#endif
+    // Pair Q: f10 = wkr, f5 = xr, f7 = xi (f5/f7 are free again here).
+    "sub.s     %[f10],      %[f0],        %[f10]                \n\t"
+    "sub.s     %[f5],       %[f11],       %[f12]                \n\t"
+    "add.s     %[f7],       %[f13],       %[f14]                \n\t"
+    // Pair P: updated a1[0], a2[0], a1[1].
+    "sub.s     %[f1],       %[f1],        %[f9]                 \n\t"
+    "add.s     %[f2],       %[f2],        %[f9]                 \n\t"
+    "sub.s     %[f3],       %[f6],        %[f3]                 \n\t"
+    // Pair Q: f8 = wkr * xr, f10 = wkr * xi; the #if below finishes them
+    // into f8 = yr and f10 = yi while pair P's results are completed and
+    // partly stored.
+    "mul.s     %[f8],       %[f10],       %[f5]                 \n\t"
+    "mul.s     %[f10],      %[f10],       %[f7]                 \n\t"
+#if !defined(MIPS32_R2_LE)
+    "mul.s     %[f9],       %[f15],       %[f7]                 \n\t"
+    "mul.s     %[f15],      %[f15],       %[f5]                 \n\t"
+    "sub.s     %[f4],       %[f6],        %[f4]                 \n\t"
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "add.s     %[f8],       %[f8],        %[f9]                 \n\t"
+    "sub.s     %[f10],      %[f10],       %[f15]                \n\t"
+#else
+    "swc1      %[f1],       0(%[a1])                            \n\t"
+    "swc1      %[f2],       0(%[a2])                            \n\t"
+    "sub.s     %[f4],       %[f6],        %[f4]                 \n\t"
+    "madd.s    %[f8],       %[f8],        %[f15],     %[f7]     \n\t"
+    "nmsub.s   %[f10],      %[f10],       %[f15],     %[f5]     \n\t"
+#endif
+    "swc1      %[f3],       4(%[a1])                            \n\t"
+    "swc1      %[f4],       4(%[a2])                            \n\t"
+    // Pair Q: a1[2] - yr, a2[-2] + yr, yi - a1[3], yi - a2[-1].
+    "sub.s     %[f11],      %[f11],       %[f8]                 \n\t"
+    "add.s     %[f12],      %[f12],       %[f8]                 \n\t"
+    "sub.s     %[f13],      %[f10],       %[f13]                \n\t"
+    "sub.s     %[f14],      %[f10],       %[f14]                \n\t"
+    "addiu     %[c2],       %[c2],        -8                    \n\t"
+    "addiu     %[c1],       %[c1],        8                     \n\t"
+    "swc1      %[f11],      8(%[a1])                            \n\t"
+    "swc1      %[f12],      -8(%[a2])                           \n\t"
+    "swc1      %[f13],      12(%[a1])                           \n\t"
+    "swc1      %[f14],      -4(%[a2])                           \n\t"
+    "addiu     %[a1],       %[a1],        16                    \n\t"
+    "addiu     %[count],    %[count],     -1                    \n\t"
+    // Loop while count > 0. With .set noreorder the a2 update below occupies
+    // the branch delay slot, so it also runs on the final pass.
+    "bgtz      %[count],    1b                                  \n\t"
+    " addiu    %[a2],       %[a2],        -16                   \n\t"
+    ".set      pop                                              \n\t"
+    : [a1] "+r" (a1), [a2] "+r" (a2), [c1] "+r" (c1), [c2] "+r" (c2),
+      [f1] "=&f" (f1), [f2] "=&f" (f2), [f3] "=&f" (f3), [f4] "=&f" (f4),
+      [f5] "=&f" (f5), [f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
+      [f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11), [f12] "=&f" (f12),
+      [f13] "=&f" (f13), [f14] "=&f" (f14), [f15] "=&f" (f15),
+      [count] "=&r" (count)
+    : [f0] "f" (f0)
+    : "memory"
+  );
+}
+
+// Redirects the aec_rdft function pointers to their *_mips implementations.
+void aec_rdft_init_mips(void) {
+  // Bit-reversal step.
+  bitrv2_128 = bitrv2_128_mips;
+
+  // cft* routines.
+  cft1st_128 = cft1st_128_mips;
+  cftmdl_128 = cftmdl_128_mips;
+  cftfsub_128 = cftfsub_128_mips;
+  cftbsub_128 = cftbsub_128_mips;
+
+  // rft* routines.
+  rftfsub_128 = rftfsub_128_mips;
+  rftbsub_128 = rftbsub_128_mips;
+}
diff --git a/webrtc/modules/audio_processing/aec/aec_rdft_neon.c b/webrtc/modules/audio_processing/aec/aec_rdft_neon.c
new file mode 100644
index 0000000000..43b6a68cd7
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_rdft_neon.c
@@ -0,0 +1,355 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * The rdft AEC algorithm, neon version of speed-critical functions.
+ *
+ * Based on the sse2 version.
+ */
+
+
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
+
+#include <arm_neon.h>
+
+// Per-lane multipliers {-1, 1, -1, 1}. The functions below multiply a
+// vrev64q_f32()-swapped vector by this constant inside vmlaq_f32() /
+// vmlsq_f32(), so the swapped operand is accumulated with alternating sign.
+static const ALIGN16_BEG float ALIGN16_END
+    k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
+
+// NEON version of cft1st_128: processes |a| in eight chunks of 16 floats.
+//
+// Each chunk holds two groups of four float pairs (written (r, i) below), at
+// a[j + 0..7] and a[j + 8..15]. Both groups go through the same butterfly at
+// once: lanes 0-1 of every vector belong to the first group and lanes 2-3 to
+// the second. The twiddle factors come from rdft_wk1r/wk1i, rdft_wk2r/wk2i
+// and rdft_wk3r/wk3i at index k2, which advances by 4 per chunk.
+static void cft1st_128_neon(float* a) {
+  // vld1q_f32() takes a pointer to const, so keep the qualifier in the cast
+  // instead of stripping it from the constant table.
+  const float32x4_t vec_swap_sign = vld1q_f32((const float32_t*)k_swap_sign);
+  int j, k2;
+
+  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
+    float32x4_t a00v = vld1q_f32(&a[j + 0]);
+    float32x4_t a04v = vld1q_f32(&a[j + 4]);
+    float32x4_t a08v = vld1q_f32(&a[j + 8]);
+    float32x4_t a12v = vld1q_f32(&a[j + 12]);
+    // Regroup so that pair n of both groups shares one vector:
+    //   a01v = {a[j+0], a[j+1], a[j+8],  a[j+9]}
+    //   a23v = {a[j+2], a[j+3], a[j+10], a[j+11]}
+    //   a45v = {a[j+4], a[j+5], a[j+12], a[j+13]}
+    //   a67v = {a[j+6], a[j+7], a[j+14], a[j+15]}
+    float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
+    float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
+    float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
+    float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
+    const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
+    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
+    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
+    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
+    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
+    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
+    // x0 = p0 + p1, x1 = p0 - p1, x2 = p2 + p3, x3 = p2 - p3.
+    float32x4_t x0v = vaddq_f32(a01v, a23v);
+    const float32x4_t x1v = vsubq_f32(a01v, a23v);
+    const float32x4_t x2v = vaddq_f32(a45v, a67v);
+    const float32x4_t x3v = vsubq_f32(a45v, a67v);
+    // x3 with the two floats of each pair swapped: (x3i, x3r).
+    const float32x4_t x3w = vrev64q_f32(x3v);
+    float32x4_t x0w;
+    // Output pair 0: x0 + x2 (no twiddle).
+    a01v = vaddq_f32(x0v, x2v);
+    // Output pair 2: (x0 - x2) * wk2r + swapped(x0 - x2) * wk2i.
+    x0v = vsubq_f32(x0v, x2v);
+    x0w = vrev64q_f32(x0v);
+    a45v = vmulq_f32(wk2rv, x0v);
+    a45v = vmlaq_f32(a45v, wk2iv, x0w);
+    // Output pair 1: t = (x1r - x3i, x1i + x3r), then
+    // t * wk1r + swapped(t) * wk1i.
+    x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
+    x0w = vrev64q_f32(x0v);
+    a23v = vmulq_f32(wk1rv, x0v);
+    a23v = vmlaq_f32(a23v, wk1iv, x0w);
+    // Output pair 3: t = (x1r + x3i, x1i - x3r), then
+    // t * wk3r + swapped(t) * wk3i.
+    x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
+    x0w = vrev64q_f32(x0v);
+    a67v = vmulq_f32(wk3rv, x0v);
+    a67v = vmlaq_f32(a67v, wk3iv, x0w);
+    // Undo the regrouping and write the chunk back in place.
+    a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
+    a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
+    a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
+    a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
+    vst1q_f32(&a[j + 0], a00v);
+    vst1q_f32(&a[j + 4], a04v);
+    vst1q_f32(&a[j + 8], a08v);
+    vst1q_f32(&a[j + 12], a12v);
+  }
+}
+
+// NEON version of cftmdl_128. It works on float pairs (written (r, i) below)
+// and runs two loops of four iterations each:
+//   * j = 0, 2, 4, 6 covers a[0..63];
+//   * j = 64, 66, 68, 70 covers a[64..127].
+// In both loops one iteration handles two butterflies at once: lanes 0-1 of
+// every vector use the pairs at j, j + 8, j + 16, j + 24 and lanes 2-3 use
+// the pairs 32 floats further on (j + 32, j + 40, j + 48, j + 56).
+static void cftmdl_128_neon(float* a) {
+  int j;
+  const int l = 8;
+  // vld1q_f32() takes a pointer to const, so keep the qualifier in the cast
+  // instead of stripping it from the constant table.
+  const float32x4_t vec_swap_sign = vld1q_f32((const float32_t*)k_swap_sign);
+  float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);
+
+  // First half. The lane 0-1 butterfly needs no multiplication; the lane 2-3
+  // butterfly is partly scaled by cftmdl_wk1r.
+  for (j = 0; j < l; j += 2) {
+    const float32x2_t a_00 = vld1_f32(&a[j + 0]);
+    const float32x2_t a_08 = vld1_f32(&a[j + 8]);
+    const float32x2_t a_32 = vld1_f32(&a[j + 32]);
+    const float32x2_t a_40 = vld1_f32(&a[j + 40]);
+    const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
+    const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
+    // x0 = p0 + p1 and x1 = p0 - p1 for both butterflies.
+    const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
+    const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
+    const float32x2_t a_16 = vld1_f32(&a[j + 16]);
+    const float32x2_t a_24 = vld1_f32(&a[j + 24]);
+    const float32x2_t a_48 = vld1_f32(&a[j + 48]);
+    const float32x2_t a_56 = vld1_f32(&a[j + 56]);
+    const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
+    const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
+    // x2 = p2 + p3 and x3 = p2 - p3 for both butterflies.
+    const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
+    const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
+    const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+    const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+    const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
+    // x1_x3_add = (x1r - x3i, x1i + x3r), x1_x3_sub = (x1r + x3i, x1i - x3r).
+    const float32x4_t x1_x3_add =
+        vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+    const float32x4_t x1_x3_sub =
+        vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+    // From the lane 2-3 results build
+    //   yy0 = {add.r - add.i, add.r + add.i, sub.r - sub.i, sub.r + sub.i}
+    // and scale it lane-wise by cftmdl_wk1r.
+    const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
+    const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
+    const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
+    const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
+    const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
+    const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
+    const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
+    const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
+    const float32x4_t xx1_rev = vrev64q_f32(xx1);
+    const float32x4_t yy4_rev = vrev64q_f32(yy4);
+
+    vst1_f32(&a[j + 0], vget_low_f32(xx0));
+    vst1_f32(&a[j + 32], vget_high_f32(xx0));
+    vst1_f32(&a[j + 16], vget_low_f32(xx1));
+    // The lane 2-3 difference is stored with its two floats swapped ...
+    vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));
+
+    // ... and the first of them is then negated.
+    a[j + 48] = -a[j + 48];
+
+    vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
+    vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
+    vst1_f32(&a[j + 40], vget_low_f32(yy4));
+    vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
+  }
+
+  // Second half. Every output except the plain sum is multiplied by twiddle
+  // factors loaded from the rdft_wk* tables at index k2 = 4.
+  {
+    const int k = 64;
+    const int k1 = 2;
+    const int k2 = 2 * k1;
+    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
+    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
+    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
+    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
+    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
+    wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
+    for (j = k; j < l + k; j += 2) {
+      const float32x2_t a_00 = vld1_f32(&a[j + 0]);
+      const float32x2_t a_08 = vld1_f32(&a[j + 8]);
+      const float32x2_t a_32 = vld1_f32(&a[j + 32]);
+      const float32x2_t a_40 = vld1_f32(&a[j + 40]);
+      const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
+      const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
+      const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
+      const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
+      const float32x2_t a_16 = vld1_f32(&a[j + 16]);
+      const float32x2_t a_24 = vld1_f32(&a[j + 24]);
+      const float32x2_t a_48 = vld1_f32(&a[j + 48]);
+      const float32x2_t a_56 = vld1_f32(&a[j + 56]);
+      const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
+      const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
+      const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
+      const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
+      const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
+      const float32x4_t x1_x3_add =
+          vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+      const float32x4_t x1_x3_sub =
+          vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+      // Each product has the form v * wkNr + swapped(v) * wkNi.
+      float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
+      float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
+      float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
+      xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
+      xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
+      xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));
+
+      vst1_f32(&a[j + 0], vget_low_f32(xx));
+      vst1_f32(&a[j + 32], vget_high_f32(xx));
+      vst1_f32(&a[j + 16], vget_low_f32(xx4));
+      vst1_f32(&a[j + 48], vget_high_f32(xx4));
+      vst1_f32(&a[j + 8], vget_low_f32(xx12));
+      vst1_f32(&a[j + 40], vget_high_f32(xx12));
+      vst1_f32(&a[j + 24], vget_low_f32(xx22));
+      vst1_f32(&a[j + 56], vget_high_f32(xx22));
+    }
+  }
+}
+
+// Returns |in| with its four lanes in reverse order: A B C D -> D C B A.
+__inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
+  const float32x2_t upper = vget_high_f32(in);  // C D
+  const float32x2_t lower = vget_low_f32(in);   // A B
+  // Exchange the halves (C D A B), then swap the lanes inside each half.
+  return vrev64q_f32(vcombine_f32(upper, lower));
+}
+
+// NEON version of rftfsub_128. With c = rdft_w + 32, for every j2 = 2, 4,
+// ..., 62 (j1 = j2 / 2, k2 = 128 - j2, k1 = 32 - j1) it applies
+//   wkr = 0.5f - c[k1];               wki = c[j1];
+//   xr  = a[j2] - a[k2];              xi  = a[j2 + 1] + a[k2 + 1];
+//   yr  = wkr * xr - wki * xi;        yi  = wkr * xi + wki * xr;
+//   a[j2] -= yr;  a[j2 + 1] -= yi;    a[k2] += yr;  a[k2 + 1] -= yi;
+// The first loop handles four consecutive j2 values per iteration (seven
+// iterations, j2 = 2..56); the scalar loop handles the remaining three.
+static void rftfsub_128_neon(float* a) {
+  const float* c = rdft_w + 32;
+  int j1, j2;
+  const float32x4_t mm_half = vdupq_n_f32(0.5f);
+
+  // Vectorized code (four at once).
+  // Note: the numbers in the comments are the indexes used by the first
+  // iteration of the loop.
+  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
+    // Load 'wk'.
+    const float32x4_t c_j1 = vld1q_f32(&c[j1]);          //  1,  2,  3,  4,
+    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);     // 28, 29, 30, 31,
+    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
+    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
+    const float32x4_t wki_ = c_j1;                       //  1,  2,  3,  4,
+    // Load and shuffle 'a'. vld2q_f32() de-interleaves, so val[0] holds the
+    // even-indexed floats and val[1] the odd-indexed ones.
+    //   2,   4,   6,   8,   3,   5,   7,   9
+    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
+    // 120, 122, 124, 126, 121, 123, 125, 127,
+    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
+    // 126, 124, 122, 120
+    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
+    // 127, 125, 123, 121
+    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
+    // Calculate 'x'.
+    // xr: a[2]-a[126], a[4]-a[124], a[6]-a[122], a[8]-a[120],
+    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
+    // xi: a[3]+a[127], a[5]+a[125], a[7]+a[123], a[9]+a[121],
+    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
+    // Calculate product into 'y'.
+    //    yr = wkr * xr - wki * xi;
+    //    yi = wkr * xi + wki * xr;
+    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
+    const float32x4_t b_ = vmulq_f32(wki_, xi_);
+    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
+    const float32x4_t d_ = vmulq_f32(wki_, xr_);
+    // yr for index pairs (2,126), (4,124), (6,122), (8,120);
+    // yi for index pairs (3,127), (5,125), (7,123), (9,121).
+    const float32x4_t yr_ = vsubq_f32(a_, b_);
+    const float32x4_t yi_ = vaddq_f32(c_, d_);
+    // Update 'a'.
+    //    a[j2 + 0] -= yr;
+    //    a[j2 + 1] -= yi;
+    //    a[k2 + 0] += yr;
+    //    a[k2 + 1] -= yi;
+    // 126, 124, 122, 120,
+    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
+    // 127, 125, 123, 121,
+    const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_);
+    // Shuffle in right order and store.
+    // 124, 126, 120, 122 and 125, 127, 121, 123
+    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
+    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
+    // val[0] = 124, 125, 126, 127; val[1] = 120, 121, 122, 123
+    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
+    //   2,   4,   6,   8,
+    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
+    //   3,   5,   7,   9,
+    a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_);
+    // vst2q_f32() re-interleaves: 2, 3, 4, 5, 6, 7, 8, 9,
+    vst2q_f32(&a[0 + j2], a_j2_p);
+
+    // 120..123, then 124..127.
+    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
+    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
+  }
+
+  // Scalar code for the remaining items.
+  for (; j2 < 64; j1 += 1, j2 += 2) {
+    const int k2 = 128 - j2;
+    const int k1 = 32 - j1;
+    const float wkr = 0.5f - c[k1];
+    const float wki = c[j1];
+    const float xr = a[j2 + 0] - a[k2 + 0];
+    const float xi = a[j2 + 1] + a[k2 + 1];
+    const float yr = wkr * xr - wki * xi;
+    const float yi = wkr * xi + wki * xr;
+    a[j2 + 0] -= yr;
+    a[j2 + 1] -= yi;
+    a[k2 + 0] += yr;
+    a[k2 + 1] -= yi;
+  }
+}
+
+static void rftbsub_128_neon(float* a) {  // Updates a[] (128 floats) in place.
+  const float* c = rdft_w + 32;
+  int j1, j2;
+  const float32x4_t mm_half = vdupq_n_f32(0.5f);
+
+  a[1] = -a[1];
+  // Vectorized code (four (j2, k2) index pairs per iteration).
+  //    Note: the commented numbers are the indexes for the first iteration.
+  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
+    // Load 'wk'.
+    const float32x4_t c_j1 = vld1q_f32(&c[j1]);         //  1,  2,  3,  4,
+    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);    // 28, 29, 30, 31,
+    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);  // 28, 29, 30, 31,
+    const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,
+    const float32x4_t wki_ = c_j1;                      //  1,  2,  3,  4,
+    // Load and shuffle 'a' (vld2q de-interleaves even and odd indexes).
+    //   2,   4,   6,   8,   3,   5,   7,   9
+    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
+    // 120, 122, 124, 126, 121, 123, 125, 127,
+    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
+    // 126, 124, 122, 120
+    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
+    // 127, 125, 123, 121
+    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
+    // Calculate 'x'.
+    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
+    // 2-126, 4-124, 6-122, 8-120,
+    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
+    // 3+127, 5+125, 7+123, 9+121,
+    // Calculate product into 'y'.
+    //    yr = wkr * xr + wki * xi;
+    //    yi = wkr * xi - wki * xr;
+    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
+    const float32x4_t b_ = vmulq_f32(wki_, xi_);
+    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
+    const float32x4_t d_ = vmulq_f32(wki_, xr_);
+    const float32x4_t yr_ = vaddq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
+    const float32x4_t yi_ = vsubq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
+    // Update 'a'.
+    //    a[j2 + 0] = a[j2 + 0] - yr;
+    //    a[j2 + 1] = yi - a[j2 + 1];
+    //    a[k2 + 0] = yr + a[k2 + 0];
+    //    a[k2 + 1] = yi - a[k2 + 1];
+    // 126, 124, 122, 120,
+    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
+    // 127, 125, 123, 121,
+    const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1);
+    // Shuffle in right order and store.
+    //   vrev64q_f32 swaps the two lanes inside each 64-bit half.
+    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);  // 124, 126, 120, 122
+    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);  // 125, 127, 121, 123
+    // 124, 125, 126, 127, 120, 121, 122, 123
+    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
+    //   2,   4,   6,   8,
+    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
+    //   3,   5,   7,   9,
+    a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]);
+    //   2,   3,   4,   5,   6,   7,   8,   9,
+    vst2q_f32(&a[0 + j2], a_j2_p);
+    // val[1] holds 120..123 and val[0] holds 124..127 (first iteration).
+    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
+    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
+  }
+
+  // Scalar code for the remaining items (j2 = 58, 60, 62).
+  for (; j2 < 64; j1 += 1, j2 += 2) {
+    const int k2 = 128 - j2;
+    const int k1 = 32 - j1;
+    const float wkr = 0.5f - c[k1];
+    const float wki = c[j1];
+    const float xr = a[j2 + 0] - a[k2 + 0];
+    const float xi = a[j2 + 1] + a[k2 + 1];
+    const float yr = wkr * xr + wki * xi;
+    const float yi = wkr * xi - wki * xr;
+    a[j2 + 0] = a[j2 + 0] - yr;
+    a[j2 + 1] = yi - a[j2 + 1];
+    a[k2 + 0] = yr + a[k2 + 0];
+    a[k2 + 1] = yi - a[k2 + 1];
+  }
+  a[65] = -a[65];  // Index 64 is its own mirror (128 - 64); only a[65] flips.
+}
+
+void aec_rdft_init_neon(void) {  // Points the four rdft hooks at NEON code.
+  cft1st_128 = cft1st_128_neon;
+  cftmdl_128 = cftmdl_128_neon;
+  rftfsub_128 = rftfsub_128_neon;
+  rftbsub_128 = rftbsub_128_neon;
+}
+
diff --git a/webrtc/modules/audio_processing/aec/aec_rdft_sse2.c b/webrtc/modules/audio_processing/aec/aec_rdft_sse2.c
new file mode 100644
index 0000000000..b4e453ff53
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_rdft_sse2.c
@@ -0,0 +1,427 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
+
+#include <emmintrin.h>
+
+static const ALIGN16_BEG float ALIGN16_END
+    k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};  // Multiplier: negates lanes 0, 2.
+
+static void cft1st_128_SSE2(float* a) {  // In-place pass over a[0..127].
+  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
+  int j, k2;
+  // Each iteration handles 16 floats of 'a' and 4 entries of every wk table.
+  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
+    __m128 a00v = _mm_loadu_ps(&a[j + 0]);
+    __m128 a04v = _mm_loadu_ps(&a[j + 4]);
+    __m128 a08v = _mm_loadu_ps(&a[j + 8]);
+    __m128 a12v = _mm_loadu_ps(&a[j + 12]);
+    __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
+    __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
+    __m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
+    __m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));
+    // Lanes (+j): a01v=0,1,8,9 a23v=2,3,10,11 a45v=4,5,12,13 a67v=6,7,14,15.
+    const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
+    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
+    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
+    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
+    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
+    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
+    __m128 x0v = _mm_add_ps(a01v, a23v);
+    const __m128 x1v = _mm_sub_ps(a01v, a23v);
+    const __m128 x2v = _mm_add_ps(a45v, a67v);
+    const __m128 x3v = _mm_sub_ps(a45v, a67v);
+    __m128 x0w;
+    a01v = _mm_add_ps(x0v, x2v);
+    x0v = _mm_sub_ps(x0v, x2v);
+    x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));  // lanes 1,0,3,2
+    {
+      const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
+      const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
+      a45v = _mm_add_ps(a45_0v, a45_1v);
+    }
+    {
+      __m128 a23_0v, a23_1v;
+      const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
+      const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
+      x0v = _mm_add_ps(x1v, x3s);
+      x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
+      a23_0v = _mm_mul_ps(wk1rv, x0v);
+      a23_1v = _mm_mul_ps(wk1iv, x0w);
+      a23v = _mm_add_ps(a23_0v, a23_1v);
+
+      x0v = _mm_sub_ps(x1v, x3s);  // Reused by the wk3 block that follows.
+      x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
+    }
+    {
+      const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
+      const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
+      a67v = _mm_add_ps(a67_0v, a67_1v);
+    }
+    // Regroup into memory order and overwrite the 16 inputs.
+    a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
+    a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
+    a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
+    a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
+    _mm_storeu_ps(&a[j + 0], a00v);
+    _mm_storeu_ps(&a[j + 4], a04v);
+    _mm_storeu_ps(&a[j + 8], a08v);
+    _mm_storeu_ps(&a[j + 12], a12v);
+  }
+}
+
+static void cftmdl_128_SSE2(float* a) {  // In-place pass over a[0..127].
+  const int l = 8;  // Loop extent: j0 covers l floats in steps of 2.
+  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
+  int j0;
+  // First block (j0 in [0, 8)): only the cftmdl_wk1r constants are applied.
+  __m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
+  for (j0 = 0; j0 < l; j0 += 2) {
+    const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
+    const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
+    const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
+    const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
+    const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
+                                          _mm_castsi128_ps(a_32),
+                                          _MM_SHUFFLE(1, 0, 1, 0));
+    const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
+                                          _mm_castsi128_ps(a_40),
+                                          _MM_SHUFFLE(1, 0, 1, 0));
+    __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
+    const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
+
+    const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
+    const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
+    const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
+    const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
+    const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
+                                          _mm_castsi128_ps(a_48),
+                                          _MM_SHUFFLE(1, 0, 1, 0));
+    const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
+                                          _mm_castsi128_ps(a_56),
+                                          _MM_SHUFFLE(1, 0, 1, 0));
+    const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
+    const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
+
+    const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+    const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+
+    const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
+        _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
+    const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
+    const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+    const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+    // yy3 lanes: add[2]-add[3], add[2]+add[3], sub[2]-sub[3], sub[2]+sub[3].
+    const __m128 yy0 =
+        _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
+    const __m128 yy1 =
+        _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
+    const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
+    const __m128 yy3 = _mm_add_ps(yy0, yy2);
+    const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);
+    // Each 64-bit store below writes two adjacent floats of 'a'.
+    _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
+    _mm_storel_epi64(
+        (__m128i*)&a[j0 + 32],
+        _mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
+    _mm_storel_epi64(
+        (__m128i*)&a[j0 + 48],
+        _mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
+    a[j0 + 48] = -a[j0 + 48];  // Stored as xx1[3], xx1[2]; negate the first.
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
+    _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));
+
+    _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
+    _mm_storel_epi64(
+        (__m128i*)&a[j0 + 56],
+        _mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
+  }
+  // Second block (j0 in [64, 72)): full wk1/wk2/wk3 multiplies are applied.
+  {
+    int k = 64;  // Float offset of the second block within 'a'.
+    int k1 = 2;
+    int k2 = 2 * k1;  // == 4: index into the rdft_wk* tables.
+    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
+    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
+    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
+    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
+    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
+    wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
+    for (j0 = k; j0 < l + k; j0 += 2) {
+      const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
+      const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
+      const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
+      const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
+      const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
+                                            _mm_castsi128_ps(a_32),
+                                            _MM_SHUFFLE(1, 0, 1, 0));
+      const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
+                                            _mm_castsi128_ps(a_40),
+                                            _MM_SHUFFLE(1, 0, 1, 0));
+      __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
+      const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
+
+      const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
+      const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
+      const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
+      const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
+      const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
+                                            _mm_castsi128_ps(a_48),
+                                            _MM_SHUFFLE(1, 0, 1, 0));
+      const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
+                                            _mm_castsi128_ps(a_56),
+                                            _MM_SHUFFLE(1, 0, 1, 0));
+      const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
+      const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
+
+      const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
+      const __m128 xx3 =
+          _mm_mul_ps(wk2iv,
+                     _mm_castsi128_ps(_mm_shuffle_epi32(
+                         _mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx4 = _mm_add_ps(xx2, xx3);
+
+      const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
+          _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
+      const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
+      const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+      const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
+
+      const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
+      const __m128 xx11 = _mm_mul_ps(
+          wk1iv,
+          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
+                                             _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx12 = _mm_add_ps(xx10, xx11);
+
+      const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
+      const __m128 xx21 = _mm_mul_ps(
+          wk3iv,
+          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
+                                             _MM_SHUFFLE(2, 3, 0, 1))));
+      const __m128 xx22 = _mm_add_ps(xx20, xx21);
+      // Low halves go to the first four slots, high halves 32 floats later.
+      _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
+      _mm_storel_epi64(
+          (__m128i*)&a[j0 + 32],
+          _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
+      _mm_storel_epi64(
+          (__m128i*)&a[j0 + 48],
+          _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
+      _mm_storel_epi64(
+          (__m128i*)&a[j0 + 40],
+          _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));
+
+      _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
+      _mm_storel_epi64(
+          (__m128i*)&a[j0 + 56],
+          _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
+    }
+  }
+}
+
+static void rftfsub_128_SSE2(float* a) {  // Updates a[] (128 floats) in place.
+  const float* c = rdft_w + 32;
+  int j1, j2, k1, k2;
+  float wkr, wki, xr, xi, yr, yi;
+
+  static const ALIGN16_BEG float ALIGN16_END
+      k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
+  const __m128 mm_half = _mm_load_ps(k_half);
+
+  // Vectorized code (four (j2, k2) index pairs per iteration).
+  //    Note: the commented numbers are the indexes for the first iteration.
+  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
+    // Load 'wk'.
+    const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
+    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
+    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
+    const __m128 wkr_ =
+        _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
+    const __m128 wki_ = c_j1;                                 //  1,  2,  3,  4,
+    // Load and shuffle 'a'.
+    const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
+    const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
+    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
+    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
+    const __m128 a_j2_p0 = _mm_shuffle_ps(
+        a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
+    const __m128 a_j2_p1 = _mm_shuffle_ps(
+        a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
+    const __m128 a_k2_p0 = _mm_shuffle_ps(
+        a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
+    const __m128 a_k2_p1 = _mm_shuffle_ps(
+        a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
+    // Calculate 'x'.
+    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
+    // 2-126, 4-124, 6-122, 8-120,
+    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
+    // 3+127, 5+125, 7+123, 9+121,
+    // Calculate product into 'y'.
+    //    yr = wkr * xr - wki * xi;
+    //    yi = wkr * xi + wki * xr;
+    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
+    const __m128 b_ = _mm_mul_ps(wki_, xi_);
+    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
+    const __m128 d_ = _mm_mul_ps(wki_, xr_);
+    const __m128 yr_ = _mm_sub_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
+    const __m128 yi_ = _mm_add_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
+    // Update 'a'.
+    //    a[j2 + 0] -= yr;
+    //    a[j2 + 1] -= yi;
+    //    a[k2 + 0] += yr;
+    //    a[k2 + 1] -= yi;
+    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
+    const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_);  //   3,   5,   7,   9,
+    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
+    const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_);  // 127, 125, 123, 121,
+    // Shuffle in right order and store.
+    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
+    //   2,   3,   4,   5,
+    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
+    //   6,   7,   8,   9,
+    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
+    // 122, 123, 120, 121,
+    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
+    // 126, 127, 124, 125,
+    const __m128 a_k2_0n = _mm_shuffle_ps(
+        a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
+    const __m128 a_k2_4n = _mm_shuffle_ps(
+        a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
+    _mm_storeu_ps(&a[0 + j2], a_j2_0n);
+    _mm_storeu_ps(&a[4 + j2], a_j2_4n);
+    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
+    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
+  }
+  // Scalar code for the remaining items (j2 = 58, 60, 62).
+  for (; j2 < 64; j1 += 1, j2 += 2) {
+    k2 = 128 - j2;
+    k1 = 32 - j1;
+    wkr = 0.5f - c[k1];
+    wki = c[j1];
+    xr = a[j2 + 0] - a[k2 + 0];
+    xi = a[j2 + 1] + a[k2 + 1];
+    yr = wkr * xr - wki * xi;
+    yi = wkr * xi + wki * xr;
+    a[j2 + 0] -= yr;
+    a[j2 + 1] -= yi;
+    a[k2 + 0] += yr;
+    a[k2 + 1] -= yi;
+  }
+}
+
+static void rftbsub_128_SSE2(float* a) {  // Updates a[] (128 floats) in place.
+  const float* c = rdft_w + 32;
+  int j1, j2, k1, k2;
+  float wkr, wki, xr, xi, yr, yi;
+
+  static const ALIGN16_BEG float ALIGN16_END
+      k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
+  const __m128 mm_half = _mm_load_ps(k_half);
+
+  a[1] = -a[1];
+  // Vectorized code (four (j2, k2) index pairs per iteration).
+  //    Note: the commented numbers are the indexes for the first iteration.
+  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
+    // Load 'wk'.
+    const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
+    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
+    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
+    const __m128 wkr_ =
+        _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
+    const __m128 wki_ = c_j1;                                 //  1,  2,  3,  4,
+    // Load and shuffle 'a'.
+    const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
+    const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
+    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
+    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
+    const __m128 a_j2_p0 = _mm_shuffle_ps(
+        a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
+    const __m128 a_j2_p1 = _mm_shuffle_ps(
+        a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
+    const __m128 a_k2_p0 = _mm_shuffle_ps(
+        a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
+    const __m128 a_k2_p1 = _mm_shuffle_ps(
+        a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
+    // Calculate 'x'.
+    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
+    // 2-126, 4-124, 6-122, 8-120,
+    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
+    // 3+127, 5+125, 7+123, 9+121,
+    // Calculate product into 'y'.
+    //    yr = wkr * xr + wki * xi;
+    //    yi = wkr * xi - wki * xr;
+    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
+    const __m128 b_ = _mm_mul_ps(wki_, xi_);
+    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
+    const __m128 d_ = _mm_mul_ps(wki_, xr_);
+    const __m128 yr_ = _mm_add_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
+    const __m128 yi_ = _mm_sub_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
+    // Update 'a'.
+    //    a[j2 + 0] = a[j2 + 0] - yr;
+    //    a[j2 + 1] = yi - a[j2 + 1];
+    //    a[k2 + 0] = yr + a[k2 + 0];
+    //    a[k2 + 1] = yi - a[k2 + 1];
+    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
+    const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1);  //   3,   5,   7,   9,
+    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
+    const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1);  // 127, 125, 123, 121,
+    // Shuffle in right order and store.
+    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
+    //   2,   3,   4,   5,
+    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
+    //   6,   7,   8,   9,
+    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
+    // 122, 123, 120, 121,
+    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
+    // 126, 127, 124, 125,
+    const __m128 a_k2_0n = _mm_shuffle_ps(
+        a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
+    const __m128 a_k2_4n = _mm_shuffle_ps(
+        a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
+    _mm_storeu_ps(&a[0 + j2], a_j2_0n);
+    _mm_storeu_ps(&a[4 + j2], a_j2_4n);
+    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
+    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
+  }
+  // Scalar code for the remaining items (j2 = 58, 60, 62).
+  for (; j2 < 64; j1 += 1, j2 += 2) {
+    k2 = 128 - j2;
+    k1 = 32 - j1;
+    wkr = 0.5f - c[k1];
+    wki = c[j1];
+    xr = a[j2 + 0] - a[k2 + 0];
+    xi = a[j2 + 1] + a[k2 + 1];
+    yr = wkr * xr + wki * xi;
+    yi = wkr * xi - wki * xr;
+    a[j2 + 0] = a[j2 + 0] - yr;
+    a[j2 + 1] = yi - a[j2 + 1];
+    a[k2 + 0] = yr + a[k2 + 0];
+    a[k2 + 1] = yi - a[k2 + 1];
+  }
+  a[65] = -a[65];  // Index 64 is its own mirror (128 - 64); only a[65] flips.
+}
+
+void aec_rdft_init_sse2(void) {  // Points the four rdft hooks at SSE2 code.
+  cft1st_128 = cft1st_128_SSE2;
+  cftmdl_128 = cftmdl_128_SSE2;
+  rftfsub_128 = rftfsub_128_SSE2;
+  rftbsub_128 = rftbsub_128_SSE2;
+}
diff --git a/webrtc/modules/audio_processing/aec/aec_resampler.c b/webrtc/modules/audio_processing/aec/aec_resampler.c
new file mode 100644
index 0000000000..99c39efa88
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_resampler.c
@@ -0,0 +1,209 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* Resamples a signal to an arbitrary rate. Used by the AEC to compensate for
+ * clock skew by resampling the farend signal.
+ */
+
+#include "webrtc/modules/audio_processing/aec/aec_resampler.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+
+enum {
+  kEstimateLengthFrames = 400  // Raw skew samples gathered before estimating.
+};
+
+typedef struct {
+  float buffer[kResamplerBufferSize];  // Sample history used by ResampleLinear.
+  float position;  // Fractional read offset carried between resample calls.
+
+  int deviceSampleRateHz;  // Scales the outlier limits in EstimateSkew.
+  int skewData[kEstimateLengthFrames];  // Raw skew values, in arrival order.
+  int skewDataIndex;  // Next free slot; > kEstimateLengthFrames once estimated.
+  float skewEstimate;  // Cached EstimateSkew result returned by later calls.
+} AecResampler;
+
+static int EstimateSkew(const int* rawSkew,
+                        int size,
+                        int deviceSampleRateHz,  // Same name as the definition.
+                        float* skewEst);
+
+void* WebRtcAec_CreateResampler(void) {  // Returns NULL if malloc fails.
+  return malloc(sizeof(AecResampler));  // Uninitialized; call InitResampler.
+}
+
+int WebRtcAec_InitResampler(void* resampInst, int deviceSampleRateHz) {
+  AecResampler* obj = (AecResampler*)resampInst;  // Not NULL-checked.
+  memset(obj->buffer, 0, sizeof(obj->buffer));
+  obj->position = 0.0;
+  // Resampling state is cleared above; now reset the skew estimator as well.
+  obj->deviceSampleRateHz = deviceSampleRateHz;
+  memset(obj->skewData, 0, sizeof(obj->skewData));
+  obj->skewDataIndex = 0;
+  obj->skewEstimate = 0.0;
+
+  return 0;  // No failure path: this function always reports success.
+}
+
+void WebRtcAec_FreeResampler(void* resampInst) {
+  AecResampler* obj = (AecResampler*)resampInst;
+  free(obj);  // free(NULL) is a no-op, so a NULL instance is accepted.
+}
+
+void WebRtcAec_ResampleLinear(void* resampInst,
+                              const float* inspeech,
+                              size_t size,
+                              float skew,
+                              float* outspeech,
+                              size_t* size_out) {
+  AecResampler* obj = (AecResampler*)resampInst;
+  // Writes the resampled frame to |outspeech| and its length to |size_out|.
+  float* y;
+  float be, tnew;
+  size_t tn, mm;
+
+  assert(size <= 2 * FRAME_LEN);
+  assert(resampInst != NULL);
+  assert(inspeech != NULL);
+  assert(outspeech != NULL);
+  assert(size_out != NULL);
+  // NOTE: |outspeech| capacity is not checked here; the caller must size it.
+  // Add new frame data in lookahead, kResamplingDelay samples past |y|.
+  memcpy(&obj->buffer[FRAME_LEN + kResamplingDelay],
+         inspeech,
+         size * sizeof(inspeech[0]));
+
+  // Sample rate ratio: input samples consumed per output sample.
+  be = 1 + skew;
+
+  // Loop over input frame
+  mm = 0;
+  y = &obj->buffer[FRAME_LEN];  // Point at current frame
+
+  tnew = be * mm + obj->position;
+  tn = (size_t)tnew;
+
+  while (tn < size) {
+    // |tnew| is the fractional read position; |tn| is its integer part.
+    // Interpolation
+    outspeech[mm] = y[tn] + (tnew - tn) * (y[tn + 1] - y[tn]);
+    mm++;
+
+    tnew = be * mm + obj->position;
+    tn = (size_t)tnew;  // Same truncation as the initial computation above.
+  }
+
+  *size_out = mm;
+  obj->position += (*size_out) * be - size;  // Carry the remainder forward.
+
+  // Shift buffer
+  memmove(obj->buffer,
+          &obj->buffer[size],
+          (kResamplerBufferSize - size) * sizeof(obj->buffer[0]));
+}
+
+int WebRtcAec_GetSkew(void* resampInst, int rawSkew, float* skewEst) {
+  AecResampler* obj = (AecResampler*)resampInst;
+  int err = 0;
+  // Three phases: collect samples, estimate once, then return the cached value.
+  if (obj->skewDataIndex < kEstimateLengthFrames) {
+    obj->skewData[obj->skewDataIndex] = rawSkew;
+    obj->skewDataIndex++;  // *skewEst is left untouched while collecting.
+  } else if (obj->skewDataIndex == kEstimateLengthFrames) {
+    err = EstimateSkew(
+        obj->skewData, kEstimateLengthFrames, obj->deviceSampleRateHz, skewEst);
+    obj->skewEstimate = *skewEst;  // This call's |rawSkew| is not stored.
+    obj->skewDataIndex++;  // Moves past the limit so the estimate runs once.
+  } else {
+    *skewEst = obj->skewEstimate;
+  }
+
+  return err;  // Non-zero only on the call where EstimateSkew failed.
+}
+
+int EstimateSkew(const int* rawSkew,
+                 int size,
+                 int deviceSampleRateHz,
+                 float* skewEst) {  // Returns 0, or -1 if no sample is usable.
+  const int absLimitOuter = (int)(0.04f * deviceSampleRateHz);
+  const int absLimitInner = (int)(0.0025f * deviceSampleRateHz);
+  int i = 0;
+  int n = 0;
+  float rawAvg = 0;
+  float err = 0;
+  float rawAbsDev = 0;
+  int upperLimit = 0;
+  int lowerLimit = 0;
+  float cumSum = 0;
+  float x = 0;
+  float x2 = 0;
+  float y = 0;
+  float xy = 0;
+  float xAvg = 0;
+  float denom = 0;
+  float skew = 0;
+  // Pass 1: mean of the samples strictly inside +/- absLimitOuter.
+  *skewEst = 0;  // Set in case of error below.
+  for (i = 0; i < size; i++) {
+    if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
+      n++;
+      rawAvg += rawSkew[i];
+    }
+  }
+
+  if (n == 0) {
+    return -1;  // Every sample fell outside the outer limit.
+  }
+  assert(n > 0);
+  rawAvg /= n;
+  // Pass 2: mean absolute deviation of those same samples around rawAvg.
+  for (i = 0; i < size; i++) {
+    if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
+      err = rawSkew[i] - rawAvg;
+      rawAbsDev += err >= 0 ? err : -err;
+    }
+  }
+  assert(n > 0);
+  rawAbsDev /= n;
+  upperLimit = (int)(rawAvg + 5 * rawAbsDev + 1);  // +1 for ceiling.
+  lowerLimit = (int)(rawAvg - 5 * rawAbsDev - 1);  // -1 for floor.
+  // Pass 3: regression sums over samples near zero or near the average.
+  n = 0;
+  for (i = 0; i < size; i++) {
+    if ((rawSkew[i] < absLimitInner && rawSkew[i] > -absLimitInner) ||
+        (rawSkew[i] < upperLimit && rawSkew[i] > lowerLimit)) {
+      n++;
+      cumSum += rawSkew[i];
+      x += n;  // x is the running count of accepted samples.
+      x2 += n * n;
+      y += cumSum;  // y is the cumulative sum of accepted samples.
+      xy += n * cumSum;
+    }
+  }
+
+  if (n == 0) {
+    return -1;  // No sample passed the inner/deviation filter.
+  }
+  assert(n > 0);
+  xAvg = x / n;
+  denom = x2 - xAvg * x;
+  // Least-squares slope of cumSum against the running count n.
+  if (denom != 0) {  // Otherwise skew stays 0.
+    skew = (xy - xAvg * y) / denom;
+  }
+
+  *skewEst = skew;
+  return 0;
+}
diff --git a/webrtc/modules/audio_processing/aec/aec_resampler.h b/webrtc/modules/audio_processing/aec/aec_resampler.h
new file mode 100644
index 0000000000..a5002c155a
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/aec_resampler.h
@@ -0,0 +1,39 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_
+
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+
+enum {
+  kResamplingDelay = 1  // Extra offset, in samples, applied to new input.
+};
+enum {
+  kResamplerBufferSize = FRAME_LEN * 4  // Length, in floats, of the history.
+};
+
+// Unless otherwise specified, functions return 0 on success and -1 on error.
+void* WebRtcAec_CreateResampler(void);  // Returns NULL on error.
+int WebRtcAec_InitResampler(void* resampInst, int deviceSampleRateHz);
+void WebRtcAec_FreeResampler(void* resampInst);
+
+// Records |rawSkew|; writes *skewEst only once enough samples were collected.
+int WebRtcAec_GetSkew(void* resampInst, int rawSkew, float* skewEst);
+
+// Resamples |size| input samples by linear interpolation at rate 1 + |skew|.
+void WebRtcAec_ResampleLinear(void* resampInst,
+                              const float* inspeech,
+                              size_t size,
+                              float skew,
+                              float* outspeech,
+                              size_t* size_out);
+
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_
diff --git a/webrtc/modules/audio_processing/aec/echo_cancellation.c b/webrtc/modules/audio_processing/aec/echo_cancellation.c
new file mode 100644
index 0000000000..0f5cd31ddb
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/echo_cancellation.c
@@ -0,0 +1,923 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * Contains the API functions for the AEC.
+ */
+#include "webrtc/modules/audio_processing/aec/include/echo_cancellation.h"
+
+#include <math.h>
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+#include <stdio.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+
+#include "webrtc/common_audio/ring_buffer.h"
+#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+#include "webrtc/modules/audio_processing/aec/aec_resampler.h"
+#include "webrtc/modules/audio_processing/aec/echo_cancellation_internal.h"
+#include "webrtc/typedefs.h"
+
+// Measured delays [ms]
+// Device                Chrome  GTP
+// MacBook Air           10
+// MacBook Retina        10      100
+// MacPro                30?
+//
+// Win7 Desktop          70      80?
+// Win7 T430s            110
+// Win8 T420s            70
+//
+// Daisy                 50
+// Pixel (w/ preproc?)           240
+// Pixel (w/o preproc?)  110     110
+
+// The extended filter mode gives us the flexibility to ignore the system's
+// reported delays. We do this for platforms which we believe provide results
+// which are incompatible with the AEC's expectations. Based on measurements
+// (some provided above) we set a conservative (i.e. lower than measured)
+// fixed delay.
+//
+// WEBRTC_UNTRUSTED_DELAY will only have an impact when |extended_filter_mode|
+// is enabled. See the note along with |DelayCorrection| in
+// echo_cancellation_impl.h for more details on the mode.
+//
+// Justification:
+// Chromium/Mac: Here, the true latency is so low (~10-20 ms), that it plays
+// havoc with the AEC's buffering. To avoid this, we set a fixed delay of 20 ms
+// and then compensate by rewinding by 10 ms (in wideband) through
+// kDelayDiffOffsetSamples. This trick does not seem to work for larger rewind
+// values, but fortunately this is sufficient.
+//
+// Chromium/Linux(ChromeOS): The values we get on this platform don't correspond
+// well to reality. The variance doesn't match the AEC's buffer changes, and the
+// bulk values tend to be too low. However, the range across different hardware
+// appears to be too large to choose a single value.
+//
+// GTP/Linux(ChromeOS): TBD, but for the moment we will trust the values.
+#if defined(WEBRTC_CHROMIUM_BUILD) && defined(WEBRTC_MAC)
+#define WEBRTC_UNTRUSTED_DELAY
+#endif
+
+// Offset in samples that ProcessExtended() adds to |knownDelay| (clamping the
+// result at zero) before handing it to the AEC core.
+#if defined(WEBRTC_UNTRUSTED_DELAY) && defined(WEBRTC_MAC)
+static const int kDelayDiffOffsetSamples = -160;
+#else
+// Not enabled for now.
+static const int kDelayDiffOffsetSamples = 0;
+#endif
+
+// Fixed delay [ms] that ProcessExtended() uses in place of the reported delay
+// when that delay is untrusted or looks bogus. It is also the lower bound for
+// the startup buffer size there.
+#if defined(WEBRTC_MAC)
+static const int kFixedDelayMs = 20;
+#else
+static const int kFixedDelayMs = 50;
+#endif
+// Lower clamp [ms] applied to the reported delay in ProcessExtended(). Only
+// needed when the reported delay is trusted.
+#if !defined(WEBRTC_UNTRUSTED_DELAY)
+static const int kMinTrustedDelayMs = 20;
+#endif
+// Upper limit [ms] for a reported delay. WebRtcAec_Process() reports a warning
+// above it, ProcessNormal() clamps to it, and ProcessExtended() falls back to
+// kFixedDelayMs at or above it.
+static const int kMaxTrustedDelayMs = 500;
+
+// Maximum length of resampled signal. Must be an integer multiple of frames
+// (ceil(1/(1 + MIN_SKEW)*2) + 1)*FRAME_LEN
+// The factor of 2 handles wb, and the + 1 is as a safety margin
+// TODO(bjornv): Replace with kResamplerBufferSize
+#define MAX_RESAMP_LEN (5 * FRAME_LEN)
+
+static const int kMaxBufSizeStart = 62;  // In partitions
+static const int sampMsNb = 8;           // samples per ms in nb
+// Value WebRtcAec_Init() stores in |initFlag|; the API functions compare
+// against it to detect an instance that has not been initialized.
+static const int initCheck = 42;
+
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+// Incremented by every WebRtcAec_Create() call; used as the numeric suffix of
+// that instance's debug dump file names.
+int webrtc_aec_instance_count = 0;
+#endif
+
+// Estimates delay to set the position of the far-end buffer read pointer
+// (controlled by knownDelay)
+static void EstBufDelayNormal(Aec* aecInst);
+static void EstBufDelayExtended(Aec* aecInst);
+// Per-frame processing used by WebRtcAec_Process() when the extended filter
+// is disabled. A non-zero return makes WebRtcAec_Process() return -1.
+static int ProcessNormal(Aec* self,
+                         const float* const* near,
+                         size_t num_bands,
+                         float* const* out,
+                         size_t num_samples,
+                         int16_t reported_delay_ms,
+                         int32_t skew);
+// Per-frame processing used by WebRtcAec_Process() when the extended filter
+// is enabled.
+static void ProcessExtended(Aec* self,
+                            const float* const* near,
+                            size_t num_bands,
+                            float* const* out,
+                            size_t num_samples,
+                            int16_t reported_delay_ms,
+                            int32_t skew);
+
+// Allocates an AEC instance together with its core, its resampler and its
+// far-end pre-buffer. Returns NULL if any of the allocations fails, in which
+// case everything allocated up to that point has been released again.
+// The returned instance must be set up with WebRtcAec_Init() before use and
+// released with WebRtcAec_Free().
+void* WebRtcAec_Create() {
+  Aec* aecpc = malloc(sizeof(Aec));
+
+  if (!aecpc) {
+    return NULL;
+  }
+
+  // The members of a freshly malloc'ed Aec are indeterminate, so the failure
+  // paths below must not hand the half-built instance to WebRtcAec_Free(),
+  // which releases every member. Each path frees exactly what exists so far.
+  aecpc->aec = WebRtcAec_CreateAec();
+  if (!aecpc->aec) {
+    free(aecpc);
+    return NULL;
+  }
+  aecpc->resampler = WebRtcAec_CreateResampler();
+  if (!aecpc->resampler) {
+    WebRtcAec_FreeAec(aecpc->aec);
+    free(aecpc);
+    return NULL;
+  }
+  // Create far-end pre-buffer. The buffer size has to be large enough for
+  // largest possible drift compensation (kResamplerBufferSize) + "almost" an
+  // FFT buffer (PART_LEN2 - 1).
+  aecpc->far_pre_buf =
+      WebRtc_CreateBuffer(PART_LEN2 + kResamplerBufferSize, sizeof(float));
+  if (!aecpc->far_pre_buf) {
+    WebRtcAec_FreeAec(aecpc->aec);
+    WebRtcAec_FreeResampler(aecpc->resampler);
+    free(aecpc);
+    return NULL;
+  }
+
+  aecpc->initFlag = 0;
+  aecpc->lastError = 0;
+
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+  {
+    // One set of dump files per instance, numbered by creation order.
+    // The longest name is "aec_delay" + an int + ".dat", which fits in 64.
+    char filename[64];
+    sprintf(filename, "aec_buf%d.dat", webrtc_aec_instance_count);
+    aecpc->bufFile = fopen(filename, "wb");
+    sprintf(filename, "aec_skew%d.dat", webrtc_aec_instance_count);
+    aecpc->skewFile = fopen(filename, "wb");
+    sprintf(filename, "aec_delay%d.dat", webrtc_aec_instance_count);
+    aecpc->delayFile = fopen(filename, "wb");
+    webrtc_aec_instance_count++;
+  }
+#endif
+
+  return aecpc;
+}
+
+// Releases an instance obtained from WebRtcAec_Create(), including its
+// pre-buffer, core and resampler. Passing NULL is a no-op.
+void WebRtcAec_Free(void* aecInst) {
+  Aec* aecpc = aecInst;
+
+  if (aecpc == NULL) {
+    return;
+  }
+
+  WebRtc_FreeBuffer(aecpc->far_pre_buf);
+
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+  // fopen() in WebRtcAec_Create() may have failed and left a NULL stream;
+  // fclose() must not be called on a null pointer.
+  if (aecpc->bufFile) {
+    fclose(aecpc->bufFile);
+  }
+  if (aecpc->skewFile) {
+    fclose(aecpc->skewFile);
+  }
+  if (aecpc->delayFile) {
+    fclose(aecpc->delayFile);
+  }
+#endif
+
+  WebRtcAec_FreeAec(aecpc->aec);
+  WebRtcAec_FreeResampler(aecpc->resampler);
+  free(aecpc);
+}
+
+// Initializes (or re-initializes) an instance created by WebRtcAec_Create().
+//
+// |sampFreq| must be 8000, 16000, 32000 or 48000 and |scSampFreq| must lie in
+// [1, 96000]. Initializes the core, the resampler and the far-end pre-buffer,
+// resets the delay and skew bookkeeping, and applies the default
+// configuration. Returns 0 on success; on failure returns -1 with the reason
+// stored in |lastError|.
+int32_t WebRtcAec_Init(void* aecInst, int32_t sampFreq, int32_t scSampFreq) {
+  Aec* aecpc = aecInst;
+  AecConfig aecConfig;
+
+  if (sampFreq != 8000 &&
+      sampFreq != 16000 &&
+      sampFreq != 32000 &&
+      sampFreq != 48000) {
+    aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
+    return -1;
+  }
+  aecpc->sampFreq = sampFreq;
+
+  if (scSampFreq < 1 || scSampFreq > 96000) {
+    aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
+    return -1;
+  }
+  aecpc->scSampFreq = scSampFreq;
+
+  // Initialize echo canceller core
+  if (WebRtcAec_InitAec(aecpc->aec, aecpc->sampFreq) == -1) {
+    aecpc->lastError = AEC_UNSPECIFIED_ERROR;
+    return -1;
+  }
+
+  if (WebRtcAec_InitResampler(aecpc->resampler, aecpc->scSampFreq) == -1) {
+    aecpc->lastError = AEC_UNSPECIFIED_ERROR;
+    return -1;
+  }
+
+  WebRtc_InitBuffer(aecpc->far_pre_buf);
+  WebRtc_MoveReadPtr(aecpc->far_pre_buf, -PART_LEN);  // Start overlap.
+
+  // Must be set before the WebRtcAec_set_config() call at the end of this
+  // function, which rejects instances whose |initFlag| is not |initCheck|.
+  aecpc->initFlag = initCheck;  // indicates that initialization has been done
+
+  // 32 kHz and 48 kHz input use a 16 kHz split rate; 8 kHz and 16 kHz are
+  // used as they are.
+  if (aecpc->sampFreq == 32000 || aecpc->sampFreq == 48000) {
+    aecpc->splitSampFreq = 16000;
+  } else {
+    aecpc->splitSampFreq = sampFreq;
+  }
+
+  aecpc->delayCtr = 0;
+  aecpc->sampFactor = (aecpc->scSampFreq * 1.0f) / aecpc->splitSampFreq;
+  // Sampling frequency multiplier (SWB is processed as 160 frame size).
+  aecpc->rate_factor = aecpc->splitSampFreq / 8000;
+
+  aecpc->sum = 0;
+  aecpc->counter = 0;
+  aecpc->checkBuffSize = 1;
+  aecpc->firstVal = 0;
+
+  // We skip the startup_phase completely (setting to 0) if DA-AEC is enabled,
+  // but not extended_filter mode.
+  aecpc->startup_phase = WebRtcAec_extended_filter_enabled(aecpc->aec) ||
+      !WebRtcAec_delay_agnostic_enabled(aecpc->aec);
+  aecpc->bufSizeStart = 0;
+  aecpc->checkBufSizeCtr = 0;
+  aecpc->msInSndCardBuf = 0;
+  aecpc->filtDelay = -1;  // -1 indicates an initialized state.
+  aecpc->timeForDelayChange = 0;
+  aecpc->knownDelay = 0;
+  aecpc->lastDelayDiff = 0;
+
+  aecpc->skewFrCtr = 0;
+  aecpc->resample = kAecFalse;
+  aecpc->highSkewCtr = 0;
+  aecpc->skew = 0;
+
+  aecpc->farend_started = 0;
+
+  // Default settings.
+  aecConfig.nlpMode = kAecNlpModerate;
+  aecConfig.skewMode = kAecFalse;
+  aecConfig.metricsMode = kAecFalse;
+  aecConfig.delay_logging = kAecFalse;
+
+  if (WebRtcAec_set_config(aecpc, aecConfig) == -1) {
+    aecpc->lastError = AEC_UNSPECIFIED_ERROR;
+    return -1;
+  }
+
+  return 0;
+}
+
+// Buffers one frame of far-end audio. Only the lowest band of the far-end
+// signal is buffered.
+//
+// |nrOfSamples| must be 80 or 160. While skew compensation is enabled and
+// resampling is active, the frame is resampled before it is buffered.
+// Returns 0 on success; otherwise -1 with the reason stored in |lastError|.
+int32_t WebRtcAec_BufferFarend(void* aecInst,
+                               const float* farend,
+                               size_t nrOfSamples) {
+  Aec* aecpc = aecInst;
+  float resampled[MAX_RESAMP_LEN];
+  const float* frame = farend;
+  size_t frame_len = nrOfSamples;
+
+  if (!farend) {
+    aecpc->lastError = AEC_NULL_POINTER_ERROR;
+    return -1;
+  }
+  if (aecpc->initFlag != initCheck) {
+    aecpc->lastError = AEC_UNINITIALIZED_ERROR;
+    return -1;
+  }
+  // A frame holds 80 samples, or 160 samples for SWB input.
+  if (!(nrOfSamples == 80 || nrOfSamples == 160)) {
+    aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
+    return -1;
+  }
+
+  if (aecpc->skewMode == kAecTrue && aecpc->resample == kAecTrue) {
+    // Drift compensation: the resampler reports the new frame length.
+    WebRtcAec_ResampleLinear(aecpc->resampler, farend, nrOfSamples,
+                             aecpc->skew, resampled, &frame_len);
+    frame = resampled;
+  }
+
+  aecpc->farend_started = 1;
+  WebRtcAec_SetSystemDelay(
+      aecpc->aec, WebRtcAec_system_delay(aecpc->aec) + (int)frame_len);
+
+  // Queue the time-domain samples in |far_pre_buf|, then pass every complete
+  // block of PART_LEN2 samples on to the core.
+  WebRtc_WriteBuffer(aecpc->far_pre_buf, frame, frame_len);
+
+  while (WebRtc_available_read(aecpc->far_pre_buf) >= PART_LEN2) {
+    float scratch[PART_LEN2];
+    float* block = NULL;
+
+    WebRtc_ReadBuffer(aecpc->far_pre_buf, (void**)&block, scratch, PART_LEN2);
+    WebRtcAec_BufferFarendPartition(aecpc->aec, block);
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+    WebRtc_WriteBuffer(
+        WebRtcAec_far_time_buf(aecpc->aec), &block[PART_LEN], 1);
+#endif
+
+    // Step the read pointer back PART_LEN samples so that consecutive blocks
+    // overlap.
+    WebRtc_MoveReadPtr(aecpc->far_pre_buf, -PART_LEN);
+  }
+
+  return 0;
+}
+
+// Processes one near-end frame and writes the result to |out|.
+//
+// |nrOfSamples| must be 80 or 160. A reported delay outside
+// [0, kMaxTrustedDelayMs] does not stop processing, but makes the call
+// return -1 with a warning stored in |lastError|. Returns 0 on success and
+// -1 on error or warning.
+int32_t WebRtcAec_Process(void* aecInst,
+                          const float* const* nearend,
+                          size_t num_bands,
+                          float* const* out,
+                          size_t nrOfSamples,
+                          int16_t msInSndCardBuf,
+                          int32_t skew) {
+  Aec* aecpc = aecInst;
+  int32_t status = 0;
+
+  if (!out) {
+    aecpc->lastError = AEC_NULL_POINTER_ERROR;
+    return -1;
+  }
+  if (aecpc->initFlag != initCheck) {
+    aecpc->lastError = AEC_UNINITIALIZED_ERROR;
+    return -1;
+  }
+  // A frame holds 80 samples, or 160 samples for SWB input.
+  if (!(nrOfSamples == 80 || nrOfSamples == 160)) {
+    aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
+    return -1;
+  }
+
+  if (msInSndCardBuf < 0 || msInSndCardBuf > kMaxTrustedDelayMs) {
+    // Negative delays are raised to zero here; values above the maximum are
+    // dealt with in ProcessExtended/Normal().
+    if (msInSndCardBuf < 0) {
+      msInSndCardBuf = 0;
+    }
+    aecpc->lastError = AEC_BAD_PARAMETER_WARNING;
+    status = -1;
+  }
+
+  // Pick the processing path from the core's extended filter setting.
+  if (!WebRtcAec_extended_filter_enabled(aecpc->aec)) {
+    const int normal_result = ProcessNormal(
+        aecpc, nearend, num_bands, out, nrOfSamples, msInSndCardBuf, skew);
+    if (normal_result != 0) {
+      status = -1;
+    }
+  } else {
+    ProcessExtended(
+        aecpc, nearend, num_bands, out, nrOfSamples, msInSndCardBuf, skew);
+  }
+
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+  {
+    int16_t far_buf_size_ms = (int16_t)(WebRtcAec_system_delay(aecpc->aec) /
+                                        (sampMsNb * aecpc->rate_factor));
+    (void)fwrite(&far_buf_size_ms, 2, 1, aecpc->bufFile);
+    (void)fwrite(
+        &aecpc->knownDelay, sizeof(aecpc->knownDelay), 1, aecpc->delayFile);
+  }
+#endif
+
+  return status;
+}
+
+// Validates |config| and applies it to an initialized instance.
+// Returns 0 on success; otherwise -1 with the reason stored in |lastError|.
+int WebRtcAec_set_config(void* handle, AecConfig config) {
+  Aec* self = (Aec*)handle;
+  int nlp_ok;
+  int metrics_ok;
+  int logging_ok;
+
+  if (self->initFlag != initCheck) {
+    self->lastError = AEC_UNINITIALIZED_ERROR;
+    return -1;
+  }
+
+  if (!(config.skewMode == kAecFalse || config.skewMode == kAecTrue)) {
+    self->lastError = AEC_BAD_PARAMETER_ERROR;
+    return -1;
+  }
+  // The skew mode is stored before the remaining fields are validated.
+  self->skewMode = config.skewMode;
+
+  nlp_ok = config.nlpMode == kAecNlpConservative ||
+           config.nlpMode == kAecNlpModerate ||
+           config.nlpMode == kAecNlpAggressive;
+  metrics_ok =
+      config.metricsMode == kAecFalse || config.metricsMode == kAecTrue;
+  logging_ok =
+      config.delay_logging == kAecFalse || config.delay_logging == kAecTrue;
+  if (!nlp_ok || !metrics_ok || !logging_ok) {
+    self->lastError = AEC_BAD_PARAMETER_ERROR;
+    return -1;
+  }
+
+  // The remaining settings are held by the core.
+  WebRtcAec_SetConfigCore(
+      self->aec, config.nlpMode, config.metricsMode, config.delay_logging);
+  return 0;
+}
+
+// Stores the core's current echo state in |status|.
+// Returns 0 on success; otherwise -1 with the reason stored in |lastError|.
+int WebRtcAec_get_echo_status(void* handle, int* status) {
+  Aec* self = (Aec*)handle;
+
+  if (!status) {
+    self->lastError = AEC_NULL_POINTER_ERROR;
+    return -1;
+  }
+  if (self->initFlag == initCheck) {
+    *status = WebRtcAec_echo_state(self->aec);
+    return 0;
+  }
+
+  self->lastError = AEC_UNINITIALIZED_ERROR;
+  return -1;
+}
+
+// Fills |metrics| with the ERL, ERLE, RERL and A_NLP statistics read from the
+// core.
+//
+// For ERL, ERLE and A_NLP:
+//  - instant and max are the core's values converted to int;
+//  - average is a weighted mix of the core's upper-part mean and regular
+//    average, used only when both exceed kOffsetLevel (else kOffsetLevel);
+//  - min is the core's value when it is below -kOffsetLevel (else
+//    kOffsetLevel).
+// RERL is the sum of the reported ERL and ERLE averages when both exceed
+// kOffsetLevel (else kOffsetLevel), copied into all four of its fields.
+//
+// Returns 0 on success and -1 on error. |lastError| is set on error, except
+// when |handle| itself is NULL.
+int WebRtcAec_GetMetrics(void* handle, AecMetrics* metrics) {
+  // Weight of the upper-part mean in the reported averages.
+  const float kUpWeight = 0.7f;
+  float dtmp;
+  int stmp;
+  Aec* self = (Aec*)handle;
+  Stats erl;
+  Stats erle;
+  Stats a_nlp;
+
+  if (handle == NULL) {
+    return -1;
+  }
+  if (metrics == NULL) {
+    self->lastError = AEC_NULL_POINTER_ERROR;
+    return -1;
+  }
+  if (self->initFlag != initCheck) {
+    self->lastError = AEC_UNINITIALIZED_ERROR;
+    return -1;
+  }
+
+  WebRtcAec_GetEchoStats(self->aec, &erl, &erle, &a_nlp);
+
+  // ERL
+  metrics->erl.instant = (int)erl.instant;
+
+  if ((erl.himean > kOffsetLevel) && (erl.average > kOffsetLevel)) {
+    // Use a mix between regular average and upper part average.
+    dtmp = kUpWeight * erl.himean + (1 - kUpWeight) * erl.average;
+    metrics->erl.average = (int)dtmp;
+  } else {
+    metrics->erl.average = kOffsetLevel;
+  }
+
+  metrics->erl.max = (int)erl.max;
+
+  if (erl.min < (kOffsetLevel * (-1))) {
+    metrics->erl.min = (int)erl.min;
+  } else {
+    metrics->erl.min = kOffsetLevel;
+  }
+
+  // ERLE
+  metrics->erle.instant = (int)erle.instant;
+
+  if ((erle.himean > kOffsetLevel) && (erle.average > kOffsetLevel)) {
+    // Use a mix between regular average and upper part average.
+    dtmp = kUpWeight * erle.himean + (1 - kUpWeight) * erle.average;
+    metrics->erle.average = (int)dtmp;
+  } else {
+    metrics->erle.average = kOffsetLevel;
+  }
+
+  metrics->erle.max = (int)erle.max;
+
+  if (erle.min < (kOffsetLevel * (-1))) {
+    metrics->erle.min = (int)erle.min;
+  } else {
+    metrics->erle.min = kOffsetLevel;
+  }
+
+  // RERL
+  if ((metrics->erl.average > kOffsetLevel) &&
+      (metrics->erle.average > kOffsetLevel)) {
+    stmp = metrics->erl.average + metrics->erle.average;
+  } else {
+    stmp = kOffsetLevel;
+  }
+  metrics->rerl.average = stmp;
+
+  // No other statistics needed, but returned for completeness.
+  metrics->rerl.instant = stmp;
+  metrics->rerl.max = stmp;
+  metrics->rerl.min = stmp;
+
+  // A_NLP
+  metrics->aNlp.instant = (int)a_nlp.instant;
+
+  if ((a_nlp.himean > kOffsetLevel) && (a_nlp.average > kOffsetLevel)) {
+    // Use a mix between regular average and upper part average.
+    dtmp = kUpWeight * a_nlp.himean + (1 - kUpWeight) * a_nlp.average;
+    metrics->aNlp.average = (int)dtmp;
+  } else {
+    metrics->aNlp.average = kOffsetLevel;
+  }
+
+  metrics->aNlp.max = (int)a_nlp.max;
+
+  if (a_nlp.min < (kOffsetLevel * (-1))) {
+    metrics->aNlp.min = (int)a_nlp.min;
+  } else {
+    metrics->aNlp.min = kOffsetLevel;
+  }
+
+  return 0;
+}
+
+// Fetches the delay metrics from the core into |median|, |std| and
+// |fraction_poor_delays|.
+// Returns 0 on success; otherwise -1 with the reason stored in |lastError|.
+int WebRtcAec_GetDelayMetrics(void* handle,
+                              int* median,
+                              int* std,
+                              float* fraction_poor_delays) {
+  Aec* self = handle;
+
+  if (!median || !std) {
+    self->lastError = AEC_NULL_POINTER_ERROR;
+    return -1;
+  }
+  if (self->initFlag != initCheck) {
+    self->lastError = AEC_UNINITIALIZED_ERROR;
+    return -1;
+  }
+
+  if (WebRtcAec_GetDelayMetricsCore(
+          self->aec, median, std, fraction_poor_delays) != -1) {
+    return 0;
+  }
+
+  // Logging disabled.
+  self->lastError = AEC_UNSUPPORTED_FUNCTION_ERROR;
+  return -1;
+}
+
+// Returns the error or warning code most recently stored in |lastError|.
+int32_t WebRtcAec_get_error_code(void* aecInst) {
+  const Aec* instance = aecInst;
+
+  return instance->lastError;
+}
+
+// Returns the core owned by |handle|, or NULL when |handle| is NULL.
+AecCore* WebRtcAec_aec_core(void* handle) {
+  Aec* self = (Aec*)handle;
+
+  return self ? self->aec : NULL;
+}
+
+// Processing path used when the extended filter is disabled.
+//
+// Clamps the reported sound card delay to kMaxTrustedDelayMs and adds 10 ms,
+// then updates the skew estimate if skew mode is on. During the startup phase
+// the near-end signal is copied to |out| unchanged while the initial far-end
+// buffer size is being settled; afterwards the buffer delay is estimated and
+// the AEC core processes the frame.
+// Returns the result of WebRtcAec_GetSkew() if it was called on this frame
+// (-1 on failure), otherwise 0.
+static int ProcessNormal(Aec* aecpc,
+                         const float* const* nearend,
+                         size_t num_bands,
+                         float* const* out,
+                         size_t nrOfSamples,
+                         int16_t msInSndCardBuf,
+                         int32_t skew) {
+  int retVal = 0;
+  size_t i;
+  size_t nBlocks10ms;
+  // Limit resampling to doubling/halving of signal
+  const float minSkewEst = -0.5f;
+  const float maxSkewEst = 1.0f;
+
+  msInSndCardBuf =
+      msInSndCardBuf > kMaxTrustedDelayMs ? kMaxTrustedDelayMs : msInSndCardBuf;
+  // TODO(andrew): we need to investigate if this +10 is really wanted.
+  msInSndCardBuf += 10;
+  aecpc->msInSndCardBuf = msInSndCardBuf;
+
+  if (aecpc->skewMode == kAecTrue) {
+    // The first 25 frames only advance the counter; no skew is estimated.
+    if (aecpc->skewFrCtr < 25) {
+      aecpc->skewFrCtr++;
+    } else {
+      retVal = WebRtcAec_GetSkew(aecpc->resampler, skew, &aecpc->skew);
+      if (retVal == -1) {
+        aecpc->skew = 0;
+        aecpc->lastError = AEC_BAD_PARAMETER_WARNING;
+      }
+
+      aecpc->skew /= aecpc->sampFactor * nrOfSamples;
+
+      // Resampling is switched on or off based on the unclamped estimate;
+      // the clamping to [minSkewEst, maxSkewEst] happens afterwards.
+      if (aecpc->skew < 1.0e-3 && aecpc->skew > -1.0e-3) {
+        aecpc->resample = kAecFalse;
+      } else {
+        aecpc->resample = kAecTrue;
+      }
+
+      if (aecpc->skew < minSkewEst) {
+        aecpc->skew = minSkewEst;
+      } else if (aecpc->skew > maxSkewEst) {
+        aecpc->skew = maxSkewEst;
+      }
+
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+      (void)fwrite(&aecpc->skew, sizeof(aecpc->skew), 1, aecpc->skewFile);
+#endif
+    }
+  }
+
+  nBlocks10ms = nrOfSamples / (FRAME_LEN * aecpc->rate_factor);
+
+  if (aecpc->startup_phase) {
+    for (i = 0; i < num_bands; ++i) {
+      // Only needed if they don't already point to the same place.
+      if (nearend[i] != out[i]) {
+        memcpy(out[i], nearend[i], sizeof(nearend[i][0]) * nrOfSamples);
+      }
+    }
+
+    // The AEC is in the start up mode
+    // AEC is disabled until the system delay is OK
+
+    // Mechanism to ensure that the system delay is reasonably stable.
+    if (aecpc->checkBuffSize) {
+      aecpc->checkBufSizeCtr++;
+      // Before we fill up the far-end buffer we require the system delay
+      // to be stable (+/-8 ms) compared to the first value. This
+      // comparison is made during the following 6 consecutive 10 ms
+      // blocks. If it seems to be stable then we start to fill up the
+      // far-end buffer.
+      if (aecpc->counter == 0) {
+        aecpc->firstVal = aecpc->msInSndCardBuf;
+        aecpc->sum = 0;
+      }
+
+      if (abs(aecpc->firstVal - aecpc->msInSndCardBuf) <
+          WEBRTC_SPL_MAX(0.2 * aecpc->msInSndCardBuf, sampMsNb)) {
+        aecpc->sum += aecpc->msInSndCardBuf;
+        aecpc->counter++;
+      } else {
+        aecpc->counter = 0;
+      }
+
+      if (aecpc->counter * nBlocks10ms >= 6) {
+        // The far-end buffer size is determined in partitions of
+        // PART_LEN samples. Use 75% of the average value of the system
+        // delay as buffer size to start with.
+        aecpc->bufSizeStart =
+            WEBRTC_SPL_MIN((3 * aecpc->sum * aecpc->rate_factor * 8) /
+                               (4 * aecpc->counter * PART_LEN),
+                           kMaxBufSizeStart);
+        // Buffer size has now been determined.
+        aecpc->checkBuffSize = 0;
+      }
+
+      if (aecpc->checkBufSizeCtr * nBlocks10ms > 50) {
+        // For really bad systems, don't disable the echo canceller for
+        // more than 0.5 sec.
+        aecpc->bufSizeStart = WEBRTC_SPL_MIN(
+            (aecpc->msInSndCardBuf * aecpc->rate_factor * 3) / 40,
+            kMaxBufSizeStart);
+        aecpc->checkBuffSize = 0;
+      }
+    }
+
+    // If |checkBuffSize| changed in the if-statement above.
+    if (!aecpc->checkBuffSize) {
+      // The system delay is now reasonably stable (or has been unstable
+      // for too long). When the far-end buffer is filled with
+      // approximately the same amount of data as reported by the system
+      // we end the startup phase.
+      int overhead_elements =
+          WebRtcAec_system_delay(aecpc->aec) / PART_LEN - aecpc->bufSizeStart;
+      if (overhead_elements == 0) {
+        // Enable the AEC
+        aecpc->startup_phase = 0;
+      } else if (overhead_elements > 0) {
+        // TODO(bjornv): Do we need a check on how much we actually
+        // moved the read pointer? It should always be possible to move
+        // the pointer |overhead_elements| since we have only added data
+        // to the buffer and no delay compensation nor AEC processing
+        // has been done.
+        WebRtcAec_MoveFarReadPtr(aecpc->aec, overhead_elements);
+
+        // Enable the AEC
+        aecpc->startup_phase = 0;
+      }
+      // A negative |overhead_elements| leaves the startup phase active: the
+      // far-end buffer does not yet hold |bufSizeStart| partitions.
+    }
+  } else {
+    // AEC is enabled.
+    EstBufDelayNormal(aecpc);
+
+    // Call the AEC.
+    // TODO(bjornv): Re-structure such that we don't have to pass
+    // |aecpc->knownDelay| as input. Change name to something like
+    // |system_buffer_diff|.
+    WebRtcAec_ProcessFrames(aecpc->aec,
+                            nearend,
+                            num_bands,
+                            nrOfSamples,
+                            aecpc->knownDelay,
+                            out);
+  }
+
+  return retVal;
+}
+
+// Processing path used when the extended filter is enabled.
+//
+// Derives the delay to use from |reported_delay_ms| (or from kFixedDelayMs
+// when the reported delay is untrusted or looks bogus). Until far-end data
+// has been buffered, the near-end signal is copied to |out| unchanged. On the
+// first processed frame the far-end read pointer is positioned once; after
+// that the buffer delay is estimated and the AEC core processes the frame.
+// |skew| is not used on this path.
+static void ProcessExtended(Aec* self,
+                            const float* const* near,
+                            size_t num_bands,
+                            float* const* out,
+                            size_t num_samples,
+                            int16_t reported_delay_ms,
+                            int32_t skew) {
+  size_t i;
+  const int delay_diff_offset = kDelayDiffOffsetSamples;
+#if defined(WEBRTC_UNTRUSTED_DELAY)
+  reported_delay_ms = kFixedDelayMs;
+#else
+  // This is the usual mode where we trust the reported system delay values.
+  // Due to the longer filter, we no longer add 10 ms to the reported delay
+  // to reduce chance of non-causality. Instead we apply a minimum here to avoid
+  // issues with the read pointer jumping around needlessly.
+  reported_delay_ms = reported_delay_ms < kMinTrustedDelayMs
+                          ? kMinTrustedDelayMs
+                          : reported_delay_ms;
+  // If the reported delay appears to be bogus, we attempt to recover by using
+  // the measured fixed delay values. We use >= here because higher layers
+  // may already clamp to this maximum value, and we would otherwise not
+  // detect it here.
+  reported_delay_ms = reported_delay_ms >= kMaxTrustedDelayMs
+                          ? kFixedDelayMs
+                          : reported_delay_ms;
+#endif
+  self->msInSndCardBuf = reported_delay_ms;
+
+  // No far-end frame has been buffered yet: pass the near-end through.
+  if (!self->farend_started) {
+    for (i = 0; i < num_bands; ++i) {
+      // Only needed if they don't already point to the same place.
+      if (near[i] != out[i]) {
+        memcpy(out[i], near[i], sizeof(near[i][0]) * num_samples);
+      }
+    }
+    return;
+  }
+  if (self->startup_phase) {
+    // In the extended mode, there isn't a startup "phase", just a special
+    // action on the first frame. In the trusted delay case, we'll take the
+    // current reported delay, unless it's less than our conservative
+    // measurement.
+    int startup_size_ms =
+        reported_delay_ms < kFixedDelayMs ? kFixedDelayMs : reported_delay_ms;
+#if defined(WEBRTC_ANDROID)
+    int target_delay = startup_size_ms * self->rate_factor * 8;
+#else
+    // To avoid putting the AEC in a non-causal state we're being slightly
+    // conservative and halve the target delay. On Android we use a fixed
+    // delay and therefore there is no need to scale the target_delay.
+    int target_delay = startup_size_ms * self->rate_factor * 8 / 2;
+#endif
+    // Number of whole partitions by which the system delay differs from the
+    // target; the far-end read pointer is moved by this amount.
+    int overhead_elements =
+        (WebRtcAec_system_delay(self->aec) - target_delay) / PART_LEN;
+    WebRtcAec_MoveFarReadPtr(self->aec, overhead_elements);
+    self->startup_phase = 0;
+  }
+
+  EstBufDelayExtended(self);
+
+  {
+    // |delay_diff_offset| gives us the option to manually rewind the delay on
+    // very low delay platforms which can't be expressed purely through
+    // |reported_delay_ms|.
+    const int adjusted_known_delay =
+        WEBRTC_SPL_MAX(0, self->knownDelay + delay_diff_offset);
+
+    WebRtcAec_ProcessFrames(self->aec,
+                            near,
+                            num_bands,
+                            num_samples,
+                            adjusted_known_delay,
+                            out);
+  }
+}
+
+// Delay estimation for the normal (non-extended) mode.
+//
+// Computes the difference between the reported sound card delay and the
+// core's system delay, smooths it into |filtDelay| (0.8/0.2 weighting) and
+// updates |knownDelay| once |timeForDelayChange| exceeds 25.
+static void EstBufDelayNormal(Aec* aecpc) {
+  // Reported sound card delay converted from ms to samples.
+  int nSampSndCard = aecpc->msInSndCardBuf * sampMsNb * aecpc->rate_factor;
+  int current_delay = nSampSndCard - WebRtcAec_system_delay(aecpc->aec);
+  int delay_difference = 0;
+
+  // Before we proceed with the delay estimate filtering we:
+  // 1) Compensate for the frame that will be read.
+  // 2) Compensate for drift resampling.
+  // 3) Compensate for non-causality if needed, since the estimated delay can't
+  //    be negative.
+
+  // 1) Compensating for the frame(s) that will be read/processed.
+  current_delay += FRAME_LEN * aecpc->rate_factor;
+
+  // 2) Account for resampling frame delay.
+  if (aecpc->skewMode == kAecTrue && aecpc->resample == kAecTrue) {
+    current_delay -= kResamplingDelay;
+  }
+
+  // 3) Compensate for non-causality, if needed, by flushing one block.
+  if (current_delay < PART_LEN) {
+    current_delay += WebRtcAec_MoveFarReadPtr(aecpc->aec, 1) * PART_LEN;
+  }
+
+  // We use -1 to signal an initialized state in the "extended" implementation;
+  // compensate for that.
+  aecpc->filtDelay = aecpc->filtDelay < 0 ? 0 : aecpc->filtDelay;
+  aecpc->filtDelay =
+      WEBRTC_SPL_MAX(0, (short)(0.8 * aecpc->filtDelay + 0.2 * current_delay));
+
+  // Hysteresis: the counter advances while the difference stays above 224 or
+  // (with a non-zero known delay) below 96, and restarts when the previous
+  // difference was on the opposite side or the difference is in between.
+  delay_difference = aecpc->filtDelay - aecpc->knownDelay;
+  if (delay_difference > 224) {
+    if (aecpc->lastDelayDiff < 96) {
+      aecpc->timeForDelayChange = 0;
+    } else {
+      aecpc->timeForDelayChange++;
+    }
+  } else if (delay_difference < 96 && aecpc->knownDelay > 0) {
+    if (aecpc->lastDelayDiff > 224) {
+      aecpc->timeForDelayChange = 0;
+    } else {
+      aecpc->timeForDelayChange++;
+    }
+  } else {
+    aecpc->timeForDelayChange = 0;
+  }
+  aecpc->lastDelayDiff = delay_difference;
+
+  if (aecpc->timeForDelayChange > 25) {
+    aecpc->knownDelay = WEBRTC_SPL_MAX((int)aecpc->filtDelay - 160, 0);
+  }
+}
+
+// Delay estimation for the extended filter mode.
+//
+// Same structure as EstBufDelayNormal(), but it flushes two blocks on
+// non-causality, seeds |filtDelay| from the first measurement, smooths with
+// 0.95/0.05 weighting and uses the thresholds 384/128 and an offset of 256.
+static void EstBufDelayExtended(Aec* self) {
+  // Reported delay converted from ms to samples.
+  int reported_delay = self->msInSndCardBuf * sampMsNb * self->rate_factor;
+  int current_delay = reported_delay - WebRtcAec_system_delay(self->aec);
+  int delay_difference = 0;
+
+  // Before we proceed with the delay estimate filtering we:
+  // 1) Compensate for the frame that will be read.
+  // 2) Compensate for drift resampling.
+  // 3) Compensate for non-causality if needed, since the estimated delay can't
+  //    be negative.
+
+  // 1) Compensating for the frame(s) that will be read/processed.
+  current_delay += FRAME_LEN * self->rate_factor;
+
+  // 2) Account for resampling frame delay.
+  if (self->skewMode == kAecTrue && self->resample == kAecTrue) {
+    current_delay -= kResamplingDelay;
+  }
+
+  // 3) Compensate for non-causality, if needed, by flushing two blocks.
+  if (current_delay < PART_LEN) {
+    current_delay += WebRtcAec_MoveFarReadPtr(self->aec, 2) * PART_LEN;
+  }
+
+  // -1 is the value WebRtcAec_Init() stores in |filtDelay|: the first call
+  // seeds the filter with half the current delay instead of smoothing.
+  if (self->filtDelay == -1) {
+    self->filtDelay = WEBRTC_SPL_MAX(0, 0.5 * current_delay);
+  } else {
+    self->filtDelay = WEBRTC_SPL_MAX(
+        0, (short)(0.95 * self->filtDelay + 0.05 * current_delay));
+  }
+
+  // Hysteresis: the counter advances while the difference stays above 384 or
+  // (with a non-zero known delay) below 128, and restarts when the previous
+  // difference was on the opposite side or the difference is in between.
+  delay_difference = self->filtDelay - self->knownDelay;
+  if (delay_difference > 384) {
+    if (self->lastDelayDiff < 128) {
+      self->timeForDelayChange = 0;
+    } else {
+      self->timeForDelayChange++;
+    }
+  } else if (delay_difference < 128 && self->knownDelay > 0) {
+    if (self->lastDelayDiff > 384) {
+      self->timeForDelayChange = 0;
+    } else {
+      self->timeForDelayChange++;
+    }
+  } else {
+    self->timeForDelayChange = 0;
+  }
+  self->lastDelayDiff = delay_difference;
+
+  if (self->timeForDelayChange > 25) {
+    self->knownDelay = WEBRTC_SPL_MAX((int)self->filtDelay - 256, 0);
+  }
+}
diff --git a/webrtc/modules/audio_processing/aec/echo_cancellation_internal.h b/webrtc/modules/audio_processing/aec/echo_cancellation_internal.h
new file mode 100644
index 0000000000..95a6cf3324
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/echo_cancellation_internal.h
@@ -0,0 +1,67 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_
+
+#include "webrtc/common_audio/ring_buffer.h"
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+
+// State of one AEC instance; this is what the void* handles of the
+// WebRtcAec_* API in echo_cancellation.c are converted to.
+typedef struct {
+  int delayCtr;
+  int sampFreq;       // Sample rate passed to WebRtcAec_Init().
+  int splitSampFreq;  // 16000 for 32/48 kHz input, otherwise |sampFreq|.
+  int scSampFreq;     // Passed to the resampler as the device sample rate.
+  float sampFactor;  // scSampFreq / splitSampFreq
+  short skewMode;    // kAecTrue enables skew estimation and resampling.
+  int bufSizeStart;  // Far-end buffer size chosen during startup (partitions).
+  int knownDelay;    // Delay handed to the AEC core when processing frames.
+  int rate_factor;   // splitSampFreq / 8000
+
+  short initFlag;  // indicates if AEC has been initialized
+
+  // Variables used for averaging far end buffer size
+  short counter;
+  int sum;
+  short firstVal;
+  short checkBufSizeCtr;
+
+  // Variables used for delay shifts
+  short msInSndCardBuf;
+  short filtDelay;  // Filtered delay estimate.
+  int timeForDelayChange;
+  int startup_phase;
+  int checkBuffSize;
+  short lastDelayDiff;
+
+#ifdef WEBRTC_AEC_DEBUG_DUMP
+  // Debug dump streams opened in WebRtcAec_Create().
+  FILE* bufFile;
+  FILE* delayFile;
+  FILE* skewFile;
+#endif
+
+  // Structures
+  void* resampler;
+
+  int skewFrCtr;
+  int resample;  // if the skew is small enough we don't resample
+  int highSkewCtr;
+  float skew;
+
+  RingBuffer* far_pre_buf;  // Time domain far-end pre-buffer.
+
+  int lastError;  // Most recent error/warning code.
+
+  int farend_started;  // Set to 1 once a far-end frame has been buffered.
+
+  AecCore* aec;
+} Aec;
+
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_
diff --git a/webrtc/modules/audio_processing/aec/echo_cancellation_unittest.cc b/webrtc/modules/audio_processing/aec/echo_cancellation_unittest.cc
new file mode 100644
index 0000000000..315ac3e9f9
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/echo_cancellation_unittest.cc
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// TODO(bjornv): Make this a comprehensive test.
+
+#include "webrtc/modules/audio_processing/aec/include/echo_cancellation.h"
+
+#include <stdlib.h>
+#include <time.h>
+
+extern "C" {
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+}
+
+#include "testing/gtest/include/gtest/gtest.h"
+#include "webrtc/base/checks.h"
+
+namespace webrtc {
+
+TEST(EchoCancellationTest, CreateAndFreeHasExpectedBehavior) {
+  void* handle = WebRtcAec_Create();
+  ASSERT_TRUE(handle);
+  WebRtcAec_Free(nullptr);
+  WebRtcAec_Free(handle);
+}
+
+TEST(EchoCancellationTest, ApplyAecCoreHandle) {
+  void* handle = WebRtcAec_Create();
+  ASSERT_TRUE(handle);
+  EXPECT_TRUE(WebRtcAec_aec_core(NULL) == NULL);
+  AecCore* aec_core = WebRtcAec_aec_core(handle);
+  EXPECT_TRUE(aec_core != NULL);
+  // A simple test to verify that we can set and get a value from the lower
+  // level |aec_core| handle.
+  int delay = 111;
+  WebRtcAec_SetSystemDelay(aec_core, delay);
+  EXPECT_EQ(delay, WebRtcAec_system_delay(aec_core));
+  WebRtcAec_Free(handle);
+}
+
+}  // namespace webrtc
diff --git a/webrtc/modules/audio_processing/aec/include/echo_cancellation.h b/webrtc/modules/audio_processing/aec/include/echo_cancellation.h
new file mode 100644
index 0000000000..a340cf84d0
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/include/echo_cancellation.h
@@ -0,0 +1,245 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_
+
+#include <stddef.h>
+
+#include "webrtc/typedefs.h"
+
+// Errors
+#define AEC_UNSPECIFIED_ERROR 12000
+#define AEC_UNSUPPORTED_FUNCTION_ERROR 12001
+#define AEC_UNINITIALIZED_ERROR 12002
+#define AEC_NULL_POINTER_ERROR 12003
+#define AEC_BAD_PARAMETER_ERROR 12004
+
+// Warnings
+#define AEC_BAD_PARAMETER_WARNING 12050
+
+enum {
+  kAecNlpConservative = 0,
+  kAecNlpModerate,
+  kAecNlpAggressive
+};
+
+enum {
+  kAecFalse = 0,
+  kAecTrue
+};
+
+typedef struct {
+  int16_t nlpMode;      // default kAecNlpModerate
+  int16_t skewMode;     // default kAecFalse
+  int16_t metricsMode;  // default kAecFalse
+  int delay_logging;    // default kAecFalse
+  // float realSkew;
+} AecConfig;
+
+typedef struct {
+  int instant;
+  int average;
+  int max;
+  int min;
+} AecLevel;
+
+typedef struct {
+  AecLevel rerl;
+  AecLevel erl;
+  AecLevel erle;
+  AecLevel aNlp;
+} AecMetrics;
+
+struct AecCore;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Allocates the memory needed by the AEC. The memory needs to be initialized
+ * separately using the WebRtcAec_Init() function. Returns a pointer to the
+ * object or NULL on error.
+ */
+void* WebRtcAec_Create(void);
+
+/*
+ * This function releases the memory allocated by WebRtcAec_Create().
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*        aecInst         Pointer to the AEC instance
+ */
+void WebRtcAec_Free(void* aecInst);
+
+/*
+ * Initializes an AEC instance.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*          aecInst       Pointer to the AEC instance
+ * int32_t        sampFreq      Sampling frequency of data
+ * int32_t        scSampFreq    Soundcard sampling frequency
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * int32_t        return        0: OK
+ *                             -1: error
+ */
+int32_t WebRtcAec_Init(void* aecInst, int32_t sampFreq, int32_t scSampFreq);
+
+/*
+ * Inserts an 80 or 160 sample block of data into the farend buffer.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*          aecInst       Pointer to the AEC instance
+ * const float*   farend        In buffer containing one frame of
+ *                              farend signal for L band
+ * size_t         nrOfSamples   Number of samples in farend buffer
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * int32_t        return        0: OK
+ *                             -1: error
+ */
+int32_t WebRtcAec_BufferFarend(void* aecInst,
+                               const float* farend,
+                               size_t nrOfSamples);
+
+/*
+ * Runs the echo canceller on an 80 or 160 sample block of data.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*         aecInst        Pointer to the AEC instance
+ * const float* const* nearend  In buffer containing one frame of
+ *                              nearend+echo signal for each band
+ * size_t        num_bands      Number of bands in nearend buffer
+ * size_t        nrOfSamples    Number of samples in nearend buffer
+ * int16_t       msInSndCardBuf Delay estimate for sound card and
+ *                              system buffers
+ * int32_t       skew           Difference between number of samples played
+ *                              and recorded at the soundcard (for clock skew
+ *                              compensation)
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * float* const* out            Out buffer, one frame of processed nearend
+ *                              for each band
+ * int32_t       return         0: OK
+ *                             -1: error
+ */
+int32_t WebRtcAec_Process(void* aecInst,
+                          const float* const* nearend,
+                          size_t num_bands,
+                          float* const* out,
+                          size_t nrOfSamples,
+                          int16_t msInSndCardBuf,
+                          int32_t skew);
+
+/*
+ * This function enables the user to set certain parameters on-the-fly.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*          handle        Pointer to the AEC instance
+ * AecConfig      config        Config instance that contains all
+ *                              properties to be set
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * int            return         0: OK
+ *                              -1: error
+ */
+int WebRtcAec_set_config(void* handle, AecConfig config);
+
+/*
+ * Gets the current echo status of the nearend signal.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*          handle        Pointer to the AEC instance
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * int*           status        0: Almost certainly nearend single-talk
+ *                              1: Might not be nearend single-talk
+ * int            return         0: OK
+ *                              -1: error
+ */
+int WebRtcAec_get_echo_status(void* handle, int* status);
+
+/*
+ * Gets the current echo metrics for the session.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*          handle        Pointer to the AEC instance
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * AecMetrics*    metrics       Struct which will be filled out with the
+ *                              current echo metrics.
+ * int            return         0: OK
+ *                              -1: error
+ */
+int WebRtcAec_GetMetrics(void* handle, AecMetrics* metrics);
+
+/*
+ * Gets the current delay metrics for the session.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*   handle               Pointer to the AEC instance
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * int*    median               Delay median value.
+ * int*    std                  Delay standard deviation.
+ * float*  fraction_poor_delays Fraction of the delay estimates that may
+ *                              cause the AEC to perform poorly.
+ *
+ * int     return                0: OK
+ *                              -1: error
+ */
+int WebRtcAec_GetDelayMetrics(void* handle,
+                              int* median,
+                              int* std,
+                              float* fraction_poor_delays);
+
+/*
+ * Gets the last error code.
+ *
+ * Inputs                       Description
+ * -------------------------------------------------------------------
+ * void*          aecInst       Pointer to the AEC instance
+ *
+ * Outputs                      Description
+ * -------------------------------------------------------------------
+ * int32_t        return        12000-12100: error code
+ */
+int32_t WebRtcAec_get_error_code(void* aecInst);
+
+// Returns a pointer to the low level AEC handle.
+//
+// Input:
+//  - handle                    : Pointer to the AEC instance.
+//
+// Return value:
+//  - AecCore pointer           : NULL for error.
+//
+struct AecCore* WebRtcAec_aec_core(void* handle);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_
diff --git a/webrtc/modules/audio_processing/aec/system_delay_unittest.cc b/webrtc/modules/audio_processing/aec/system_delay_unittest.cc
new file mode 100644
index 0000000000..07e3cf8add
--- /dev/null
+++ b/webrtc/modules/audio_processing/aec/system_delay_unittest.cc
@@ -0,0 +1,602 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "testing/gtest/include/gtest/gtest.h"
+extern "C" {
+#include "webrtc/modules/audio_processing/aec/aec_core.h"
+}
+#include "webrtc/modules/audio_processing/aec/echo_cancellation_internal.h"
+#include "webrtc/modules/audio_processing/aec/include/echo_cancellation.h"
+#include "webrtc/test/testsupport/gtest_disable.h"
+#include "webrtc/typedefs.h"
+
+namespace {
+
+class SystemDelayTest : public ::testing::Test {
+ protected:
+  SystemDelayTest();
+  virtual void SetUp();
+  virtual void TearDown();
+
+  // Initialization of AEC handle with respect to |sample_rate_hz|. Since the
+  // device sample rate is unimportant we set that value to 48000 Hz.
+  void Init(int sample_rate_hz);
+
+  // Makes one render call and one capture call in that specific order.
+  void RenderAndCapture(int device_buffer_ms);
+
+  // Fills up the far-end buffer with respect to the default device buffer size.
+  size_t BufferFillUp();
+
+  // Runs and verifies the behavior in a stable startup procedure.
+  void RunStableStartup();
+
+  // Maps buffer size in ms into samples, taking the unprocessed frame into
+  // account.
+  int MapBufferSizeToSamples(int size_in_ms, bool extended_filter);
+
+  void* handle_;
+  Aec* self_;
+  size_t samples_per_frame_;
+  // Dummy input/output speech data.
+  static const int kSamplesPerChunk = 160;
+  float far_[kSamplesPerChunk];
+  float near_[kSamplesPerChunk];
+  float out_[kSamplesPerChunk];
+  const float* near_ptr_;
+  float* out_ptr_;
+};
+
+SystemDelayTest::SystemDelayTest()
+    : handle_(NULL), self_(NULL), samples_per_frame_(0) {
+  // Dummy input data are set with more or less arbitrary non-zero values.
+  for (int i = 0; i < kSamplesPerChunk; i++) {
+    far_[i] = 257.0;
+    near_[i] = 514.0;
+  }
+  memset(out_, 0, sizeof(out_));
+  near_ptr_ = near_;
+  out_ptr_ = out_;
+}
+
+void SystemDelayTest::SetUp() {
+  handle_ = WebRtcAec_Create();
+  ASSERT_TRUE(handle_);
+  self_ = reinterpret_cast<Aec*>(handle_);
+}
+
+void SystemDelayTest::TearDown() {
+  // Free AEC
+  WebRtcAec_Free(handle_);
+  handle_ = NULL;
+}
+
+// In SWB mode nothing is added to the buffer handling with respect to
+// functionality compared to WB. We therefore only verify behavior in NB and WB.
+static const int kSampleRateHz[] = {8000, 16000};
+static const size_t kNumSampleRates =
+    sizeof(kSampleRateHz) / sizeof(*kSampleRateHz);
+
+// Default audio device buffer size used.
+static const int kDeviceBufMs = 100;
+
+// Requirement for a stable device convergence time in ms. Should converge in
+// less than |kStableConvergenceMs|.
+static const int kStableConvergenceMs = 100;
+
+// Maximum convergence time in ms. This means that we should leave the startup
+// phase after |kMaxConvergenceMs| independent of device buffer stability
+// conditions.
+static const int kMaxConvergenceMs = 500;
+
+void SystemDelayTest::Init(int sample_rate_hz) {
+  // Initialize AEC
+  EXPECT_EQ(0, WebRtcAec_Init(handle_, sample_rate_hz, 48000));
+  EXPECT_EQ(0, WebRtcAec_system_delay(self_->aec));
+
+  // One frame equals 10 ms of data.
+  samples_per_frame_ = static_cast<size_t>(sample_rate_hz / 100);
+}
+
+void SystemDelayTest::RenderAndCapture(int device_buffer_ms) {
+  EXPECT_EQ(0, WebRtcAec_BufferFarend(handle_, far_, samples_per_frame_));
+  EXPECT_EQ(0,
+            WebRtcAec_Process(handle_,
+                              &near_ptr_,
+                              1,
+                              &out_ptr_,
+                              samples_per_frame_,
+                              device_buffer_ms,
+                              0));
+}
+
+size_t SystemDelayTest::BufferFillUp() {
+  // To make sure we have a full buffer when we verify stability we first fill
+  // up the far-end buffer with the same amount as we will report in through
+  // Process().
+  size_t buffer_size = 0;
+  for (int i = 0; i < kDeviceBufMs / 10; i++) {
+    EXPECT_EQ(0, WebRtcAec_BufferFarend(handle_, far_, samples_per_frame_));
+    buffer_size += samples_per_frame_;
+    EXPECT_EQ(static_cast<int>(buffer_size),
+              WebRtcAec_system_delay(self_->aec));
+  }
+  return buffer_size;
+}
+
+void SystemDelayTest::RunStableStartup() {
+  // To make sure we have a full buffer when we verify stability we first fill
+  // up the far-end buffer with the same amount as we will report in through
+  // Process().
+  size_t buffer_size = BufferFillUp();
+
+  if (WebRtcAec_delay_agnostic_enabled(self_->aec) == 1) {
+    // In extended_filter mode we set the buffer size after the first processed
+    // 10 ms chunk. Hence, we don't need to wait for the reported system delay
+    // values to become stable.
+    RenderAndCapture(kDeviceBufMs);
+    buffer_size += samples_per_frame_;
+    EXPECT_EQ(0, self_->startup_phase);
+  } else {
+    // A stable device should be accepted and put in a regular process mode
+    // within |kStableConvergenceMs|.
+    int process_time_ms = 0;
+    for (; process_time_ms < kStableConvergenceMs; process_time_ms += 10) {
+      RenderAndCapture(kDeviceBufMs);
+      buffer_size += samples_per_frame_;
+      if (self_->startup_phase == 0) {
+        // We have left the startup phase.
+        break;
+      }
+    }
+    // Verify convergence time.
+    EXPECT_GT(kStableConvergenceMs, process_time_ms);
+  }
+  // Verify that the buffer has been flushed.
+  EXPECT_GE(static_cast<int>(buffer_size),
+            WebRtcAec_system_delay(self_->aec));
+}
+
+int SystemDelayTest::MapBufferSizeToSamples(int size_in_ms,
+                                            bool extended_filter) {
+  // If extended_filter is disabled we add an extra 10 ms for the unprocessed
+  // frame. That is simply how the algorithm is constructed.
+  return static_cast<int>(
+      (size_in_ms + (extended_filter ? 0 : 10)) * samples_per_frame_ / 10);
+}
+
+// The tests should meet basic requirements and not be adjusted to what is
+// actually implemented. If we don't get good code coverage this way we either
+// lack in tests or have unnecessary code.
+// General requirements:
+// 1) If we add far-end data the system delay should be increased with the same
+//    amount we add.
+// 2) If the far-end buffer is full we should flush the oldest data to make room
+//    for the new. In this case the system delay is unaffected.
+// 3) There should exist a startup phase in which the buffer size is to be
+//    determined. In this phase no cancellation should be performed.
+// 4) Under stable conditions (small variations in device buffer sizes) the AEC
+//    should determine an appropriate local buffer size within
+//    |kStableConvergenceMs| ms.
+// 5) Under unstable conditions the AEC should make a decision within
+//    |kMaxConvergenceMs| ms.
+// 6) If the local buffer runs out of data we should stuff the buffer with older
+//    frames.
+// 7) The system delay should within |kMaxConvergenceMs| ms heal from
+//    disturbances like drift, data glitches, toggling events and outliers.
+// 8) The system delay should never become negative.
+
+TEST_F(SystemDelayTest, CorrectIncreaseWhenBufferFarend) {
+  // When we add data to the AEC buffer the internal system delay should be
+  // incremented with the same amount as the size of data.
+  // This process should be independent of DA-AEC and extended_filter mode.
+  for (int extended_filter = 0; extended_filter <= 1; ++extended_filter) {
+    WebRtcAec_enable_extended_filter(self_->aec, extended_filter);
+    EXPECT_EQ(extended_filter, WebRtcAec_extended_filter_enabled(self_->aec));
+    for (int da_aec = 0; da_aec <= 1; ++da_aec) {
+      WebRtcAec_enable_delay_agnostic(self_->aec, da_aec);
+      EXPECT_EQ(da_aec, WebRtcAec_delay_agnostic_enabled(self_->aec));
+      for (size_t i = 0; i < kNumSampleRates; i++) {
+        Init(kSampleRateHz[i]);
+        // Loop through a couple of calls to make sure the system delay
+        // increments correctly.
+        for (int j = 1; j <= 5; j++) {
+          EXPECT_EQ(0,
+                    WebRtcAec_BufferFarend(handle_, far_, samples_per_frame_));
+          EXPECT_EQ(static_cast<int>(j * samples_per_frame_),
+                    WebRtcAec_system_delay(self_->aec));
+        }
+      }
+    }
+  }
+}
+
+// TODO(bjornv): Add a test to verify behavior if the far-end buffer is full
+// when adding new data.
+
+TEST_F(SystemDelayTest, CorrectDelayAfterStableStartup) {
+  // We run the system in a stable startup. After that we verify that the system
+  // delay meets the requirements.
+  // This process should be independent of DA-AEC and extended_filter mode.
+  for (int extended_filter = 0; extended_filter <= 1; ++extended_filter) {
+    WebRtcAec_enable_extended_filter(self_->aec, extended_filter);
+    EXPECT_EQ(extended_filter, WebRtcAec_extended_filter_enabled(self_->aec));
+    for (int da_aec = 0; da_aec <= 1; ++da_aec) {
+      WebRtcAec_enable_delay_agnostic(self_->aec, da_aec);
+      EXPECT_EQ(da_aec, WebRtcAec_delay_agnostic_enabled(self_->aec));
+      for (size_t i = 0; i < kNumSampleRates; i++) {
+        Init(kSampleRateHz[i]);
+        RunStableStartup();
+
+        // Verify system delay with respect to requirements, i.e., the
+        // |system_delay| is in the interval [75%, 100%] of what's reported on
+        // the average.
+        // In extended_filter mode we target 50% and measure after one processed
+        // 10 ms chunk.
+        int average_reported_delay =
+            static_cast<int>(kDeviceBufMs * samples_per_frame_ / 10);
+        EXPECT_GE(average_reported_delay, WebRtcAec_system_delay(self_->aec));
+        int lower_bound = WebRtcAec_extended_filter_enabled(self_->aec)
+                              ? average_reported_delay / 2 - samples_per_frame_
+                              : average_reported_delay * 3 / 4;
+        EXPECT_LE(lower_bound, WebRtcAec_system_delay(self_->aec));
+      }
+    }
+  }
+}
+
+TEST_F(SystemDelayTest, CorrectDelayAfterUnstableStartup) {
+  // This test does not apply in extended_filter mode, since we only use the
+  // first 10 ms chunk to determine a reasonable buffer size. Neither does
+  // it apply if DA-AEC is on because that overrides the startup procedure.
+  WebRtcAec_enable_extended_filter(self_->aec, 0);
+  EXPECT_EQ(0, WebRtcAec_extended_filter_enabled(self_->aec));
+  WebRtcAec_enable_delay_agnostic(self_->aec, 0);
+  EXPECT_EQ(0, WebRtcAec_delay_agnostic_enabled(self_->aec));
+
+  // In an unstable system we would start processing after |kMaxConvergenceMs|.
+  // On the last frame the AEC buffer is adjusted to 60% of the last reported
+  // device buffer size.
+  // We construct an unstable system by altering the device buffer size between
+  // two values |kDeviceBufMs| +- 25 ms.
+  for (size_t i = 0; i < kNumSampleRates; i++) {
+    Init(kSampleRateHz[i]);
+
+    // To make sure we have a full buffer when we verify stability we first fill
+    // up the far-end buffer with the same amount as we will report in on the
+    // average through Process().
+    size_t buffer_size = BufferFillUp();
+
+    int buffer_offset_ms = 25;
+    int reported_delay_ms = 0;
+    int process_time_ms = 0;
+    for (; process_time_ms <= kMaxConvergenceMs; process_time_ms += 10) {
+      reported_delay_ms = kDeviceBufMs + buffer_offset_ms;
+      RenderAndCapture(reported_delay_ms);
+      buffer_size += samples_per_frame_;
+      buffer_offset_ms = -buffer_offset_ms;
+      if (self_->startup_phase == 0) {
+        // We have left the startup phase.
+        break;
+      }
+    }
+    // Verify convergence time.
+    EXPECT_GE(kMaxConvergenceMs, process_time_ms);
+    // Verify that the buffer has been flushed.
+    EXPECT_GE(static_cast<int>(buffer_size),
+              WebRtcAec_system_delay(self_->aec));
+
+    // Verify system delay with respect to requirements, i.e., the
+    // |system_delay| is in the interval [60%, 100%] of what's last reported.
+    EXPECT_GE(static_cast<int>(reported_delay_ms * samples_per_frame_ / 10),
+              WebRtcAec_system_delay(self_->aec));
+    EXPECT_LE(
+        static_cast<int>(reported_delay_ms * samples_per_frame_ / 10 * 3 / 5),
+        WebRtcAec_system_delay(self_->aec));
+  }
+}
+
+TEST_F(SystemDelayTest, CorrectDelayAfterStableBufferBuildUp) {
+  // This test does not apply in extended_filter mode, since we only use the
+  // first 10 ms chunk to determine a reasonable buffer size. Neither does
+  // it apply if DA-AEC is on because that overrides the startup procedure.
+  WebRtcAec_enable_extended_filter(self_->aec, 0);
+  EXPECT_EQ(0, WebRtcAec_extended_filter_enabled(self_->aec));
+  WebRtcAec_enable_delay_agnostic(self_->aec, 0);
+  EXPECT_EQ(0, WebRtcAec_delay_agnostic_enabled(self_->aec));
+
+  // In this test we start by establishing the device buffer size during stable
+  // conditions, but with an empty internal far-end buffer. Once that is done we
+  // verify that the system delay is increased correctly until we have reached
+  // an internal buffer size of 75% of what's been reported.
+  for (size_t i = 0; i < kNumSampleRates; i++) {
+    Init(kSampleRateHz[i]);
+
+    // We assume that running |kStableConvergenceMs| calls will put the
+    // algorithm in a state where the device buffer size has been determined. We
+    // can make that assumption since we have a separate stability test.
+    int process_time_ms = 0;
+    for (; process_time_ms < kStableConvergenceMs; process_time_ms += 10) {
+      EXPECT_EQ(0,
+                WebRtcAec_Process(handle_,
+                                  &near_ptr_,
+                                  1,
+                                  &out_ptr_,
+                                  samples_per_frame_,
+                                  kDeviceBufMs,
+                                  0));
+    }
+    // Verify that a buffer size has been established.
+    EXPECT_EQ(0, self_->checkBuffSize);
+
+    // We now have established the required buffer size. Let us verify that we
+    // fill up before leaving the startup phase for normal processing.
+    size_t buffer_size = 0;
+    size_t target_buffer_size = kDeviceBufMs * samples_per_frame_ / 10 * 3 / 4;
+    process_time_ms = 0;
+    for (; process_time_ms <= kMaxConvergenceMs; process_time_ms += 10) {
+      RenderAndCapture(kDeviceBufMs);
+      buffer_size += samples_per_frame_;
+      if (self_->startup_phase == 0) {
+        // We have left the startup phase.
+        break;
+      }
+    }
+    // Verify convergence time.
+    EXPECT_GT(kMaxConvergenceMs, process_time_ms);
+    // Verify that the buffer has reached the desired size.
+    EXPECT_LE(static_cast<int>(target_buffer_size),
+              WebRtcAec_system_delay(self_->aec));
+
+    // Verify normal behavior (system delay is kept constant) after startup by
+    // running a couple of calls to BufferFarend() and Process().
+    for (int j = 0; j < 6; j++) {
+      int system_delay_before_calls = WebRtcAec_system_delay(self_->aec);
+      RenderAndCapture(kDeviceBufMs);
+      EXPECT_EQ(system_delay_before_calls, WebRtcAec_system_delay(self_->aec));
+    }
+  }
+}
+
+TEST_F(SystemDelayTest, CorrectDelayWhenBufferUnderrun) {
+  // Here we test a buffer under run scenario. If we keep on calling
+  // WebRtcAec_Process() we will finally run out of data, but should
+  // automatically stuff the buffer. We verify this behavior by checking if the
+  // system delay goes negative.
+  // This process should be independent of DA-AEC and extended_filter mode.
+  for (int extended_filter = 0; extended_filter <= 1; ++extended_filter) {
+    WebRtcAec_enable_extended_filter(self_->aec, extended_filter);
+    EXPECT_EQ(extended_filter, WebRtcAec_extended_filter_enabled(self_->aec));
+    for (int da_aec = 0; da_aec <= 1; ++da_aec) {
+      WebRtcAec_enable_delay_agnostic(self_->aec, da_aec);
+      EXPECT_EQ(da_aec, WebRtcAec_delay_agnostic_enabled(self_->aec));
+      for (size_t i = 0; i < kNumSampleRates; i++) {
+        Init(kSampleRateHz[i]);
+        RunStableStartup();
+
+        // The AEC has now left the Startup phase. We now have at most
+        // |kStableConvergenceMs| in the buffer. Keep on calling Process() until
+        // we run out of data and verify that the system delay is non-negative.
+        for (int j = 0; j <= kStableConvergenceMs; j += 10) {
+          EXPECT_EQ(0, WebRtcAec_Process(handle_, &near_ptr_, 1, &out_ptr_,
+                                         samples_per_frame_, kDeviceBufMs, 0));
+          EXPECT_LE(0, WebRtcAec_system_delay(self_->aec));
+        }
+      }
+    }
+  }
+}
+
+TEST_F(SystemDelayTest, CorrectDelayDuringDrift) {
+  // This drift test should verify that the system delay is never exceeding the
+  // device buffer. The drift is simulated by decreasing the reported device
+  // buffer size by 1 ms every 100 ms. If the device buffer size goes below 30
+  // ms we jump (add) 10 ms to give a repeated pattern.
+
+  // This process should be independent of DA-AEC and extended_filter mode.
+  for (int extended_filter = 0; extended_filter <= 1; ++extended_filter) {
+    WebRtcAec_enable_extended_filter(self_->aec, extended_filter);
+    EXPECT_EQ(extended_filter, WebRtcAec_extended_filter_enabled(self_->aec));
+    for (int da_aec = 0; da_aec <= 1; ++da_aec) {
+      WebRtcAec_enable_delay_agnostic(self_->aec, da_aec);
+      EXPECT_EQ(da_aec, WebRtcAec_delay_agnostic_enabled(self_->aec));
+      for (size_t i = 0; i < kNumSampleRates; i++) {
+        Init(kSampleRateHz[i]);
+        RunStableStartup();
+
+        // We have left the startup phase and proceed with normal processing.
+        int jump = 0;
+        for (int j = 0; j < 1000; j++) {
+          // Drift = -1 ms per 100 ms of data.
+          int device_buf_ms = kDeviceBufMs - (j / 10) + jump;
+          int device_buf = MapBufferSizeToSamples(device_buf_ms,
+                                                  extended_filter == 1);
+
+          if (device_buf_ms < 30) {
+            // Add 10 ms data, taking effect next frame.
+            jump += 10;
+          }
+          RenderAndCapture(device_buf_ms);
+
+          // Verify that the system delay does not exceed the device buffer.
+          EXPECT_GE(device_buf, WebRtcAec_system_delay(self_->aec));
+
+          // Verify that the system delay is non-negative.
+          EXPECT_LE(0, WebRtcAec_system_delay(self_->aec));
+        }
+      }
+    }
+  }
+}
+
+TEST_F(SystemDelayTest, ShouldRecoverAfterGlitch) {
+  // This glitch test should verify that the system delay recovers if there is
+  // a glitch in data. The data glitch is constructed as 200 ms of buffering
+  // after which the stable procedure continues. The glitch is never reported by
+  // the device.
+  // The system is said to be in a non-causal state if the difference between
+  // the device buffer and system delay is less than a block (64 samples).
+
+  // This process should be independent of DA-AEC and extended_filter mode.
+  for (int extended_filter = 0; extended_filter <= 1; ++extended_filter) {
+    WebRtcAec_enable_extended_filter(self_->aec, extended_filter);
+    EXPECT_EQ(extended_filter, WebRtcAec_extended_filter_enabled(self_->aec));
+    for (int da_aec = 0; da_aec <= 1; ++da_aec) {
+      WebRtcAec_enable_delay_agnostic(self_->aec, da_aec);
+      EXPECT_EQ(da_aec, WebRtcAec_delay_agnostic_enabled(self_->aec));
+      for (size_t i = 0; i < kNumSampleRates; i++) {
+        Init(kSampleRateHz[i]);
+        RunStableStartup();
+        int device_buf = MapBufferSizeToSamples(kDeviceBufMs,
+                                                extended_filter == 1);
+        // Glitch state.
+        for (int j = 0; j < 20; j++) {
+          EXPECT_EQ(0,
+                    WebRtcAec_BufferFarend(handle_, far_, samples_per_frame_));
+          // No need to verify system delay, since that is done in a separate
+          // test.
+        }
+        // Verify that we are in a non-causal state, i.e.,
+        // |system_delay| > |device_buf|.
+        EXPECT_LT(device_buf, WebRtcAec_system_delay(self_->aec));
+
+        // Recover state. Should recover at least 4 ms of data per 10 ms, hence
+        // a glitch of 200 ms will take at most 200 * 10 / 4 = 500 ms to recover
+        // from.
+        bool non_causal = true;  // We are currently in a non-causal state.
+        for (int j = 0; j < 50; j++) {
+          int system_delay_before = WebRtcAec_system_delay(self_->aec);
+          RenderAndCapture(kDeviceBufMs);
+          int system_delay_after = WebRtcAec_system_delay(self_->aec);
+          // We have recovered if
+          // |device_buf| - |system_delay_after| >= PART_LEN (1 block).
+          // During recovery, |system_delay_after| < |system_delay_before|,
+          // otherwise they are equal.
+          if (non_causal) {
+            EXPECT_LT(system_delay_after, system_delay_before);
+            if (device_buf - system_delay_after >= PART_LEN) {
+              non_causal = false;
+            }
+          } else {
+            EXPECT_EQ(system_delay_before, system_delay_after);
+          }
+          // Verify that the system delay is non-negative.
+          EXPECT_LE(0, WebRtcAec_system_delay(self_->aec));
+        }
+        // Check that we have recovered.
+        EXPECT_FALSE(non_causal);
+      }
+    }
+  }
+}
+
+TEST_F(SystemDelayTest, UnaffectedWhenSpuriousDeviceBufferValues) {
+  // Not applicable in extended_filter mode, where only the first 10 ms chunk
+  // is used to determine a reasonable buffer size; keep that mode disabled.
+  const int extended_filter = 0;
+  WebRtcAec_enable_extended_filter(self_->aec, extended_filter);
+  EXPECT_EQ(extended_filter, WebRtcAec_extended_filter_enabled(self_->aec));
+
+  // The outcome must be the same with and without DA-AEC.
+  for (int da_aec = 0; da_aec <= 1; ++da_aec) {
+    WebRtcAec_enable_delay_agnostic(self_->aec, da_aec);
+    EXPECT_EQ(da_aec, WebRtcAec_delay_agnostic_enabled(self_->aec));
+    // Goal: large outliers in the reported device buffer size must leave the
+    // system delay untouched.
+    // The system counts as non-causal when the device buffer minus the system
+    // delay is less than one block (64 samples).
+    for (size_t rate_idx = 0; rate_idx < kNumSampleRates; ++rate_idx) {
+      Init(kSampleRateHz[rate_idx]);
+      RunStableStartup();
+      const int device_buf =
+          MapBufferSizeToSamples(kDeviceBufMs, extended_filter == 1);
+
+      // We start out in the normal, i.e. not non-causal, state.
+      bool non_causal = false;
+
+      // 100 frames make up 1 s; every 10th frame (100 ms) reports 500 ms.
+      for (int frame = 0; frame < 100; ++frame) {
+        int system_delay_before_calls = WebRtcAec_system_delay(self_->aec);
+        const int reported_ms = (frame % 10 != 0) ? kDeviceBufMs : 500;
+        RenderAndCapture(reported_ms);
+
+        // Latch non-causality; the flag is never cleared once set.
+        non_causal |=
+            (device_buf - WebRtcAec_system_delay(self_->aec) < PART_LEN);
+        EXPECT_FALSE(non_causal);
+        EXPECT_EQ(system_delay_before_calls,
+                  WebRtcAec_system_delay(self_->aec));
+
+        // The system delay must never go negative.
+        EXPECT_LE(0, WebRtcAec_system_delay(self_->aec));
+      }
+    }
+  }
+}
+
+TEST_F(SystemDelayTest, CorrectImpactWhenTogglingDeviceBufferValues) {
+  // This test aims at verifying that the system delay is "unaffected" by
+  // toggling values reported by the device.
+  // The test is constructed such that the device buffer value alternates
+  // between zero and 2 * |kDeviceBufMs|, hence the size is constant on
+  // average. The zero values will force us into a non-causal state, thereby
+  // lowering the system delay until we basically run out of data. Once that
+  // happens the buffer will be stuffed.
+  // TODO(bjornv): This test will have a better impact if we verified that the
+  // delay estimate goes up when the system delay goes down to meet the average
+  // device buffer size.
+
+  // This test does not apply if DA-AEC is enabled and extended_filter mode
+  // disabled.
+  for (int extended_filter = 0; extended_filter <= 1; ++extended_filter) {
+    WebRtcAec_enable_extended_filter(self_->aec, extended_filter);
+    EXPECT_EQ(extended_filter, WebRtcAec_extended_filter_enabled(self_->aec));
+    for (int da_aec = 0; da_aec <= 1; ++da_aec) {
+      WebRtcAec_enable_delay_agnostic(self_->aec, da_aec);
+      EXPECT_EQ(da_aec, WebRtcAec_delay_agnostic_enabled(self_->aec));
+      if (extended_filter == 0 && da_aec == 1) {
+        continue;
+      }
+      for (size_t i = 0; i < kNumSampleRates; i++) {
+        Init(kSampleRateHz[i]);
+        RunStableStartup();
+        const int device_buf = MapBufferSizeToSamples(kDeviceBufMs,
+                                                      extended_filter == 1);
+
+        // Normal state. We are currently not in a non-causal state.
+        bool non_causal = false;
+
+        // Loop through 100 frames (both render and capture), which equals 1 s
+        // of data. Every odd frame we set the device buffer size to
+        // 2 * |kDeviceBufMs| and even frames we set the device buffer size to
+        // zero.
+        for (int j = 0; j < 100; j++) {
+          int system_delay_before_calls = WebRtcAec_system_delay(self_->aec);
+          int device_buf_ms = 2 * (j % 2) * kDeviceBufMs;
+          RenderAndCapture(device_buf_ms);
+
+          // Check for non-causality against the average device buffer size.
+          non_causal |=
+              (device_buf - WebRtcAec_system_delay(self_->aec) < PART_LEN);
+          EXPECT_GE(system_delay_before_calls,
+                    WebRtcAec_system_delay(self_->aec));
+
+          // Verify that the system delay is non-negative.
+          EXPECT_LE(0, WebRtcAec_system_delay(self_->aec));
+        }
+        // Verify we are not in a non-causal state.
+        EXPECT_FALSE(non_causal);
+      }
+    }
+  }
+}
+
+}  // namespace
author	Chih-hung Hsieh <chh@google.com>	2015-12-01 17:07:48 +0000
committer	android-build-merger <android-build-merger@google.com>	2015-12-01 17:07:48 +0000
commit	a4acd9d6bc9b3b033d7d274316e75ee067df8d20 (patch)
tree	672a185b294789cf991f385c3e395dd63bea9063 /webrtc/modules/audio_processing/aec
parent	3681b90ba4fe7a27232dd3e27897d5d7ed9d651c (diff)
parent	fe8b4a657979b49e1701bd92f6d5814a99e0b2be (diff)
download	webrtc-a4acd9d6bc9b3b033d7d274316e75ee067df8d20.tar.gz