From 3c3902f0ac13428394f14f78f0fab05ef3468d69 Mon Sep 17 00:00:00 2001
From: "tlegrand@google.com"
 <tlegrand@google.com@0039d316-1c4b-4281-b951-d872f2087c98>
Date: Mon, 9 Dec 2013 08:35:25 +0000
Subject: Updating Opus to release 1.1

opus-1.1.tar.gz downloaded from http://www.opus-codec.org/

R=sergeyu@chromium.org

Review URL: https://codereview.chromium.org/107243004

git-svn-id: svn://svn.chromium.org/chrome/trunk/deps/third_party/opus@239448 0039d316-1c4b-4281-b951-d872f2087c98
---
 celt/_kiss_fft_guts.h           |   4 +-
 celt/arch.h                     |   7 +-
 celt/arm/arm_celt_map.c         |  49 ++++
 celt/arm/armcpu.c               |  18 +-
 celt/arm/armcpu.h               |  42 +++-
 celt/arm/armopts.s.in           |  37 +++
 celt/arm/celt_pitch_xcorr_arm.s | 545 ++++++++++++++++++++++++++++++++++++++++
 celt/arm/fixed_armv4.h          |   4 +-
 celt/arm/fixed_armv5e.h         |  10 +-
 celt/arm/pitch_arm.h            |  57 +++++
 celt/bands.c                    |  28 ++-
 celt/celt.h                     |  15 +-
 celt/celt_decoder.c             |  18 +-
 celt/celt_encoder.c             | 102 +++++---
 celt/celt_lpc.c                 |   5 +-
 celt/celt_lpc.h                 |   3 +-
 celt/cpu_support.h              |   7 +-
 celt/cwrs.c                     |  12 +-
 celt/ecintrin.h                 |   2 +-
 celt/entcode.h                  |   9 +-
 celt/fixed_debug.h              |  68 ++---
 celt/fixed_generic.h            |   2 +-
 celt/float_cast.h               |   4 +-
 celt/mathops.c                  |   2 +-
 celt/mathops.h                  |  22 +-
 celt/os_support.h               |   9 +-
 celt/pitch.c                    |  15 +-
 celt/pitch.h                    |  38 ++-
 celt/quant_bands.c              |   6 +-
 celt/rate.c                     |   2 +-
 celt/rate.h                     |   6 +-
 celt/stack_alloc.h              |   9 +-
 celt/x86/pitch_sse.h            |   6 +-
 33 files changed, 999 insertions(+), 164 deletions(-)
 create mode 100644 celt/arm/arm_celt_map.c
 create mode 100644 celt/arm/armopts.s.in
 create mode 100644 celt/arm/celt_pitch_xcorr_arm.s
 create mode 100644 celt/arm/pitch_arm.h

(limited to 'celt')

diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h
index f0c6976..aefe490 100644
--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -94,11 +94,11 @@
     do {(res).r = ADD32((res).r,(a).r);  (res).i = SUB32((res).i,(a).i); \
     }while(0)
 
-#if defined(ARMv4_ASM)
+#if defined(OPUS_ARM_INLINE_ASM)
 #include "arm/kiss_fft_armv4.h"
 #endif
 
-#if defined(ARMv5E_ASM)
+#if defined(OPUS_ARM_INLINE_EDSP)
 #include "arm/kiss_fft_armv5e.h"
 #endif
 
diff --git a/celt/arch.h b/celt/arch.h
index f9c9856..3bbcd36 100644
--- a/celt/arch.h
+++ b/celt/arch.h
@@ -35,6 +35,7 @@
 #define ARCH_H
 
 #include "opus_types.h"
+#include "opus_defines.h"
 
 # if !defined(__GNUC_PREREQ)
 #  if defined(__GNUC__)&&defined(__GNUC_MINOR__)
@@ -54,7 +55,7 @@
 #ifdef __GNUC__
 __attribute__((noreturn))
 #endif
-static inline void _celt_fatal(const char *str, const char *file, int line)
+static OPUS_INLINE void _celt_fatal(const char *str, const char *file, int line)
 {
    fprintf (stderr, "Fatal (internal) error in %s, line %d: %s\n", file, line, str);
    abort();
@@ -113,9 +114,9 @@ typedef opus_val32 celt_ener;
 
 #include "fixed_generic.h"
 
-#ifdef ARMv5E_ASM
+#ifdef OPUS_ARM_INLINE_EDSP
 #include "arm/fixed_armv5e.h"
-#elif defined (ARMv4_ASM)
+#elif defined (OPUS_ARM_INLINE_ASM)
 #include "arm/fixed_armv4.h"
 #elif defined (BFIN_ASM)
 #include "fixed_bfin.h"
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
new file mode 100644
index 0000000..547a84d
--- /dev/null
+++ b/celt/arm/arm_celt_map.c
@@ -0,0 +1,49 @@
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "pitch.h"
+
+#if defined(OPUS_HAVE_RTCD)
+
+# if defined(FIXED_POINT)
+opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+    const opus_val16 *, opus_val32 *, int , int) = {
+  celt_pitch_xcorr_c,               /* ARMv4 */
+  MAY_HAVE_EDSP(celt_pitch_xcorr),  /* EDSP */
+  MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
+  MAY_HAVE_NEON(celt_pitch_xcorr)   /* NEON */
+};
+# else
+#  error "Floating-point implementation is not supported by ARM asm yet." \
+ "Reconfigure with --disable-rtcd or send patches."
+# endif
+
+#endif
diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c
index aabcc71..1768525 100644
--- a/celt/arm/armcpu.c
+++ b/celt/arm/armcpu.c
@@ -49,13 +49,13 @@
 # define WIN32_EXTRA_LEAN
 # include <windows.h>
 
-static inline opus_uint32 opus_cpu_capabilities(void){
+static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){
   opus_uint32 flags;
   flags=0;
-  /* MSVC has no inline __asm support for ARM, but it does let you __emit
+  /* MSVC has no OPUS_INLINE __asm support for ARM, but it does let you __emit
    * instructions via their assembled hex code.
    * All of these instructions should be essentially nops. */
-# if defined(ARMv5E_ASM)
+# if defined(OPUS_ARM_MAY_HAVE_EDSP)
   __try{
     /*PLD [r13]*/
     __emit(0xF5DDF000);
@@ -64,7 +64,7 @@ static inline opus_uint32 opus_cpu_capabilities(void){
   __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
     /*Ignore exception.*/
   }
-#  if defined(ARMv6E_ASM)
+#  if defined(OPUS_ARM_MAY_HAVE_MEDIA)
   __try{
     /*SHADD8 r3,r3,r3*/
     __emit(0xE6333F93);
@@ -73,7 +73,7 @@ static inline opus_uint32 opus_cpu_capabilities(void){
   __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
     /*Ignore exception.*/
   }
-#   if defined(ARM_HAVE_NEON)
+#   if defined(OPUS_ARM_MAY_HAVE_NEON)
   __try{
     /*VORR q0,q0,q0*/
     __emit(0xF2200150);
@@ -107,19 +107,26 @@ opus_uint32 opus_cpu_capabilities(void)
 
     while(fgets(buf, 512, cpuinfo) != NULL)
     {
+# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON)
       /* Search for edsp and neon flag */
       if(memcmp(buf, "Features", 8) == 0)
       {
         char *p;
+#  if defined(OPUS_ARM_MAY_HAVE_EDSP)
         p = strstr(buf, " edsp");
         if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
           flags |= OPUS_CPU_ARM_EDSP;
+#  endif
 
+#  if defined(OPUS_ARM_MAY_HAVE_NEON)
         p = strstr(buf, " neon");
         if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
           flags |= OPUS_CPU_ARM_NEON;
+#  endif
       }
+# endif
 
+# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
       /* Search for media capabilities (>= ARMv6) */
       if(memcmp(buf, "CPU architecture:", 17) == 0)
       {
@@ -129,6 +136,7 @@ opus_uint32 opus_cpu_capabilities(void)
         if(version >= 6)
           flags |= OPUS_CPU_ARM_MEDIA;
       }
+# endif
     }
 
     fclose(cpuinfo);
diff --git a/celt/arm/armcpu.h b/celt/arm/armcpu.h
index 68d80fe..ac57446 100644
--- a/celt/arm/armcpu.h
+++ b/celt/arm/armcpu.h
@@ -25,11 +25,47 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-/* Original code from libtheora modified to suit to Opus */
+#if !defined(ARMCPU_H)
+# define ARMCPU_H
 
-#ifndef ARMCPU_H
-#define ARMCPU_H
+# if defined(OPUS_ARM_MAY_HAVE_EDSP)
+#  define MAY_HAVE_EDSP(name) name ## _edsp
+# else
+#  define MAY_HAVE_EDSP(name) name ## _c
+# endif
 
+# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
+#  define MAY_HAVE_MEDIA(name) name ## _media
+# else
+#  define MAY_HAVE_MEDIA(name) MAY_HAVE_EDSP(name)
+# endif
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON)
+#  define MAY_HAVE_NEON(name) name ## _neon
+# else
+#  define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name)
+# endif
+
+# if defined(OPUS_ARM_PRESUME_EDSP)
+#  define PRESUME_EDSP(name) name ## _edsp
+# else
+#  define PRESUME_EDSP(name) name ## _c
+# endif
+
+# if defined(OPUS_ARM_PRESUME_MEDIA)
+#  define PRESUME_MEDIA(name) name ## _media
+# else
+#  define PRESUME_MEDIA(name) PRESUME_EDSP(name)
+# endif
+
+# if defined(OPUS_ARM_PRESUME_NEON)
+#  define PRESUME_NEON(name) name ## _neon
+# else
+#  define PRESUME_NEON(name) PRESUME_MEDIA(name)
+# endif
+
+# if defined(OPUS_HAVE_RTCD)
 int opus_select_arch(void);
+# endif
 
 #endif
diff --git a/celt/arm/armopts.s.in b/celt/arm/armopts.s.in
new file mode 100644
index 0000000..3d8aaf2
--- /dev/null
+++ b/celt/arm/armopts.s.in
@@ -0,0 +1,37 @@
+/* Copyright (C) 2013 Mozilla Corporation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+; Set the following to 1 if we have EDSP instructions
+;  (LDRD/STRD, etc., ARMv5E and later).
+OPUS_ARM_MAY_HAVE_EDSP  * @OPUS_ARM_MAY_HAVE_EDSP@
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OPUS_ARM_MAY_HAVE_MEDIA * @OPUS_ARM_MAY_HAVE_MEDIA@
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OPUS_ARM_MAY_HAVE_NEON  * @OPUS_ARM_MAY_HAVE_NEON@
+
+END
diff --git a/celt/arm/celt_pitch_xcorr_arm.s b/celt/arm/celt_pitch_xcorr_arm.s
new file mode 100644
index 0000000..09917b1
--- /dev/null
+++ b/celt/arm/celt_pitch_xcorr_arm.s
@@ -0,0 +1,545 @@
+; Copyright (c) 2007-2008 CSIRO
+; Copyright (c) 2007-2009 Xiph.Org Foundation
+; Copyright (c) 2013      Parrot
+; Written by Aurélien Zanelli
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+;
+; - Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; - Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  AREA  |.text|, CODE, READONLY
+
+  GET    celt/arm/armopts.s
+
+IF OPUS_ARM_MAY_HAVE_EDSP
+  EXPORT celt_pitch_xcorr_edsp
+ENDIF
+
+IF OPUS_ARM_MAY_HAVE_NEON
+  EXPORT celt_pitch_xcorr_neon
+ENDIF
+
+IF OPUS_ARM_MAY_HAVE_NEON
+
+; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
+xcorr_kernel_neon PROC
+  ; input:
+  ;   r3     = int         len
+  ;   r4     = opus_val16 *x
+  ;   r5     = opus_val16 *y
+  ;   q0     = opus_val32  sum[4]
+  ; output:
+  ;   q0     = opus_val32  sum[4]
+  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
+  ; internal usage:
+  ;   r12 = int j
+  ;   d3  = y_3|y_2|y_1|y_0
+  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
+  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
+  ;   q8  = scratch
+  ;
+  ; Load y[0...3]
+  ; This requires len>0 to always be valid (which we assert in the C code).
+  VLD1.16      {d5}, [r5]!
+  SUBS         r12, r3, #8
+  BLE xcorr_kernel_neon_process4
+; Process 8 samples at a time.
+; This loop loads one y value more than we actually need. Therefore we have to
+; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
+; reading past the end of the array.
+xcorr_kernel_neon_process8
+  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
+  ; - 2 cycles of ARM insrtuctions,
+  ; - 10 cycles of load/store/byte permute instructions, and
+  ; - 9 cycles of data processing instructions.
+  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
+  ; latter two categories, meaning the whole loop should run in 10 cycles per
+  ; iteration, barring cache misses.
+  ;
+  ; Load x[0...7]
+  VLD1.16      {d6, d7}, [r4]!
+  ; Unlike VMOV, VAND is a data processsing instruction (and doesn't get
+  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
+  VAND         d3, d5, d5
+  SUBS         r12, r12, #8
+  ; Load y[4...11]
+  VLD1.16      {d4, d5}, [r5]!
+  VMLAL.S16    q0, d3, d6[0]
+  VEXT.16      d16, d3, d4, #1
+  VMLAL.S16    q0, d4, d7[0]
+  VEXT.16      d17, d4, d5, #1
+  VMLAL.S16    q0, d16, d6[1]
+  VEXT.16      d16, d3, d4, #2
+  VMLAL.S16    q0, d17, d7[1]
+  VEXT.16      d17, d4, d5, #2
+  VMLAL.S16    q0, d16, d6[2]
+  VEXT.16      d16, d3, d4, #3
+  VMLAL.S16    q0, d17, d7[2]
+  VEXT.16      d17, d4, d5, #3
+  VMLAL.S16    q0, d16, d6[3]
+  VMLAL.S16    q0, d17, d7[3]
+  BGT xcorr_kernel_neon_process8
+; Process 4 samples here if we have > 4 left (still reading one extra y value).
+xcorr_kernel_neon_process4
+  ADDS         r12, r12, #4
+  BLE xcorr_kernel_neon_process2
+  ; Load x[0...3]
+  VLD1.16      d6, [r4]!
+  ; Use VAND since it's a data processing instruction again.
+  VAND         d4, d5, d5
+  SUB          r12, r12, #4
+  ; Load y[4...7]
+  VLD1.16      d5, [r5]!
+  VMLAL.S16    q0, d4, d6[0]
+  VEXT.16      d16, d4, d5, #1
+  VMLAL.S16    q0, d16, d6[1]
+  VEXT.16      d16, d4, d5, #2
+  VMLAL.S16    q0, d16, d6[2]
+  VEXT.16      d16, d4, d5, #3
+  VMLAL.S16    q0, d16, d6[3]
+; Process 2 samples here if we have > 2 left (still reading one extra y value).
+xcorr_kernel_neon_process2
+  ADDS         r12, r12, #2
+  BLE xcorr_kernel_neon_process1
+  ; Load x[0...1]
+  VLD2.16      {d6[],d7[]}, [r4]!
+  ; Use VAND since it's a data processing instruction again.
+  VAND         d4, d5, d5
+  SUB          r12, r12, #2
+  ; Load y[4...5]
+  VLD1.32      {d5[]}, [r5]!
+  VMLAL.S16    q0, d4, d6
+  VEXT.16      d16, d4, d5, #1
+  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
+  ; instead of VEXT, since it's a data-processing instruction.
+  VSRI.64      d5, d4, #32
+  VMLAL.S16    q0, d16, d7
+; Process 1 sample using the extra y value we loaded above.
+xcorr_kernel_neon_process1
+  ; Load next *x
+  VLD1.16      {d6[]}, [r4]!
+  ADDS         r12, r12, #1
+  ; y[0...3] are left in d5 from prior iteration(s) (if any)
+  VMLAL.S16    q0, d5, d6
+  MOVLE        pc, lr
+; Now process 1 last sample, not reading ahead.
+  ; Load last *y
+  VLD1.16      {d4[]}, [r5]!
+  VSRI.64      d4, d5, #16
+  ; Load last *x
+  VLD1.16      {d6[]}, [r4]!
+  VMLAL.S16    q0, d4, d6
+  MOV          pc, lr
+  ENDP
+
+; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
+;  opus_val32 *xcorr, int len, int max_pitch)
+celt_pitch_xcorr_neon PROC
+  ; input:
+  ;   r0  = opus_val16 *_x
+  ;   r1  = opus_val16 *_y
+  ;   r2  = opus_val32 *xcorr
+  ;   r3  = int         len
+  ; output:
+  ;   r0  = int         maxcorr
+  ; internal usage:
+  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
+  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
+  ;   r6  = int         max_pitch
+  ;   r12 = int         j
+  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
+  STMFD        sp!, {r4-r6, lr}
+  LDR          r6, [sp, #16]
+  VMOV.S32     q15, #1
+  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
+  SUBS         r6, r6, #4
+  BLT celt_pitch_xcorr_neon_process4_done
+celt_pitch_xcorr_neon_process4
+  ; xcorr_kernel_neon parameters:
+  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
+  MOV          r4, r0
+  MOV          r5, r1
+  VEOR         q0, q0, q0
+  ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
+  ; So we don't save/restore any other registers.
+  BL xcorr_kernel_neon
+  SUBS         r6, r6, #4
+  VST1.32      {q0}, [r2]!
+  ; _y += 4
+  ADD          r1, r1, #8
+  VMAX.S32     q15, q15, q0
+  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
+  BGE celt_pitch_xcorr_neon_process4
+; We have less than 4 sums left to compute.
+celt_pitch_xcorr_neon_process4_done
+  ADDS         r6, r6, #4
+  ; Reduce maxcorr to a single value
+  VMAX.S32     d30, d30, d31
+  VPMAX.S32    d30, d30, d30
+  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
+  BLE celt_pitch_xcorr_neon_done
+; Now compute each remaining sum one at a time.
+celt_pitch_xcorr_neon_process_remaining
+  MOV          r4, r0
+  MOV          r5, r1
+  VMOV.I32     q0, #0
+  SUBS         r12, r3, #8
+  BLT celt_pitch_xcorr_neon_process_remaining4
+; Sum terms 8 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop8
+  ; Load x[0...7]
+  VLD1.16      {q1}, [r4]!
+  ; Load y[0...7]
+  VLD1.16      {q2}, [r5]!
+  SUBS         r12, r12, #8
+  VMLAL.S16    q0, d4, d2
+  VMLAL.S16    q0, d5, d3
+  BGE celt_pitch_xcorr_neon_process_remaining_loop8
+; Sum terms 4 at a time.
+celt_pitch_xcorr_neon_process_remaining4
+  ADDS         r12, r12, #4
+  BLT celt_pitch_xcorr_neon_process_remaining4_done
+  ; Load x[0...3]
+  VLD1.16      {d2}, [r4]!
+  ; Load y[0...3]
+  VLD1.16      {d3}, [r5]!
+  SUB          r12, r12, #4
+  VMLAL.S16    q0, d3, d2
+celt_pitch_xcorr_neon_process_remaining4_done
+  ; Reduce the sum to a single value.
+  VADD.S32     d0, d0, d1
+  VPADDL.S32   d0, d0
+  ADDS         r12, r12, #4
+  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
+; Sum terms 1 at a time.
+celt_pitch_xcorr_neon_process_remaining_loop1
+  VLD1.16      {d2[]}, [r4]!
+  VLD1.16      {d3[]}, [r5]!
+  SUBS         r12, r12, #1
+  VMLAL.S16    q0, d2, d3
+  BGT celt_pitch_xcorr_neon_process_remaining_loop1
+celt_pitch_xcorr_neon_process_remaining_loop_done
+  VST1.32      {d0[0]}, [r2]!
+  VMAX.S32     d30, d30, d0
+  SUBS         r6, r6, #1
+  ; _y++
+  ADD          r1, r1, #2
+  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
+  BGT celt_pitch_xcorr_neon_process_remaining
+celt_pitch_xcorr_neon_done
+  VMOV.32      r0, d30[0]
+  LDMFD        sp!, {r4-r6, pc}
+  ENDP
+
+ENDIF
+
+IF OPUS_ARM_MAY_HAVE_EDSP
+
+; This will get used on ARMv7 devices without NEON, so it has been optimized
+; to take advantage of dual-issuing where possible.
+xcorr_kernel_edsp PROC
+  ; input:
+  ;   r3      = int         len
+  ;   r4      = opus_val16 *_x (must be 32-bit aligned)
+  ;   r5      = opus_val16 *_y (must be 32-bit aligned)
+  ;   r6...r9 = opus_val32  sum[4]
+  ; output:
+  ;   r6...r9 = opus_val32  sum[4]
+  ; preserved: r0-r5
+  ; internal usage
+  ;   r2      = int         j
+  ;   r12,r14 = opus_val16  x[4]
+  ;   r10,r11 = opus_val16  y[4]
+  STMFD        sp!, {r2,r4,r5,lr}
+  LDR          r10, [r5], #4      ; Load y[0...1]
+  SUBS         r2, r3, #4         ; j = len-4
+  LDR          r11, [r5], #4      ; Load y[2...3]
+  BLE xcorr_kernel_edsp_process4_done
+  LDR          r12, [r4], #4      ; Load x[0...1]
+  ; Stall
+xcorr_kernel_edsp_process4
+  ; The multiplies must issue from pipeline 0, and can't dual-issue with each
+  ; other. Every other instruction here dual-issues with a multiply, and is
+  ; thus "free". There should be no stalls in the body of the loop.
+  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
+  LDR          r14, [r4], #4      ; Load x[2...3]
+  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
+  SUBS         r2, r2, #4         ; j-=4
+  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
+  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
+  SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
+  LDR          r10, [r5], #4      ; Load y[4...5]
+  SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
+  SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
+  SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
+  LDRGT        r12, [r4], #4      ; Load x[0...1]
+  SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
+  SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
+  SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
+  SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
+  SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
+  LDR          r11, [r5], #4      ; Load y[6...7]
+  SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
+  SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
+  SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
+  BGT xcorr_kernel_edsp_process4
+xcorr_kernel_edsp_process4_done
+  ADDS         r2, r2, #4
+  BLE xcorr_kernel_edsp_done
+  LDRH         r12, [r4], #2      ; r12 = *x++
+  SUBS         r2, r2, #1         ; j--
+  ; Stall
+  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
+  LDRGTH       r14, [r4], #2      ; r14 = *x++
+  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
+  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
+  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
+  BLE xcorr_kernel_edsp_done
+  SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
+  SUBS         r2, r2, #1         ; j--
+  SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
+  LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
+  SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
+  LDRGTH       r12, [r4], #2      ; r12 = *x++
+  SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
+  BLE xcorr_kernel_edsp_done
+  SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
+  CMP          r2, #1             ; j--
+  SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
+  LDRH         r2, [r5], #2       ; r2 = y_5 = *y++
+  SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
+  LDRGTH       r14, [r4]          ; r14 = *x
+  SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
+  BLE xcorr_kernel_edsp_done
+  SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
+  LDRH         r11, [r5]          ; r11 = y_6 = *y
+  SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
+  SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
+  SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
+xcorr_kernel_edsp_done
+  LDMFD        sp!, {r2,r4,r5,pc}
+  ENDP
+
+celt_pitch_xcorr_edsp PROC
+  ; input:
+  ;   r0  = opus_val16 *_x (must be 32-bit aligned)
+  ;   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
+  ;   r2  = opus_val32 *xcorr
+  ;   r3  = int         len
+  ; output:
+  ;   r0  = maxcorr
+  ; internal usage
+  ;   r4  = opus_val16 *x
+  ;   r5  = opus_val16 *y
+  ;   r6  = opus_val32  sum0
+  ;   r7  = opus_val32  sum1
+  ;   r8  = opus_val32  sum2
+  ;   r9  = opus_val32  sum3
+  ;   r1  = int         max_pitch
+  ;   r12 = int         j
+  STMFD        sp!, {r4-r11, lr}
+  MOV          r5, r1
+  LDR          r1, [sp, #36]
+  MOV          r4, r0
+  TST          r5, #3
+  ; maxcorr = 1
+  MOV          r0, #1
+  BEQ          celt_pitch_xcorr_edsp_process1u_done
+; Compute one sum at the start to make y 32-bit aligned.
+  SUBS         r12, r3, #4
+  ; r14 = sum = 0
+  MOV          r14, #0
+  LDRH         r8, [r5], #2
+  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
+  LDR          r6, [r4], #4
+  MOV          r8, r8, LSL #16
+celt_pitch_xcorr_edsp_process1u_loop4
+  LDR          r9, [r5], #4
+  SMLABT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
+  LDR          r7, [r4], #4
+  SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
+  LDR          r8, [r5], #4
+  SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
+  SUBS         r12, r12, #4         ; j-=4
+  SMLATB       r14, r7, r8, r14     ; sum = MAC16_16(sum, x_3, y_3)
+  LDRGT        r6, [r4], #4
+  BGT celt_pitch_xcorr_edsp_process1u_loop4
+  MOV          r8, r8, LSR #16
+celt_pitch_xcorr_edsp_process1u_loop4_done
+  ADDS         r12, r12, #4
+celt_pitch_xcorr_edsp_process1u_loop1
+  LDRGEH       r6, [r4], #2
+  ; Stall
+  SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
+  SUBGES       r12, r12, #1
+  LDRGTH       r8, [r5], #2
+  BGT celt_pitch_xcorr_edsp_process1u_loop1
+  ; Restore _x
+  SUB          r4, r4, r3, LSL #1
+  ; Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  ; maxcorr = max(maxcorr, sum)
+  CMP          r0, r14
+  ADD          r5, r5, #2
+  MOVLT        r0, r14
+  SUBS         r1, r1, #1
+  ; xcorr[i] = sum
+  STR          r14, [r2], #4
+  BLE celt_pitch_xcorr_edsp_done
+celt_pitch_xcorr_edsp_process1u_done
+  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
+  SUBS         r1, r1, #4
+  BLT celt_pitch_xcorr_edsp_process2
+celt_pitch_xcorr_edsp_process4
+  ; xcorr_kernel_edsp parameters:
+  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
+  MOV          r6, #0
+  MOV          r7, #0
+  MOV          r8, #0
+  MOV          r9, #0
+  BL xcorr_kernel_edsp  ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
+  ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
+  CMP          r0, r6
+  ; _y+=4
+  ADD          r5, r5, #8
+  MOVLT        r0, r6
+  CMP          r0, r7
+  MOVLT        r0, r7
+  CMP          r0, r8
+  MOVLT        r0, r8
+  CMP          r0, r9
+  MOVLT        r0, r9
+  STMIA        r2!, {r6-r9}
+  SUBS         r1, r1, #4
+  BGE celt_pitch_xcorr_edsp_process4
+celt_pitch_xcorr_edsp_process2
+  ADDS         r1, r1, #2
+  BLT celt_pitch_xcorr_edsp_process1a
+  SUBS         r12, r3, #4
+  ; {r10, r11} = {sum0, sum1} = {0, 0}
+  MOV          r10, #0
+  MOV          r11, #0
+  LDR          r8, [r5], #4
+  BLE celt_pitch_xcorr_edsp_process2_loop_done
+  LDR          r6, [r4], #4
+  LDR          r9, [r5], #4
+celt_pitch_xcorr_edsp_process2_loop4
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r7, [r4], #4
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  SUBS         r12, r12, #4         ; j-=4
+  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
+  LDR          r8, [r5], #4
+  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
+  LDRGT        r6, [r4], #4
+  SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
+  SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
+  SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
+  LDRGT        r9, [r5], #4
+  SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
+  BGT celt_pitch_xcorr_edsp_process2_loop4
+celt_pitch_xcorr_edsp_process2_loop_done
+  ADDS         r12, r12, #2
+  BLE  celt_pitch_xcorr_edsp_process2_1
+  LDR          r6, [r4], #4
+  ; Stall
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r9, [r5], #4
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  SUB          r12, r12, #2
+  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
+  MOV          r8, r9
+  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
+celt_pitch_xcorr_edsp_process2_1
+  LDRH         r6, [r4], #2
+  ADDS         r12, r12, #1
+  ; Stall
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDRGTH       r7, [r4], #2
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  BLE celt_pitch_xcorr_edsp_process2_done
+  LDRH         r9, [r5], #2
+  SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_1)
+  SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_0, y_2)
+celt_pitch_xcorr_edsp_process2_done
+  ; Restore _x
+  SUB          r4, r4, r3, LSL #1
+  ; Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  ; maxcorr = max(maxcorr, sum0)
+  CMP          r0, r10
+  ADD          r5, r5, #2
+  MOVLT        r0, r10
+  SUB          r1, r1, #2
+  ; maxcorr = max(maxcorr, sum1)
+  CMP          r0, r11
+  ; xcorr[i] = sum
+  STR          r10, [r2], #4
+  MOVLT        r0, r11
+  STR          r11, [r2], #4
+celt_pitch_xcorr_edsp_process1a
+  ADDS         r1, r1, #1
+  BLT celt_pitch_xcorr_edsp_done
+  SUBS         r12, r3, #4
+  ; r14 = sum = 0
+  MOV          r14, #0
+  BLT celt_pitch_xcorr_edsp_process1a_loop_done
+  LDR          r6, [r4], #4
+  LDR          r8, [r5], #4
+  LDR          r7, [r4], #4
+  LDR          r9, [r5], #4
+celt_pitch_xcorr_edsp_process1a_loop4
+  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
+  SUBS         r12, r12, #4         ; j-=4
+  SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
+  LDRGE        r6, [r4], #4
+  SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
+  LDRGE        r8, [r5], #4
+  SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
+  LDRGE        r7, [r4], #4
+  LDRGE        r9, [r5], #4
+  BGE celt_pitch_xcorr_edsp_process1a_loop4
+celt_pitch_xcorr_edsp_process1a_loop_done
+  ADDS         r12, r12, #2
+  LDRGE        r6, [r4], #4
+  LDRGE        r8, [r5], #4
+  ; Stall
+  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
+  SUBGE        r12, r12, #2
+  SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
+  ADDS         r12, r12, #1
+  LDRGEH       r6, [r4], #2
+  LDRGEH       r8, [r5], #2
+  ; Stall
+  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
+  ; maxcorr = max(maxcorr, sum)
+  CMP          r0, r14
+  ; xcorr[i] = sum
+  STR          r14, [r2], #4
+  MOVLT        r0, r14
+celt_pitch_xcorr_edsp_done
+  LDMFD        sp!, {r4-r11, pc}
+  ENDP
+
+ENDIF
+
+END
diff --git a/celt/arm/fixed_armv4.h b/celt/arm/fixed_armv4.h
index bcacc34..b690bc8 100644
--- a/celt/arm/fixed_armv4.h
+++ b/celt/arm/fixed_armv4.h
@@ -29,7 +29,7 @@
 
 /** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */
 #undef MULT16_32_Q16
-static inline opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b)
+static OPUS_INLINE opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b)
 {
   unsigned rd_lo;
   int rd_hi;
@@ -46,7 +46,7 @@ static inline opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b)
 
 /** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */
 #undef MULT16_32_Q15
-static inline opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
+static OPUS_INLINE opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
 {
   unsigned rd_lo;
   int rd_hi;
diff --git a/celt/arm/fixed_armv5e.h b/celt/arm/fixed_armv5e.h
index 80632c4..1194a7d 100644
--- a/celt/arm/fixed_armv5e.h
+++ b/celt/arm/fixed_armv5e.h
@@ -34,7 +34,7 @@
 
 /** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */
 #undef MULT16_32_Q16
-static inline opus_val32 MULT16_32_Q16_armv5e(opus_val16 a, opus_val32 b)
+static OPUS_INLINE opus_val32 MULT16_32_Q16_armv5e(opus_val16 a, opus_val32 b)
 {
   int res;
   __asm__(
@@ -50,7 +50,7 @@ static inline opus_val32 MULT16_32_Q16_armv5e(opus_val16 a, opus_val32 b)
 
 /** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */
 #undef MULT16_32_Q15
-static inline opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b)
+static OPUS_INLINE opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b)
 {
   int res;
   __asm__(
@@ -68,7 +68,7 @@ static inline opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b)
     b must fit in 31 bits.
     Result fits in 32 bits. */
 #undef MAC16_32_Q15
-static inline opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
+static OPUS_INLINE opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
  opus_val32 b)
 {
   int res;
@@ -84,7 +84,7 @@ static inline opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
 
 /** 16x16 multiply-add where the result fits in 32 bits */
 #undef MAC16_16
-static inline opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a,
+static OPUS_INLINE opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a,
  opus_val16 b)
 {
   int res;
@@ -100,7 +100,7 @@ static inline opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a,
 
 /** 16x16 multiplication where the result fits in 32 bits */
 #undef MULT16_16
-static inline opus_val32 MULT16_16_armv5e(opus_val16 a, opus_val16 b)
+static OPUS_INLINE opus_val32 MULT16_16_armv5e(opus_val16 a, opus_val16 b)
 {
   int res;
   __asm__(
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
new file mode 100644
index 0000000..a07f8ac
--- /dev/null
+++ b/celt/arm/pitch_arm.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(PITCH_ARM_H)
+# define PITCH_ARM_H
+
+# include "armcpu.h"
+
+# if defined(FIXED_POINT)
+
+#  if defined(OPUS_ARM_MAY_HAVE_NEON)
+opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
+    opus_val32 *xcorr, int len, int max_pitch);
+#  endif
+
+#  if defined(OPUS_ARM_MAY_HAVE_MEDIA)
+#   define celt_pitch_xcorr_media MAY_HAVE_EDSP(celt_pitch_xcorr)
+#  endif
+
+#  if defined(OPUS_ARM_MAY_HAVE_EDSP)
+opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
+    opus_val32 *xcorr, int len, int max_pitch);
+#  endif
+
+#  if !defined(OPUS_HAVE_RTCD)
+#   define OVERRIDE_PITCH_XCORR (1)
+#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
+#  endif
+
+# endif
+
+#endif
diff --git a/celt/bands.c b/celt/bands.c
index 93bd0bc..cce56e2 100644
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -214,7 +214,9 @@ void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
          j=M*eBands[i];
          band_end = M*eBands[i+1];
          lg = ADD16(bandLogE[i+c*m->nbEBands], SHL16((opus_val16)eMeans[i],6));
-#ifdef FIXED_POINT
+#ifndef FIXED_POINT
+         g = celt_exp2(lg);
+#else
          /* Handle the integer part of the log energy */
          shift = 16-(lg>>DB_SHIFT);
          if (shift>31)
@@ -225,9 +227,23 @@ void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
             /* Handle the fractional part. */
             g = celt_exp2_frac(lg&((1<<DB_SHIFT)-1));
          }
-#else
-         g = celt_exp2(lg);
+         /* Handle extreme gains with negative shift. */
+         if (shift<0)
+         {
+            /* For shift < -2 we'd be likely to overflow, so we're capping
+               the gain here. This shouldn't happen unless the bitstream is
+               already corrupted. */
+            if (shift < -2)
+            {
+               g = 32767;
+               shift = -2;
+            }
+            do {
+               *f++ = SHL32(MULT16_16(*x++, g), -shift);
+            } while (++j<band_end);
+         } else
 #endif
+         /* Be careful of the fixed-point "else" just above when changing this code */
          do {
             *f++ = SHR32(MULT16_16(*x++, g), shift);
          } while (++j<band_end);
@@ -492,7 +508,7 @@ int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
          *tapset_decision=0;
    }
    /*printf("%d %d %d\n", hf_sum, *hf_average, *tapset_decision);*/
-   celt_assert(nbBands>0); /*M*(eBands[end]-eBands[end-1]) <= 8 assures this*/
+   celt_assert(nbBands>0); /* end has to be non-zero */
    sum /= nbBands;
    /* Recursive averaging */
    sum = (sum+*average)>>1;
@@ -869,7 +885,6 @@ static unsigned quant_partition(struct band_ctx *ctx, celt_norm *X,
    int q;
    int curr_bits;
    int imid=0, iside=0;
-   int N_B=N;
    int B0=B;
    opus_val16 mid=0, side=0;
    unsigned cm=0;
@@ -891,8 +906,6 @@ static unsigned quant_partition(struct band_ctx *ctx, celt_norm *X,
    spread = ctx->spread;
    ec = ctx->ec;
 
-   N_B /= B;
-
    /* If we need 1.5 more bit than we can produce, split the band in two. */
    cache = m->cache.bits + m->cache.index[(LM+1)*m->nbEBands+i];
    if (LM != -1 && b > cache[cache[0]]+12 && N>2)
@@ -1072,7 +1085,6 @@ static unsigned quant_band(struct band_ctx *ctx, celt_norm *X,
    longBlocks = B0==1;
 
    N_B /= B;
-   N_B0 = N_B;
 
    /* Special case for one sample */
    if (N==1)
diff --git a/celt/celt.h b/celt/celt.h
index cdb76c8..5deea1f 100644
--- a/celt/celt.h
+++ b/celt/celt.h
@@ -66,6 +66,10 @@ typedef struct {
 
 /* Encoder/decoder Requests */
 
+/* Expose this option again when variable framesize actually works */
+#define OPUS_FRAMESIZE_VARIABLE              5010 /**< Optimize the frame size dynamically */
+
+
 #define CELT_SET_PREDICTION_REQUEST    10002
 /** Controls the use of interframe prediction.
     0=Independent frames
@@ -118,7 +122,8 @@ int celt_encoder_get_size(int channels);
 
 int celt_encode_with_ec(OpusCustomEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc);
 
-int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels);
+int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels,
+                      int arch);
 
 
 
@@ -138,7 +143,7 @@ int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned cha
 #ifdef CUSTOM_MODES
 #define OPUS_CUSTOM_NOSTATIC
 #else
-#define OPUS_CUSTOM_NOSTATIC static inline
+#define OPUS_CUSTOM_NOSTATIC static OPUS_INLINE
 #endif
 
 static const unsigned char trim_icdf[11] = {126, 124, 119, 109, 87, 41, 19, 9, 4, 2, 0};
@@ -163,7 +168,7 @@ static const unsigned char fromOpusTable[16] = {
       0x00, 0x08, 0x10, 0x18
 };
 
-static inline int toOpus(unsigned char c)
+static OPUS_INLINE int toOpus(unsigned char c)
 {
    int ret=0;
    if (c<0xA0)
@@ -174,7 +179,7 @@ static inline int toOpus(unsigned char c)
       return ret|(c&0x7);
 }
 
-static inline int fromOpus(unsigned char c)
+static OPUS_INLINE int fromOpus(unsigned char c)
 {
    if (c<0x80)
       return -1;
@@ -190,7 +195,7 @@ extern const signed char tf_select_table[4][8];
 
 int resampling_factor(opus_int32 rate);
 
-void preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT inp,
+void celt_preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT inp,
                         int N, int CC, int upsample, const opus_val16 *coef, celt_sig *mem, int clip);
 
 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index 4424b97..830398e 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -175,7 +175,7 @@ void opus_custom_decoder_destroy(CELTDecoder *st)
 }
 #endif /* CUSTOM_MODES */
 
-static inline opus_val16 SIG2WORD16(celt_sig x)
+static OPUS_INLINE opus_val16 SIG2WORD16(celt_sig x)
 {
 #ifdef FIXED_POINT
    x = PSHR32(x, SIG_SHIFT);
@@ -213,7 +213,7 @@ void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, c
          opus_val16 coef3 = coef[3];
          for (j=0;j<N;j++)
          {
-            celt_sig tmp = x[j] + m;
+            celt_sig tmp = x[j] + m + VERY_SMALL;
             m = MULT16_32_Q15(coef0, tmp)
                           - MULT16_32_Q15(coef1, x[j]);
             tmp = SHL32(MULT16_32_Q15(coef3, tmp), 2);
@@ -227,7 +227,7 @@ void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, c
          /* Shortcut for the standard (non-custom modes) case */
          for (j=0;j<N;j++)
          {
-            celt_sig tmp = x[j] + m;
+            celt_sig tmp = x[j] + m + VERY_SMALL;
             m = MULT16_32_Q15(coef0, tmp);
             scratch[j] = tmp;
          }
@@ -447,10 +447,11 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R
       {
          VARDECL( opus_val16, lp_pitch_buf );
          ALLOC( lp_pitch_buf, DECODE_BUFFER_SIZE>>1, opus_val16 );
-         pitch_downsample(decode_mem, lp_pitch_buf, DECODE_BUFFER_SIZE, C);
+         pitch_downsample(decode_mem, lp_pitch_buf,
+               DECODE_BUFFER_SIZE, C, st->arch);
          pitch_search(lp_pitch_buf+(PLC_PITCH_LAG_MAX>>1), lp_pitch_buf,
                DECODE_BUFFER_SIZE-PLC_PITCH_LAG_MAX,
-               PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index);
+               PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index, st->arch);
          pitch_index = PLC_PITCH_LAG_MAX-pitch_index;
          st->last_pitch_index = pitch_index;
       } else {
@@ -481,7 +482,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R
             opus_val32 ac[LPC_ORDER+1];
             /* Compute LPC coefficients for the last MAX_PERIOD samples before
                the first loss so we can work in the excitation-filter domain. */
-            _celt_autocorr(exc, ac, window, overlap, LPC_ORDER, MAX_PERIOD);
+            _celt_autocorr(exc, ac, window, overlap,
+                   LPC_ORDER, MAX_PERIOD, st->arch);
             /* Add a noise floor of -40 dB. */
 #ifdef FIXED_POINT
             ac[0] += SHR32(ac[0],13);
@@ -665,7 +667,6 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    VARDECL(int, fine_priority);
    VARDECL(int, tf_res);
    VARDECL(unsigned char, collapse_masks);
-   celt_sig *out_mem[2];
    celt_sig *decode_mem[2];
    celt_sig *out_syn[2];
    opus_val16 *lpc;
@@ -706,7 +707,6 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
 
    c=0; do {
       decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap);
-      out_mem[c] = decode_mem[c]+DECODE_BUFFER_SIZE-MAX_PERIOD;
    } while (++c<CC);
    lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*CC);
    oldBandE = lpc+CC*LPC_ORDER;
@@ -936,7 +936,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    } while (++c<C);
 
    c=0; do {
-      out_syn[c] = out_mem[c]+MAX_PERIOD-N;
+      out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N;
    } while (++c<CC);
 
    if (CC==2&&C==1)
diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index 59dcc5c..ffff077 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -161,17 +161,8 @@ CELTEncoder *opus_custom_encoder_create(const CELTMode *mode, int channels, int
 }
 #endif /* CUSTOM_MODES */
 
-int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels)
-{
-   int ret;
-   ret = opus_custom_encoder_init(st, opus_custom_mode_create(48000, 960, NULL), channels);
-   if (ret != OPUS_OK)
-      return ret;
-   st->upsample = resampling_factor(sampling_rate);
-   return OPUS_OK;
-}
-
-OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels)
+static int opus_custom_encoder_init_arch(CELTEncoder *st, const CELTMode *mode,
+                                         int channels, int arch)
 {
    if (channels < 0 || channels > 2)
       return OPUS_BAD_ARG;
@@ -190,7 +181,7 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMod
    st->end = st->mode->effEBands;
    st->signalling = 1;
 
-   st->arch = opus_select_arch();
+   st->arch = arch;
 
    st->constrained_vbr = 1;
    st->clip = 1;
@@ -206,6 +197,25 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMod
    return OPUS_OK;
 }
 
+#ifdef CUSTOM_MODES
+int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels)
+{
+   return opus_custom_encoder_init_arch(st, mode, channels, opus_select_arch());
+}
+#endif
+
+int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels,
+                      int arch)
+{
+   int ret;
+   ret = opus_custom_encoder_init_arch(st,
+           opus_custom_mode_create(48000, 960, NULL), channels, arch);
+   if (ret != OPUS_OK)
+      return ret;
+   st->upsample = resampling_factor(sampling_rate);
+   return OPUS_OK;
+}
+
 #ifdef CUSTOM_MODES
 void opus_custom_encoder_destroy(CELTEncoder *st)
 {
@@ -240,7 +250,6 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int
    ALLOC(tmp, len, opus_val16);
 
    len2=len/2;
-   tf_max = 0;
    for (c=0;c<C;c++)
    {
       opus_val32 mean;
@@ -451,7 +460,7 @@ static void compute_mdcts(const CELTMode *mode, int shortBlocks, celt_sig * OPUS
 }
 
 
-void preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT inp,
+void celt_preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT inp,
                         int N, int CC, int upsample, const opus_val16 *coef, celt_sig *mem, int clip)
 {
    int i;
@@ -488,6 +497,8 @@ void preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT
       for (i=0;i<Nu;i++)
          inp[i*upsample] = MAX32(-65536.f, MIN32(65536.f,inp[i*upsample]));
    }
+#else
+   (void)clip; /* Avoids a warning about clip being unused. */
 #endif
    m = *mem;
 #ifdef CUSTOM_MODES
@@ -497,7 +508,7 @@ void preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT
       opus_val16 coef2 = coef[2];
       for (i=0;i<N;i++)
       {
-         opus_val16 x, tmp;
+         celt_sig x, tmp;
          x = inp[i];
          /* Apply pre-emphasis */
          tmp = MULT16_16(coef2, x);
@@ -822,7 +833,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
 #ifndef DISABLE_FLOAT_API
    if (analysis->valid)
    {
-      trim -= MAX16(-QCONST16(2.f, 8), MIN16(QCONST16(2.f, 8), QCONST16(2.f, 8)*(analysis->tonality_slope+.05f)));
+      trim -= MAX16(-QCONST16(2.f, 8), MIN16(QCONST16(2.f, 8),
+            (opus_val16)(QCONST16(2.f, 8)*(analysis->tonality_slope+.05f))));
    }
 #endif
 
@@ -1023,11 +1035,12 @@ static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem,
       VARDECL(opus_val16, pitch_buf);
       ALLOC(pitch_buf, (COMBFILTER_MAXPERIOD+N)>>1, opus_val16);
 
-      pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC);
+      pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC, st->arch);
       /* Don't search for the fir last 1.5 octave of the range because
          there's too many false-positives due to short-term correlation */
       pitch_search(pitch_buf+(COMBFILTER_MAXPERIOD>>1), pitch_buf, N,
-            COMBFILTER_MAXPERIOD-3*COMBFILTER_MINPERIOD, &pitch_index);
+            COMBFILTER_MAXPERIOD-3*COMBFILTER_MINPERIOD, &pitch_index,
+            st->arch);
       pitch_index = COMBFILTER_MAXPERIOD-pitch_index;
 
       gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD,
@@ -1156,6 +1169,7 @@ static int compute_vbr(const CELTMode *mode, AnalysisInfo *analysis, opus_int32
       coded_stereo_dof = (eBands[coded_stereo_bands]<<LM)-coded_stereo_bands;
       /* Maximum fraction of the bits we can save if the signal is mono. */
       max_frac = DIV32_16(MULT16_16(QCONST16(0.8f, 15), coded_stereo_dof), coded_bins);
+      stereo_saving = MIN16(stereo_saving, QCONST16(1.f, 8));
       /*printf("%d %d %d ", coded_stereo_dof, coded_bins, tot_boost);*/
       target -= (opus_int32)MIN32(MULT16_32_Q15(max_frac,target),
                       SHR32(MULT16_16(stereo_saving-QCONST16(0.1f,8),(coded_stereo_dof<<BITRES)),8));
@@ -1294,6 +1308,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
    opus_val16 surround_masking=0;
    opus_val16 temporal_vbr=0;
    opus_val16 surround_trim = 0;
+   opus_int32 equiv_rate = 510000;
    VARDECL(opus_val16, surround_dynalloc);
    ALLOC_STACK;
 
@@ -1303,14 +1318,20 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
    eBands = mode->eBands;
    tf_estimate = 0;
    if (nbCompressedBytes<2 || pcm==NULL)
-     return OPUS_BAD_ARG;
+   {
+      RESTORE_STACK;
+      return OPUS_BAD_ARG;
+   }
 
    frame_size *= st->upsample;
    for (LM=0;LM<=mode->maxLM;LM++)
       if (mode->shortMdctSize<<LM==frame_size)
          break;
    if (LM>mode->maxLM)
+   {
+      RESTORE_STACK;
       return OPUS_BAD_ARG;
+   }
    M=1<<LM;
    N = M*mode->shortMdctSize;
 
@@ -1341,7 +1362,10 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
       {
          int c0 = toOpus(compressed[0]);
          if (c0<0)
+         {
+            RESTORE_STACK;
             return OPUS_BAD_ARG;
+         }
          compressed[0] = c0;
       }
       compressed++;
@@ -1375,6 +1399,8 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
                (tmp+4*mode->Fs)/(8*mode->Fs)-!!st->signalling));
       effectiveBytes = nbCompressedBytes;
    }
+   if (st->bitrate != OPUS_BITRATE_MAX)
+      equiv_rate = st->bitrate - (40*C+20)*((400>>LM) - 50);
 
    if (enc==NULL)
    {
@@ -1448,7 +1474,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
       enc->nbits_total+=tell-ec_tell(enc);
    }
    c=0; do {
-      preemphasis(pcm+c, in+c*(N+st->overlap)+st->overlap, N, CC, st->upsample,
+      celt_preemphasis(pcm+c, in+c*(N+st->overlap)+st->overlap, N, CC, st->upsample,
                   mode->preemph, st->preemph_memE+c, st->clip);
    } while (++c<CC);
 
@@ -1458,7 +1484,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
    {
       int enabled;
       int qg;
-      enabled = (st->lfe || nbAvailableBytes>12*C) && st->start==0 && !silence && !st->disable_pf
+      enabled = ((st->lfe&&nbAvailableBytes>3) || nbAvailableBytes>12*C) && st->start==0 && !silence && !st->disable_pf
             && st->complexity >= 5 && !(st->consec_transient && LM!=3 && st->variable_duration==OPUS_FRAMESIZE_VARIABLE);
 
       prefilter_tapset = st->tapset_decision;
@@ -1612,7 +1638,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
    if (!st->lfe)
    {
       opus_val16 follow=-QCONST16(10.0f,DB_SHIFT);
-      float frame_avg=0;
+      opus_val32 frame_avg=0;
       opus_val16 offset = shortBlocks?HALF16(SHL16(LM, DB_SHIFT)):0;
       for(i=st->start;i<st->end;i++)
       {
@@ -1710,7 +1736,8 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
          /* Disable new spreading+tapset estimator until we can show it works
             better than the old one. So far it seems like spreading_decision()
             works best. */
-         if (0&&st->analysis.valid)
+#if 0
+         if (st->analysis.valid)
          {
             static const opus_val16 spread_thresholds[3] = {-QCONST16(.6f, 15), -QCONST16(.2f, 15), -QCONST16(.07f, 15)};
             static const opus_val16 spread_histeresis[3] = {QCONST16(.15f, 15), QCONST16(.07f, 15), QCONST16(.02f, 15)};
@@ -1718,7 +1745,9 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
             static const opus_val16 tapset_histeresis[2] = {QCONST16(.1f, 15), QCONST16(.05f, 15)};
             st->spread_decision = hysteresis_decision(-st->analysis.tonality, spread_thresholds, spread_histeresis, 3, st->spread_decision);
             st->tapset_decision = hysteresis_decision(st->analysis.tonality_slope, tapset_thresholds, tapset_histeresis, 2, st->tapset_decision);
-         } else {
+         } else
+#endif
+         {
             st->spread_decision = spreading_decision(mode, X,
                   &st->tonal_average, st->spread_decision, &st->hf_average,
                   &st->tapset_decision, pf_on&&!shortBlocks, effEnd, C, M);
@@ -1777,25 +1806,18 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
 
    if (C==2)
    {
-      int effectiveRate;
-
       static const opus_val16 intensity_thresholds[21]=
       /* 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19  20  off*/
-        { 16,21,23,25,27,29,31,33,35,38,42,46,50,54,58,63,68,75,84,102,130};
+        {  1, 2, 3, 4, 5, 6, 7, 8,16,24,36,44,50,56,62,67,72,79,88,106,134};
       static const opus_val16 intensity_histeresis[21]=
-        {  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 5, 6,  8, 12};
+        {  1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4, 5, 6,  8, 8};
 
       /* Always use MS for 2.5 ms frames until we can do a better analysis */
       if (LM!=0)
          dual_stereo = stereo_analysis(mode, X, LM, N);
 
-      /* Account for coarse energy */
-      effectiveRate = (8*effectiveBytes - 80)>>LM;
-
-      /* effectiveRate in kb/s */
-      effectiveRate = 2*effectiveRate/5;
-
-      st->intensity = hysteresis_decision((opus_val16)effectiveRate, intensity_thresholds, intensity_histeresis, 21, st->intensity);
+      st->intensity = hysteresis_decision((opus_val16)(equiv_rate/1000),
+            intensity_thresholds, intensity_histeresis, 21, st->intensity);
       st->intensity = IMIN(st->end,IMAX(st->start, st->intensity));
    }
 
@@ -1829,7 +1851,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
      if (st->constrained_vbr)
         base_target += (st->vbr_offset>>lm_diff);
 
-     target = compute_vbr(mode, &st->analysis, base_target, LM, st->bitrate,
+     target = compute_vbr(mode, &st->analysis, base_target, LM, equiv_rate,
            st->lastCodedBands, C, st->intensity, st->constrained_vbr,
            st->stereo_saving, tot_boost, tf_estimate, pitch_change, maxDepth,
            st->variable_duration, st->lfe, st->energy_mask!=NULL, surround_masking,
@@ -1913,13 +1935,13 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
    if (st->analysis.valid)
    {
       int min_bandwidth;
-      if (st->bitrate < (opus_int32)32000*C)
+      if (equiv_rate < (opus_int32)32000*C)
          min_bandwidth = 13;
-      else if (st->bitrate < (opus_int32)48000*C)
+      else if (equiv_rate < (opus_int32)48000*C)
          min_bandwidth = 16;
-      else if (st->bitrate < (opus_int32)60000*C)
+      else if (equiv_rate < (opus_int32)60000*C)
          min_bandwidth = 18;
-      else  if (st->bitrate < (opus_int32)80000*C)
+      else  if (equiv_rate < (opus_int32)80000*C)
          min_bandwidth = 19;
       else
          min_bandwidth = 20;
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index 7ffe90a..fa29d62 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -226,7 +226,8 @@ int _celt_autocorr(
                    const opus_val16       *window,
                    int          overlap,
                    int          lag,
-                   int          n
+                   int          n,
+                   int          arch
                   )
 {
    opus_val32 d;
@@ -275,7 +276,7 @@ int _celt_autocorr(
          shift = 0;
    }
 #endif
-   celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1);
+   celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1, arch);
    for (k=0;k<=lag;k++)
    {
       for (i = k+fastN, d = 0; i < n; i++)
diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
index 19279a0..dc2a0a3 100644
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h
@@ -48,6 +48,7 @@ void celt_iir(const opus_val32 *x,
          int ord,
          opus_val16 *mem);
 
-int _celt_autocorr(const opus_val16 *x, opus_val32 *ac, const opus_val16 *window, int overlap, int lag, int n);
+int _celt_autocorr(const opus_val16 *x, opus_val32 *ac,
+         const opus_val16 *window, int overlap, int lag, int n, int arch);
 
 #endif /* PLC_H */
diff --git a/celt/cpu_support.h b/celt/cpu_support.h
index 41481fe..d68dbe6 100644
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h
@@ -28,7 +28,10 @@
 #ifndef CPU_SUPPORT_H
 #define CPU_SUPPORT_H
 
-#if defined(OPUS_HAVE_RTCD) && defined(ARMv4_ASM)
+#include "opus_types.h"
+#include "opus_defines.h"
+
+#if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM)
 #include "arm/armcpu.h"
 
 /* We currently support 4 ARM variants:
@@ -42,7 +45,7 @@
 #else
 #define OPUS_ARCHMASK 0
 
-static inline int opus_select_arch(void)
+static OPUS_INLINE int opus_select_arch(void)
 {
   return 0;
 }
diff --git a/celt/cwrs.c b/celt/cwrs.c
index 28e6561..ad980cc 100644
--- a/celt/cwrs.c
+++ b/celt/cwrs.c
@@ -410,7 +410,7 @@ static const opus_uint32 CELT_PVQ_U_DATA[1272]={
 };
 
 #if defined(CUSTOM_MODES)
-const opus_uint32 *const CELT_PVQ_U_ROW[15]={
+static const opus_uint32 *const CELT_PVQ_U_ROW[15]={
   CELT_PVQ_U_DATA+   0,CELT_PVQ_U_DATA+ 208,CELT_PVQ_U_DATA+ 415,
   CELT_PVQ_U_DATA+ 621,CELT_PVQ_U_DATA+ 826,CELT_PVQ_U_DATA+1030,
   CELT_PVQ_U_DATA+1233,CELT_PVQ_U_DATA+1336,CELT_PVQ_U_DATA+1389,
@@ -418,7 +418,7 @@ const opus_uint32 *const CELT_PVQ_U_ROW[15]={
   CELT_PVQ_U_DATA+1464,CELT_PVQ_U_DATA+1470,CELT_PVQ_U_DATA+1473
 };
 #else
-const opus_uint32 *const CELT_PVQ_U_ROW[15]={
+static const opus_uint32 *const CELT_PVQ_U_ROW[15]={
   CELT_PVQ_U_DATA+   0,CELT_PVQ_U_DATA+ 176,CELT_PVQ_U_DATA+ 351,
   CELT_PVQ_U_DATA+ 525,CELT_PVQ_U_DATA+ 698,CELT_PVQ_U_DATA+ 870,
   CELT_PVQ_U_DATA+1041,CELT_PVQ_U_DATA+1131,CELT_PVQ_U_DATA+1178,
@@ -534,7 +534,7 @@ void decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
 /*Computes the next row/column of any recurrence that obeys the relation
    u[i][j]=u[i-1][j]+u[i][j-1]+u[i-1][j-1].
   _ui0 is the base case for the new row/column.*/
-static inline void unext(opus_uint32 *_ui,unsigned _len,opus_uint32 _ui0){
+static OPUS_INLINE void unext(opus_uint32 *_ui,unsigned _len,opus_uint32 _ui0){
   opus_uint32 ui1;
   unsigned      j;
   /*This do-while will overrun the array if we don't have storage for at least
@@ -550,7 +550,7 @@ static inline void unext(opus_uint32 *_ui,unsigned _len,opus_uint32 _ui0){
 /*Computes the previous row/column of any recurrence that obeys the relation
    u[i-1][j]=u[i][j]-u[i][j-1]-u[i-1][j-1].
   _ui0 is the base case for the new row/column.*/
-static inline void uprev(opus_uint32 *_ui,unsigned _n,opus_uint32 _ui0){
+static OPUS_INLINE void uprev(opus_uint32 *_ui,unsigned _n,opus_uint32 _ui0){
   opus_uint32 ui1;
   unsigned      j;
   /*This do-while will overrun the array if we don't have storage for at least
@@ -617,7 +617,7 @@ static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){
    of size 1 with associated sign bits.
   _y: The vector of pulses, whose sum of absolute values is K.
   _k: Returns K.*/
-static inline opus_uint32 icwrs1(const int *_y,int *_k){
+static OPUS_INLINE opus_uint32 icwrs1(const int *_y,int *_k){
   *_k=abs(_y[0]);
   return _y[0]<0;
 }
@@ -626,7 +626,7 @@ static inline opus_uint32 icwrs1(const int *_y,int *_k){
    of size _n with associated sign bits.
   _y:  The vector of pulses, whose sum of absolute values must be _k.
   _nc: Returns V(_n,_k).*/
-static inline opus_uint32 icwrs(int _n,int _k,opus_uint32 *_nc,const int *_y,
+static OPUS_INLINE opus_uint32 icwrs(int _n,int _k,opus_uint32 *_nc,const int *_y,
  opus_uint32 *_u){
   opus_uint32 i;
   int         j;
diff --git a/celt/ecintrin.h b/celt/ecintrin.h
index be57dd4..2263cff 100644
--- a/celt/ecintrin.h
+++ b/celt/ecintrin.h
@@ -33,7 +33,7 @@
 #if !defined(_ecintrin_H)
 # define _ecintrin_H (1)
 
-/*Some specific platforms may have optimized intrinsic or inline assembly
+/*Some specific platforms may have optimized intrinsic or OPUS_INLINE assembly
    versions of these functions which can substantially improve performance.
   We define macros for them to allow easy incorporation of these non-ANSI
    features.*/
diff --git a/celt/entcode.h b/celt/entcode.h
index aebecc0..dd13e49 100644
--- a/celt/entcode.h
+++ b/celt/entcode.h
@@ -26,6 +26,7 @@
 */
 
 #include "opus_types.h"
+#include "opus_defines.h"
 
 #if !defined(_entcode_H)
 # define _entcode_H (1)
@@ -83,15 +84,15 @@ struct ec_ctx{
    int            error;
 };
 
-static inline opus_uint32 ec_range_bytes(ec_ctx *_this){
+static OPUS_INLINE opus_uint32 ec_range_bytes(ec_ctx *_this){
   return _this->offs;
 }
 
-static inline unsigned char *ec_get_buffer(ec_ctx *_this){
+static OPUS_INLINE unsigned char *ec_get_buffer(ec_ctx *_this){
   return _this->buf;
 }
 
-static inline int ec_get_error(ec_ctx *_this){
+static OPUS_INLINE int ec_get_error(ec_ctx *_this){
   return _this->error;
 }
 
@@ -101,7 +102,7 @@ static inline int ec_get_error(ec_ctx *_this){
   Return: The number of bits.
           This will always be slightly larger than the exact value (e.g., all
            rounding error is in the positive direction).*/
-static inline int ec_tell(ec_ctx *_this){
+static OPUS_INLINE int ec_tell(ec_ctx *_this){
   return _this->nbits_total-EC_ILOG(_this->rng);
 }
 
diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h
index f11d890..80bc949 100644
--- a/celt/fixed_debug.h
+++ b/celt/fixed_debug.h
@@ -33,9 +33,9 @@
 #define FIXED_DEBUG_H
 
 #include <stdio.h>
+#include "opus_defines.h"
 
 #ifdef CELT_C
-#include "opus_defines.h"
 OPUS_EXPORT opus_int64 celt_mips=0;
 #else
 extern opus_int64 celt_mips;
@@ -59,7 +59,7 @@ extern opus_int64 celt_mips;
 #define SHR(a,b) SHR32(a,b)
 #define PSHR(a,b) PSHR32(a,b)
 
-static inline short NEG16(int x)
+static OPUS_INLINE short NEG16(int x)
 {
    int res;
    if (!VERIFY_SHORT(x))
@@ -80,7 +80,7 @@ static inline short NEG16(int x)
    celt_mips++;
    return res;
 }
-static inline int NEG32(opus_int64 x)
+static OPUS_INLINE int NEG32(opus_int64 x)
 {
    opus_int64 res;
    if (!VERIFY_INT(x))
@@ -103,7 +103,7 @@ static inline int NEG32(opus_int64 x)
 }
 
 #define EXTRACT16(x) EXTRACT16_(x, __FILE__, __LINE__)
-static inline short EXTRACT16_(int x, char *file, int line)
+static OPUS_INLINE short EXTRACT16_(int x, char *file, int line)
 {
    int res;
    if (!VERIFY_SHORT(x))
@@ -119,7 +119,7 @@ static inline short EXTRACT16_(int x, char *file, int line)
 }
 
 #define EXTEND32(x) EXTEND32_(x, __FILE__, __LINE__)
-static inline int EXTEND32_(int x, char *file, int line)
+static OPUS_INLINE int EXTEND32_(int x, char *file, int line)
 {
    int res;
    if (!VERIFY_SHORT(x))
@@ -135,7 +135,7 @@ static inline int EXTEND32_(int x, char *file, int line)
 }
 
 #define SHR16(a, shift) SHR16_(a, shift, __FILE__, __LINE__)
-static inline short SHR16_(int a, int shift, char *file, int line)
+static OPUS_INLINE short SHR16_(int a, int shift, char *file, int line)
 {
    int res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift))
@@ -157,7 +157,7 @@ static inline short SHR16_(int a, int shift, char *file, int line)
    return res;
 }
 #define SHL16(a, shift) SHL16_(a, shift, __FILE__, __LINE__)
-static inline short SHL16_(int a, int shift, char *file, int line)
+static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line)
 {
    int res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift))
@@ -179,7 +179,7 @@ static inline short SHL16_(int a, int shift, char *file, int line)
    return res;
 }
 
-static inline int SHR32(opus_int64 a, int shift)
+static OPUS_INLINE int SHR32(opus_int64 a, int shift)
 {
    opus_int64  res;
    if (!VERIFY_INT(a) || !VERIFY_SHORT(shift))
@@ -201,7 +201,7 @@ static inline int SHR32(opus_int64 a, int shift)
    return res;
 }
 #define SHL32(a, shift) SHL32_(a, shift, __FILE__, __LINE__)
-static inline int SHL32_(opus_int64 a, int shift, char *file, int line)
+static OPUS_INLINE int SHL32_(opus_int64 a, int shift, char *file, int line)
 {
    opus_int64  res;
    if (!VERIFY_INT(a) || !VERIFY_SHORT(shift))
@@ -234,7 +234,7 @@ static inline int SHL32_(opus_int64 a, int shift, char *file, int line)
 //#define SHL(a,shift) ((a) << (shift))
 
 #define ADD16(a, b) ADD16_(a, b, __FILE__, __LINE__)
-static inline short ADD16_(int a, int b, char *file, int line)
+static OPUS_INLINE short ADD16_(int a, int b, char *file, int line)
 {
    int res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
@@ -257,7 +257,7 @@ static inline short ADD16_(int a, int b, char *file, int line)
 }
 
 #define SUB16(a, b) SUB16_(a, b, __FILE__, __LINE__)
-static inline short SUB16_(int a, int b, char *file, int line)
+static OPUS_INLINE short SUB16_(int a, int b, char *file, int line)
 {
    int res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
@@ -280,7 +280,7 @@ static inline short SUB16_(int a, int b, char *file, int line)
 }
 
 #define ADD32(a, b) ADD32_(a, b, __FILE__, __LINE__)
-static inline int ADD32_(opus_int64 a, opus_int64 b, char *file, int line)
+static OPUS_INLINE int ADD32_(opus_int64 a, opus_int64 b, char *file, int line)
 {
    opus_int64 res;
    if (!VERIFY_INT(a) || !VERIFY_INT(b))
@@ -303,7 +303,7 @@ static inline int ADD32_(opus_int64 a, opus_int64 b, char *file, int line)
 }
 
 #define SUB32(a, b) SUB32_(a, b, __FILE__, __LINE__)
-static inline int SUB32_(opus_int64 a, opus_int64 b, char *file, int line)
+static OPUS_INLINE int SUB32_(opus_int64 a, opus_int64 b, char *file, int line)
 {
    opus_int64 res;
    if (!VERIFY_INT(a) || !VERIFY_INT(b))
@@ -327,7 +327,7 @@ static inline int SUB32_(opus_int64 a, opus_int64 b, char *file, int line)
 
 #undef UADD32
 #define UADD32(a, b) UADD32_(a, b, __FILE__, __LINE__)
-static inline unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file, int line)
+static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file, int line)
 {
    opus_uint64 res;
    if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
@@ -351,7 +351,7 @@ static inline unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file, int
 
 #undef USUB32
 #define USUB32(a, b) USUB32_(a, b, __FILE__, __LINE__)
-static inline unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file, int line)
+static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file, int line)
 {
    opus_uint64 res;
    if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
@@ -381,7 +381,7 @@ static inline unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file, int
 }
 
 /* result fits in 16 bits */
-static inline short MULT16_16_16(int a, int b)
+static OPUS_INLINE short MULT16_16_16(int a, int b)
 {
    int res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
@@ -404,7 +404,7 @@ static inline short MULT16_16_16(int a, int b)
 }
 
 #define MULT16_16(a, b) MULT16_16_(a, b, __FILE__, __LINE__)
-static inline int MULT16_16_(int a, int b, char *file, int line)
+static OPUS_INLINE int MULT16_16_(int a, int b, char *file, int line)
 {
    opus_int64 res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
@@ -429,7 +429,7 @@ static inline int MULT16_16_(int a, int b, char *file, int line)
 #define MAC16_16(c,a,b)     (celt_mips-=2,ADD32((c),MULT16_16((a),(b))))
 
 #define MULT16_32_QX(a, b, Q) MULT16_32_QX_(a, b, Q, __FILE__, __LINE__)
-static inline int MULT16_32_QX_(int a, opus_int64 b, int Q, char *file, int line)
+static OPUS_INLINE int MULT16_32_QX_(int a, opus_int64 b, int Q, char *file, int line)
 {
    opus_int64 res;
    if (!VERIFY_SHORT(a) || !VERIFY_INT(b))
@@ -462,7 +462,7 @@ static inline int MULT16_32_QX_(int a, opus_int64 b, int Q, char *file, int line
 }
 
 #define MULT16_32_PX(a, b, Q) MULT16_32_PX_(a, b, Q, __FILE__, __LINE__)
-static inline int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int line)
+static OPUS_INLINE int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int line)
 {
    opus_int64 res;
    if (!VERIFY_SHORT(a) || !VERIFY_INT(b))
@@ -497,7 +497,7 @@ static inline int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int line
 #define MULT16_32_Q15(a,b) MULT16_32_QX(a,b,15)
 #define MAC16_32_Q15(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q15((a),(b))))
 
-static inline int SATURATE(int a, int b)
+static OPUS_INLINE int SATURATE(int a, int b)
 {
    if (a>b)
       a=b;
@@ -507,7 +507,17 @@ static inline int SATURATE(int a, int b)
    return a;
 }
 
-static inline int MULT16_16_Q11_32(int a, int b)
+static OPUS_INLINE opus_int16 SATURATE16(opus_int32 a)
+{
+   celt_mips+=3;
+   if (a>32767)
+      return 32767;
+   else if (a<-32768)
+      return -32768;
+   else return a;
+}
+
+static OPUS_INLINE int MULT16_16_Q11_32(int a, int b)
 {
    opus_int64 res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
@@ -529,7 +539,7 @@ static inline int MULT16_16_Q11_32(int a, int b)
    celt_mips+=3;
    return res;
 }
-static inline short MULT16_16_Q13(int a, int b)
+static OPUS_INLINE short MULT16_16_Q13(int a, int b)
 {
    opus_int64 res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
@@ -551,7 +561,7 @@ static inline short MULT16_16_Q13(int a, int b)
    celt_mips+=3;
    return res;
 }
-static inline short MULT16_16_Q14(int a, int b)
+static OPUS_INLINE short MULT16_16_Q14(int a, int b)
 {
    opus_int64 res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
@@ -575,7 +585,7 @@ static inline short MULT16_16_Q14(int a, int b)
 }
 
 #define MULT16_16_Q15(a, b) MULT16_16_Q15_(a, b, __FILE__, __LINE__)
-static inline short MULT16_16_Q15_(int a, int b, char *file, int line)
+static OPUS_INLINE short MULT16_16_Q15_(int a, int b, char *file, int line)
 {
    opus_int64 res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
@@ -598,7 +608,7 @@ static inline short MULT16_16_Q15_(int a, int b, char *file, int line)
    return res;
 }
 
-static inline short MULT16_16_P13(int a, int b)
+static OPUS_INLINE short MULT16_16_P13(int a, int b)
 {
    opus_int64 res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
@@ -628,7 +638,7 @@ static inline short MULT16_16_P13(int a, int b)
    celt_mips+=4;
    return res;
 }
-static inline short MULT16_16_P14(int a, int b)
+static OPUS_INLINE short MULT16_16_P14(int a, int b)
 {
    opus_int64 res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
@@ -658,7 +668,7 @@ static inline short MULT16_16_P14(int a, int b)
    celt_mips+=4;
    return res;
 }
-static inline short MULT16_16_P15(int a, int b)
+static OPUS_INLINE short MULT16_16_P15(int a, int b)
 {
    opus_int64 res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b))
@@ -691,7 +701,7 @@ static inline short MULT16_16_P15(int a, int b)
 
 #define DIV32_16(a, b) DIV32_16_(a, b, __FILE__, __LINE__)
 
-static inline int DIV32_16_(opus_int64 a, opus_int64 b, char *file, int line)
+static OPUS_INLINE int DIV32_16_(opus_int64 a, opus_int64 b, char *file, int line)
 {
    opus_int64 res;
    if (b==0)
@@ -726,7 +736,7 @@ static inline int DIV32_16_(opus_int64 a, opus_int64 b, char *file, int line)
 }
 
 #define DIV32(a, b) DIV32_(a, b, __FILE__, __LINE__)
-static inline int DIV32_(opus_int64 a, opus_int64 b, char *file, int line)
+static OPUS_INLINE int DIV32_(opus_int64 a, opus_int64 b, char *file, int line)
 {
    opus_int64 res;
    if (b==0)
diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h
index 657e67c..ecf018a 100644
--- a/celt/fixed_generic.h
+++ b/celt/fixed_generic.h
@@ -40,7 +40,7 @@
 #define MULT16_32_Q16(a,b) ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16))
 
 /** 16x32 multiplication, followed by a 16-bit shift right (round-to-nearest). Results fits in 32 bits */
-#define MULT16_32_P16(a,b) ADD32(MULT16_16((a),SHR((b),16)), PSHR(MULT16_16((a),((b)&0x0000ffff)),16))
+#define MULT16_32_P16(a,b) ADD32(MULT16_16((a),SHR((b),16)), PSHR(MULT16_16SU((a),((b)&0x0000ffff)),16))
 
 /** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */
 #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
diff --git a/celt/float_cast.h b/celt/float_cast.h
index 5ded291..ede6574 100644
--- a/celt/float_cast.h
+++ b/celt/float_cast.h
@@ -101,7 +101,7 @@
         #include <math.h>
 
         /*      Win32 doesn't seem to have these functions.
-        **      Therefore implement inline versions of these functions here.
+        **      Therefore implement OPUS_INLINE versions of these functions here.
         */
 
         __inline long int
@@ -128,7 +128,7 @@
 #endif
 
 #ifndef DISABLE_FLOAT_API
-static inline opus_int16 FLOAT2INT16(float x)
+static OPUS_INLINE opus_int16 FLOAT2INT16(float x)
 {
    x = x*CELT_SIG_SCALE;
    x = MAX32(x, -32768);
diff --git a/celt/mathops.c b/celt/mathops.c
index 21fd942..3f8c5dc 100644
--- a/celt/mathops.c
+++ b/celt/mathops.c
@@ -139,7 +139,7 @@ opus_val32 celt_sqrt(opus_val32 x)
 #define L3 8277
 #define L4 -626
 
-static inline opus_val16 _celt_cos_pi_2(opus_val16 x)
+static OPUS_INLINE opus_val16 _celt_cos_pi_2(opus_val16 x)
 {
    opus_val16 x2;
 
diff --git a/celt/mathops.h b/celt/mathops.h
index 7e7d906..a0525a9 100644
--- a/celt/mathops.h
+++ b/celt/mathops.h
@@ -44,7 +44,7 @@
 unsigned isqrt32(opus_uint32 _val);
 
 #ifndef OVERRIDE_CELT_MAXABS16
-static inline opus_val32 celt_maxabs16(const opus_val16 *x, int len)
+static OPUS_INLINE opus_val32 celt_maxabs16(const opus_val16 *x, int len)
 {
    int i;
    opus_val16 maxval = 0;
@@ -60,7 +60,7 @@ static inline opus_val32 celt_maxabs16(const opus_val16 *x, int len)
 
 #ifndef OVERRIDE_CELT_MAXABS32
 #ifdef FIXED_POINT
-static inline opus_val32 celt_maxabs32(const opus_val32 *x, int len)
+static OPUS_INLINE opus_val32 celt_maxabs32(const opus_val32 *x, int len)
 {
    int i;
    opus_val32 maxval = 0;
@@ -95,7 +95,7 @@ static inline opus_val32 celt_maxabs32(const opus_val32 *x, int len)
          denorm, +/- inf and NaN are *not* handled */
 
 /** Base-2 log approximation (log2(x)). */
-static inline float celt_log2(float x)
+static OPUS_INLINE float celt_log2(float x)
 {
    int integer;
    float frac;
@@ -113,7 +113,7 @@ static inline float celt_log2(float x)
 }
 
 /** Base-2 exponential approximation (2^x). */
-static inline float celt_exp2(float x)
+static OPUS_INLINE float celt_exp2(float x)
 {
    int integer;
    float frac;
@@ -145,7 +145,7 @@ static inline float celt_exp2(float x)
 
 #ifndef OVERRIDE_CELT_ILOG2
 /** Integer log in base2. Undefined for zero and negative numbers */
-static inline opus_int16 celt_ilog2(opus_int32 x)
+static OPUS_INLINE opus_int16 celt_ilog2(opus_int32 x)
 {
    celt_assert2(x>0, "celt_ilog2() only defined for strictly positive numbers");
    return EC_ILOG(x)-1;
@@ -154,7 +154,7 @@ static inline opus_int16 celt_ilog2(opus_int32 x)
 
 
 /** Integer log in base2. Defined for zero, but not for negative numbers */
-static inline opus_int16 celt_zlog2(opus_val32 x)
+static OPUS_INLINE opus_int16 celt_zlog2(opus_val32 x)
 {
    return x <= 0 ? 0 : celt_ilog2(x);
 }
@@ -166,7 +166,7 @@ opus_val32 celt_sqrt(opus_val32 x);
 opus_val16 celt_cos_norm(opus_val32 x);
 
 /** Base-2 logarithm approximation (log2(x)). (Q14 input, Q10 output) */
-static inline opus_val16 celt_log2(opus_val32 x)
+static OPUS_INLINE opus_val16 celt_log2(opus_val32 x)
 {
    int i;
    opus_val16 n, frac;
@@ -192,14 +192,14 @@ static inline opus_val16 celt_log2(opus_val32 x)
 #define D2 14819
 #define D3 10204
 
-static inline opus_val32 celt_exp2_frac(opus_val16 x)
+static OPUS_INLINE opus_val32 celt_exp2_frac(opus_val16 x)
 {
    opus_val16 frac;
    frac = SHL16(x, 4);
    return ADD16(D0, MULT16_16_Q15(frac, ADD16(D1, MULT16_16_Q15(frac, ADD16(D2 , MULT16_16_Q15(D3,frac))))));
 }
 /** Base-2 exponential approximation (2^x). (Q10 input, Q16 output) */
-static inline opus_val32 celt_exp2(opus_val16 x)
+static OPUS_INLINE opus_val32 celt_exp2(opus_val16 x)
 {
    int integer;
    opus_val16 frac;
@@ -225,7 +225,7 @@ opus_val32 frac_div32(opus_val32 a, opus_val32 b);
 
 /* Atan approximation using a 4th order polynomial. Input is in Q15 format
    and normalized by pi/4. Output is in Q15 format */
-static inline opus_val16 celt_atan01(opus_val16 x)
+static OPUS_INLINE opus_val16 celt_atan01(opus_val16 x)
 {
    return MULT16_16_P15(x, ADD32(M1, MULT16_16_P15(x, ADD32(M2, MULT16_16_P15(x, ADD32(M3, MULT16_16_P15(M4, x)))))));
 }
@@ -236,7 +236,7 @@ static inline opus_val16 celt_atan01(opus_val16 x)
 #undef M4
 
 /* atan2() approximation valid for positive input values */
-static inline opus_val16 celt_atan2p(opus_val16 y, opus_val16 x)
+static OPUS_INLINE opus_val16 celt_atan2p(opus_val16 y, opus_val16 x)
 {
    if (y < x)
    {
diff --git a/celt/os_support.h b/celt/os_support.h
index 2484f0b..5e47e3c 100644
--- a/celt/os_support.h
+++ b/celt/os_support.h
@@ -35,13 +35,16 @@
 #  include "custom_support.h"
 #endif
 
+#include "opus_types.h"
+#include "opus_defines.h"
+
 #include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
 
 /** Opus wrapper for malloc(). To do your own dynamic allocation, all you need to do is replace this function and opus_free */
 #ifndef OVERRIDE_OPUS_ALLOC
-static inline void *opus_alloc (size_t size)
+static OPUS_INLINE void *opus_alloc (size_t size)
 {
    return malloc(size);
 }
@@ -49,7 +52,7 @@ static inline void *opus_alloc (size_t size)
 
 /** Same as celt_alloc(), except that the area is only needed inside a CELT call (might cause problem with wideband though) */
 #ifndef OVERRIDE_OPUS_ALLOC_SCRATCH
-static inline void *opus_alloc_scratch (size_t size)
+static OPUS_INLINE void *opus_alloc_scratch (size_t size)
 {
    /* Scratch space doesn't need to be cleared */
    return opus_alloc(size);
@@ -58,7 +61,7 @@ static inline void *opus_alloc_scratch (size_t size)
 
 /** Opus wrapper for free(). To do your own dynamic allocation, all you need to do is replace this function and opus_alloc */
 #ifndef OVERRIDE_OPUS_FREE
-static inline void opus_free (void *ptr)
+static OPUS_INLINE void opus_free (void *ptr)
 {
    free(ptr);
 }
diff --git a/celt/pitch.c b/celt/pitch.c
index 0352b30..d2b3054 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -145,7 +145,7 @@ static void celt_fir5(const opus_val16 *x,
 
 
 void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,
-      int len, int C)
+      int len, int C, int arch)
 {
    int i;
    opus_val32 ac[5];
@@ -180,7 +180,7 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x
    }
 
    _celt_autocorr(x_lp, ac, NULL, 0,
-                  4, len>>1);
+                  4, len>>1, arch);
 
    /* Noise floor -40 dB */
 #ifdef FIXED_POINT
@@ -250,9 +250,14 @@ opus_val32
 #else
 void
 #endif
-celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
+celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
 {
    int i,j;
+   /*The EDSP version requires that max_pitch is at least 1, and that _x is
+      32-bit aligned.
+     Since it's hard to put asserts in assembly, put them here.*/
+   celt_assert(max_pitch>0);
+   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
 #ifdef FIXED_POINT
    opus_val32 maxcorr=1;
 #endif
@@ -289,7 +294,7 @@ celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr,
 
 #endif
 void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y,
-                  int len, int max_pitch, int *pitch)
+                  int len, int max_pitch, int *pitch, int arch)
 {
    int i, j;
    int lag;
@@ -342,7 +347,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
 #ifdef FIXED_POINT
    maxcorr =
 #endif
-   celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2);
+   celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2, arch);
 
    find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch
 #ifdef FIXED_POINT
diff --git a/celt/pitch.h b/celt/pitch.h
index caffd24..df317ec 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -35,16 +35,21 @@
 #define PITCH_H
 
 #include "modes.h"
+#include "cpu_support.h"
 
 #if defined(__SSE__) && !defined(FIXED_POINT)
 #include "x86/pitch_sse.h"
 #endif
 
+#if defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
+# include "arm/pitch_arm.h"
+#endif
+
 void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,
-      int len, int C);
+      int len, int C, int arch);
 
 void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y,
-                  int len, int max_pitch, int *pitch);
+                  int len, int max_pitch, int *pitch, int arch);
 
 opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
       int N, int *T0, int prev_period, opus_val16 prev_gain);
@@ -52,10 +57,11 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
 /* OPT: This is the kernel you really want to optimize. It gets used a lot
    by the prefilter and by the PLC. */
 #ifndef OVERRIDE_XCORR_KERNEL
-static inline void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
+static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
 {
    int j;
    opus_val16 y_0, y_1, y_2, y_3;
+   celt_assert(len>=3);
    y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */
    y_0=*y++;
    y_1=*y++;
@@ -119,7 +125,7 @@ static inline void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus
 #endif /* OVERRIDE_XCORR_KERNEL */
 
 #ifndef OVERRIDE_DUAL_INNER_PROD
-static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
       int N, opus_val32 *xy1, opus_val32 *xy2)
 {
    int i;
@@ -140,6 +146,28 @@ opus_val32
 #else
 void
 #endif
-celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch);
+celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
+      opus_val32 *xcorr, int len, int max_pitch);
+
+#if !defined(OVERRIDE_PITCH_XCORR)
+/*Is run-time CPU detection enabled on this platform?*/
+# if defined(OPUS_HAVE_RTCD)
+extern
+#  if defined(FIXED_POINT)
+opus_val32
+#  else
+void
+#  endif
+(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+      const opus_val16 *, opus_val32 *, int, int);
+
+#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
+        xcorr, len, max_pitch))
+# else
+#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch))
+# endif
+#endif
 
 #endif
diff --git a/celt/quant_bands.c b/celt/quant_bands.c
index 79685e1..ac6952c 100644
--- a/celt/quant_bands.c
+++ b/celt/quant_bands.c
@@ -312,6 +312,7 @@ void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd,
       opus_int32 tell_intra;
       opus_uint32 nstart_bytes;
       opus_uint32 nintra_bytes;
+      opus_uint32 save_bytes;
       int badness2;
       VARDECL(unsigned char, intra_bits);
 
@@ -322,7 +323,10 @@ void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd,
       nstart_bytes = ec_range_bytes(&enc_start_state);
       nintra_bytes = ec_range_bytes(&enc_intra_state);
       intra_buf = ec_get_buffer(&enc_intra_state) + nstart_bytes;
-      ALLOC(intra_bits, nintra_bytes-nstart_bytes, unsigned char);
+      save_bytes = nintra_bytes-nstart_bytes;
+      if (save_bytes == 0)
+         save_bytes = ALLOC_NONE;
+      ALLOC(intra_bits, save_bytes, unsigned char);
       /* Copy bits from intra bit-stream */
       OPUS_COPY(intra_bits, intra_buf, nintra_bytes - nstart_bytes);
 
diff --git a/celt/rate.c b/celt/rate.c
index e474cf5..e13d839 100644
--- a/celt/rate.c
+++ b/celt/rate.c
@@ -245,7 +245,7 @@ void compute_pulse_cache(CELTMode *m, int LM)
 
 #define ALLOC_STEPS 6
 
-static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int skip_start,
+static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end, int skip_start,
       const int *bits1, const int *bits2, const int *thresh, const int *cap, opus_int32 total, opus_int32 *_balance,
       int skip_rsv, int *intensity, int intensity_rsv, int *dual_stereo, int dual_stereo_rsv, int *bits,
       int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev, int signalBandwidth)
diff --git a/celt/rate.h b/celt/rate.h
index 263fde9..f1e0661 100644
--- a/celt/rate.h
+++ b/celt/rate.h
@@ -45,12 +45,12 @@
 
 void compute_pulse_cache(CELTMode *m, int LM);
 
-static inline int get_pulses(int i)
+static OPUS_INLINE int get_pulses(int i)
 {
    return i<8 ? i : (8 + (i&7)) << ((i>>3)-1);
 }
 
-static inline int bits2pulses(const CELTMode *m, int band, int LM, int bits)
+static OPUS_INLINE int bits2pulses(const CELTMode *m, int band, int LM, int bits)
 {
    int i;
    int lo, hi;
@@ -77,7 +77,7 @@ static inline int bits2pulses(const CELTMode *m, int band, int LM, int bits)
       return hi;
 }
 
-static inline int pulses2bits(const CELTMode *m, int band, int LM, int pulses)
+static OPUS_INLINE int pulses2bits(const CELTMode *m, int band, int LM, int pulses)
 {
    const unsigned char *cache;
 
diff --git a/celt/stack_alloc.h b/celt/stack_alloc.h
index 1c093a8..316a6ce 100644
--- a/celt/stack_alloc.h
+++ b/celt/stack_alloc.h
@@ -32,6 +32,9 @@
 #ifndef STACK_ALLOC_H
 #define STACK_ALLOC_H
 
+#include "opus_types.h"
+#include "opus_defines.h"
+
 #if (!defined (VAR_ARRAYS) && !defined (USE_ALLOCA) && !defined (NONTHREADSAFE_PSEUDOSTACK))
 #error "Opus requires one of VAR_ARRAYS, USE_ALLOCA, or NONTHREADSAFE_PSEUDOSTACK be defined to select the temporary allocation mode."
 #endif
@@ -92,6 +95,8 @@
 #define SAVE_STACK
 #define RESTORE_STACK
 #define ALLOC_STACK
+/* C99 does not allow VLAs of size zero */
+#define ALLOC_NONE 1
 
 #elif defined(USE_ALLOCA)
 
@@ -106,6 +111,7 @@
 #define SAVE_STACK
 #define RESTORE_STACK
 #define ALLOC_STACK
+#define ALLOC_NONE 0
 
 #else
 
@@ -143,6 +149,7 @@ extern char *global_stack_top;
 #define VARDECL(type, var) type *var
 #define ALLOC(var, size, type) var = PUSH(global_stack, size, type)
 #define SAVE_STACK char *_saved_stack = global_stack;
+#define ALLOC_NONE 0
 
 #endif /* VAR_ARRAYS */
 
@@ -159,7 +166,7 @@ extern char *global_stack_top;
 
 #else
 
-static inline int _opus_false(void) {return 0;}
+static OPUS_INLINE int _opus_false(void) {return 0;}
 #define OPUS_CHECK_ARRAY(ptr, len) _opus_false()
 #define OPUS_CHECK_VALUE(value) _opus_false()
 #define OPUS_PRINT_INT(value) do{}while(0)
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index 63ae3d4..695122a 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -36,7 +36,7 @@
 #include "arch.h"
 
 #define OVERRIDE_XCORR_KERNEL
-static inline void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
+static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
 {
    int j;
    __m128 xsum1, xsum2;
@@ -72,7 +72,7 @@ static inline void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_v
 }
 
 #define OVERRIDE_DUAL_INNER_PROD
-static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
       int N, opus_val32 *xy1, opus_val32 *xy2)
 {
    int i;
@@ -102,7 +102,7 @@ static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, c
 }
 
 #define OVERRIDE_COMB_FILTER_CONST
-static inline void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
       opus_val16 g10, opus_val16 g11, opus_val16 g12)
 {
    int i;
-- 
cgit v1.2.3