Updating Opus to a pre-release of 1.1

This CL updates Opus to a pre-release of the coming Opus 1.1 version. The code is extracted from http://git.xiph.org/?p=opus.git, commit aee4d8057632ea0cfc1d55d88acf8466b47b7b4b from October 1st 2013. This version includes both algorithmic and platform optimizations, as well an important fix for a denorm problem when the input goes silent after active audio. The problem causes high CPU usage. Review URL: https://codereview.chromium.org/28553003 git-svn-id: svn://svn.chromium.org/chrome/trunk/deps/third_party/opus@230378 0039d316-1c4b-4281-b951-d872f2087c98
author: tlegrand@chromium.org <tlegrand@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-10-23 09:13:50 +0000
committer: tlegrand@chromium.org <tlegrand@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-10-23 09:13:50 +0000
commit: e3ea049fcaee2247e45f0ce793d4313babb4ef69 (patch)
tree: 4449fa158c45dea4175443f44c3dfc1264bd6dfb /celt
parent: 6b6bee25314cfac02cc555cddedb9680c63a26d6 (diff)
download: src-e3ea049fcaee2247e45f0ce793d4313babb4ef69.tar.gz
41 files changed, 6704 insertions, 3838 deletions
diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h
index 33e62c6..f0c6976 100644
--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -94,6 +94,14 @@
     do {(res).r = ADD32((res).r,(a).r);  (res).i = SUB32((res).i,(a).i); \
     }while(0)
 
+#if defined(ARMv4_ASM)
+#include "arm/kiss_fft_armv4.h"
+#endif
+
+#if defined(ARMv5E_ASM)
+#include "arm/kiss_fft_armv5e.h"
+#endif
+
 #else  /* not FIXED_POINT*/
 
 #   define S_MUL(a,b) ( (a)*(b) )
diff --git a/celt/arch.h b/celt/arch.h
index 03cda40..f9c9856 100644
--- a/celt/arch.h
+++ b/celt/arch.h
@@ -100,6 +100,7 @@ typedef opus_val32 celt_ener;
 #define DB_SHIFT 10
 
 #define EPSILON 1
+#define VERY_SMALL 0
 #define VERY_LARGE16 ((opus_val16)32767)
 #define Q15_ONE ((opus_val16)32767)
 
@@ -112,10 +113,10 @@ typedef opus_val32 celt_ener;
 
 #include "fixed_generic.h"
 
-#ifdef ARM5E_ASM
-#include "fixed_arm5e.h"
-#elif defined (ARM4_ASM)
-#include "fixed_arm4.h"
+#ifdef ARMv5E_ASM
+#include "arm/fixed_armv5e.h"
+#elif defined (ARMv4_ASM)
+#include "arm/fixed_armv4.h"
 #elif defined (BFIN_ASM)
 #include "fixed_bfin.h"
 #elif defined (TI_C5X_ASM)
@@ -140,6 +141,7 @@ typedef float celt_ener;
 #define NORM_SCALING 1.f
 
 #define EPSILON 1e-15f
+#define VERY_SMALL 1e-30f
 #define VERY_LARGE16 1e15f
 #define Q15_ONE ((opus_val16)1.f)
 
@@ -161,6 +163,7 @@ typedef float celt_ener;
 #define SHR(a,shift)    (a)
 #define SHL(a,shift)    (a)
 #define SATURATE(x,a)   (x)
+#define SATURATE16(x)   (x)
 
 #define ROUND16(a,shift)  (a)
 #define HALF16(x)       (.5f*(x))
@@ -182,6 +185,7 @@ typedef float celt_ener;
 #define MAC16_32_Q15(c,a,b)     ((c)+(a)*(b))
 
 #define MULT16_16_Q11_32(a,b)     ((a)*(b))
+#define MULT16_16_Q11(a,b)     ((a)*(b))
 #define MULT16_16_Q13(a,b)     ((a)*(b))
 #define MULT16_16_Q14(a,b)     ((a)*(b))
 #define MULT16_16_Q15(a,b)     ((a)*(b))
diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c
new file mode 100644
index 0000000..aabcc71
--- /dev/null
+++ b/celt/arm/armcpu.c
@@ -0,0 +1,166 @@
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Original code from libtheora modified to suit to Opus */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef OPUS_HAVE_RTCD
+
+#include "armcpu.h"
+#include "cpu_support.h"
+#include "os_support.h"
+#include "opus_types.h"
+
+#define OPUS_CPU_ARM_V4    (1)
+#define OPUS_CPU_ARM_EDSP  (1<<1)
+#define OPUS_CPU_ARM_MEDIA (1<<2)
+#define OPUS_CPU_ARM_NEON  (1<<3)
+
+#if defined(_MSC_VER)
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+# define WIN32_LEAN_AND_MEAN
+# define WIN32_EXTRA_LEAN
+# include <windows.h>
+
+static inline opus_uint32 opus_cpu_capabilities(void){
+  opus_uint32 flags;
+  flags=0;
+  /* MSVC has no inline __asm support for ARM, but it does let you __emit
+   * instructions via their assembled hex code.
+   * All of these instructions should be essentially nops. */
+# if defined(ARMv5E_ASM)
+  __try{
+    /*PLD [r13]*/
+    __emit(0xF5DDF000);
+    flags|=OPUS_CPU_ARM_EDSP;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#  if defined(ARMv6E_ASM)
+  __try{
+    /*SHADD8 r3,r3,r3*/
+    __emit(0xE6333F93);
+    flags|=OPUS_CPU_ARM_MEDIA;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   if defined(ARM_HAVE_NEON)
+  __try{
+    /*VORR q0,q0,q0*/
+    __emit(0xF2200150);
+    flags|=OPUS_CPU_ARM_NEON;
+  }
+  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+    /*Ignore exception.*/
+  }
+#   endif
+#  endif
+# endif
+  return flags;
+}
+
+#elif defined(__linux__)
+/* Linux based */
+opus_uint32 opus_cpu_capabilities(void)
+{
+  opus_uint32 flags = 0;
+  FILE *cpuinfo;
+
+  /* Reading /proc/self/auxv would be easier, but that doesn't work reliably on
+   * Android */
+  cpuinfo = fopen("/proc/cpuinfo", "r");
+
+  if(cpuinfo != NULL)
+  {
+    /* 512 should be enough for anybody (it's even enough for all the flags that
+     * x86 has accumulated... so far). */
+    char buf[512];
+
+    while(fgets(buf, 512, cpuinfo) != NULL)
+    {
+      /* Search for edsp and neon flag */
+      if(memcmp(buf, "Features", 8) == 0)
+      {
+        char *p;
+        p = strstr(buf, " edsp");
+        if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
+          flags |= OPUS_CPU_ARM_EDSP;
+
+        p = strstr(buf, " neon");
+        if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
+          flags |= OPUS_CPU_ARM_NEON;
+      }
+
+      /* Search for media capabilities (>= ARMv6) */
+      if(memcmp(buf, "CPU architecture:", 17) == 0)
+      {
+        int version;
+        version = atoi(buf+17);
+
+        if(version >= 6)
+          flags |= OPUS_CPU_ARM_MEDIA;
+      }
+    }
+
+    fclose(cpuinfo);
+  }
+  return flags;
+}
+#else
+/* The feature registers which can tell us what the processor supports are
+ * accessible in priveleged modes only, so we can't have a general user-space
+ * detection method like on x86.*/
+# error "Configured to use ARM asm but no CPU detection method available for " \
+   "your platform.  Reconfigure with --disable-rtcd (or send patches)."
+#endif
+
+int opus_select_arch(void)
+{
+  opus_uint32 flags = opus_cpu_capabilities();
+  int arch = 0;
+
+  if(!(flags & OPUS_CPU_ARM_EDSP))
+    return arch;
+  arch++;
+
+  if(!(flags & OPUS_CPU_ARM_MEDIA))
+    return arch;
+  arch++;
+
+  if(!(flags & OPUS_CPU_ARM_NEON))
+    return arch;
+  arch++;
+
+  return arch;
+}
+
+#endif
diff --git a/celt/arm/armcpu.h b/celt/arm/armcpu.h
new file mode 100644
index 0000000..68d80fe
--- /dev/null
+++ b/celt/arm/armcpu.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Original code from libtheora modified to suit to Opus */
+
+#ifndef ARMCPU_H
+#define ARMCPU_H
+
+int opus_select_arch(void);
+
+#endif
diff --git a/celt/arm/fixed_armv4.h b/celt/arm/fixed_armv4.h
new file mode 100644
index 0000000..bcacc34
--- /dev/null
+++ b/celt/arm/fixed_armv4.h
@@ -0,0 +1,76 @@
+/* Copyright (C) 2013 Xiph.Org Foundation and contributors */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_ARMv4_H
+#define FIXED_ARMv4_H
+
+/** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */
+#undef MULT16_32_Q16
+static inline opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  __asm__(
+      "#MULT16_32_Q16\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=&r"(rd_lo), "=&r"(rd_hi)
+      : "%r"(b),"r"(a<<16)
+  );
+  return rd_hi;
+}
+#define MULT16_32_Q16(a, b) (MULT16_32_Q16_armv4(a, b))
+
+
+/** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */
+#undef MULT16_32_Q15
+static inline opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
+{
+  unsigned rd_lo;
+  int rd_hi;
+  __asm__(
+      "#MULT16_32_Q15\n\t"
+      "smull %0, %1, %2, %3\n\t"
+      : "=&r"(rd_lo), "=&r"(rd_hi)
+      : "%r"(b), "r"(a<<16)
+  );
+  /*We intentionally don't OR in the high bit of rd_lo for speed.*/
+  return rd_hi<<1;
+}
+#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv4(a, b))
+
+
+/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
+    b must fit in 31 bits.
+    Result fits in 32 bits. */
+#undef MAC16_32_Q15
+#define MAC16_32_Q15(c, a, b) ADD32(c, MULT16_32_Q15(a, b))
+
+
+/** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
+#undef MULT32_32_Q31
+#define MULT32_32_Q31(a,b) (opus_val32)((((opus_int64)(a)) * ((opus_int64)(b)))>>31)
+
+#endif
diff --git a/celt/arm/fixed_armv5e.h b/celt/arm/fixed_armv5e.h
new file mode 100644
index 0000000..80632c4
--- /dev/null
+++ b/celt/arm/fixed_armv5e.h
@@ -0,0 +1,116 @@
+/* Copyright (C) 2007-2009 Xiph.Org Foundation
+   Copyright (C) 2003-2008 Jean-Marc Valin
+   Copyright (C) 2007-2008 CSIRO
+   Copyright (C) 2013      Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_ARMv5E_H
+#define FIXED_ARMv5E_H
+
+#include "fixed_armv4.h"
+
+/** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */
+#undef MULT16_32_Q16
+static inline opus_val32 MULT16_32_Q16_armv5e(opus_val16 a, opus_val32 b)
+{
+  int res;
+  __asm__(
+      "#MULT16_32_Q16\n\t"
+      "smulwb %0, %1, %2\n\t"
+      : "=r"(res)
+      : "r"(b),"r"(a)
+  );
+  return res;
+}
+#define MULT16_32_Q16(a, b) (MULT16_32_Q16_armv5e(a, b))
+
+
+/** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */
+#undef MULT16_32_Q15
+static inline opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b)
+{
+  int res;
+  __asm__(
+      "#MULT16_32_Q15\n\t"
+      "smulwb %0, %1, %2\n\t"
+      : "=r"(res)
+      : "r"(b), "r"(a)
+  );
+  return res<<1;
+}
+#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv5e(a, b))
+
+
+/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
+    b must fit in 31 bits.
+    Result fits in 32 bits. */
+#undef MAC16_32_Q15
+static inline opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
+ opus_val32 b)
+{
+  int res;
+  __asm__(
+      "#MAC16_32_Q15\n\t"
+      "smlawb %0, %1, %2, %3;\n"
+      : "=r"(res)
+      : "r"(b<<1), "r"(a), "r"(c)
+  );
+  return res;
+}
+#define MAC16_32_Q15(c, a, b) (MAC16_32_Q15_armv5e(c, a, b))
+
+/** 16x16 multiply-add where the result fits in 32 bits */
+#undef MAC16_16
+static inline opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a,
+ opus_val16 b)
+{
+  int res;
+  __asm__(
+      "#MAC16_16\n\t"
+      "smlabb %0, %1, %2, %3;\n"
+      : "=r"(res)
+      : "r"(a), "r"(b), "r"(c)
+  );
+  return res;
+}
+#define MAC16_16(c, a, b) (MAC16_16_armv5e(c, a, b))
+
+/** 16x16 multiplication where the result fits in 32 bits */
+#undef MULT16_16
+static inline opus_val32 MULT16_16_armv5e(opus_val16 a, opus_val16 b)
+{
+  int res;
+  __asm__(
+      "#MULT16_16\n\t"
+      "smulbb %0, %1, %2;\n"
+      : "=r"(res)
+      : "r"(a), "r"(b)
+  );
+  return res;
+}
+#define MULT16_16(a, b) (MULT16_16_armv5e(a, b))
+
+#endif
diff --git a/celt/arm/kiss_fft_armv4.h b/celt/arm/kiss_fft_armv4.h
new file mode 100644
index 0000000..e4faad6
--- /dev/null
+++ b/celt/arm/kiss_fft_armv4.h
@@ -0,0 +1,121 @@
+/*Copyright (c) 2013, Xiph.Org Foundation and contributors.
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+       this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.*/
+
+#ifndef KISS_FFT_ARMv4_H
+#define KISS_FFT_ARMv4_H
+
+#if !defined(KISS_FFT_GUTS_H)
+#error "This file should only be included from _kiss_fft_guts.h"
+#endif
+
+#ifdef FIXED_POINT
+
+#undef C_MUL
+#define C_MUL(m,a,b) \
+    do{ \
+       int br__; \
+       int bi__; \
+       int tt__; \
+        __asm__ __volatile__( \
+            "#C_MUL\n\t" \
+            "ldrsh %[br], [%[bp], #0]\n\t" \
+            "ldm %[ap], {r0,r1}\n\t" \
+            "ldrsh %[bi], [%[bp], #2]\n\t" \
+            "smull %[tt], %[mi], r1, %[br]\n\t" \
+            "smlal %[tt], %[mi], r0, %[bi]\n\t" \
+            "rsb %[bi], %[bi], #0\n\t" \
+            "smull %[br], %[mr], r0, %[br]\n\t" \
+            "mov %[tt], %[tt], lsr #15\n\t" \
+            "smlal %[br], %[mr], r1, %[bi]\n\t" \
+            "orr %[mi], %[tt], %[mi], lsl #17\n\t" \
+            "mov %[br], %[br], lsr #15\n\t" \
+            "orr %[mr], %[br], %[mr], lsl #17\n\t" \
+            : [mr]"=r"((m).r), [mi]"=r"((m).i), \
+              [br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
+            : [ap]"r"(&(a)), [bp]"r"(&(b)) \
+            : "r0", "r1" \
+        ); \
+    } \
+    while(0)
+
+#undef C_MUL4
+#define C_MUL4(m,a,b) \
+    do{ \
+       int br__; \
+       int bi__; \
+       int tt__; \
+        __asm__ __volatile__( \
+            "#C_MUL4\n\t" \
+            "ldrsh %[br], [%[bp], #0]\n\t" \
+            "ldm %[ap], {r0,r1}\n\t" \
+            "ldrsh %[bi], [%[bp], #2]\n\t" \
+            "smull %[tt], %[mi], r1, %[br]\n\t" \
+            "smlal %[tt], %[mi], r0, %[bi]\n\t" \
+            "rsb %[bi], %[bi], #0\n\t" \
+            "smull %[br], %[mr], r0, %[br]\n\t" \
+            "mov %[tt], %[tt], lsr #17\n\t" \
+            "smlal %[br], %[mr], r1, %[bi]\n\t" \
+            "orr %[mi], %[tt], %[mi], lsl #15\n\t" \
+            "mov %[br], %[br], lsr #17\n\t" \
+            "orr %[mr], %[br], %[mr], lsl #15\n\t" \
+            : [mr]"=r"((m).r), [mi]"=r"((m).i), \
+              [br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
+            : [ap]"r"(&(a)), [bp]"r"(&(b)) \
+            : "r0", "r1" \
+        ); \
+    } \
+    while(0)
+
+#undef C_MULC
+#define C_MULC(m,a,b) \
+    do{ \
+       int br__; \
+       int bi__; \
+       int tt__; \
+        __asm__ __volatile__( \
+            "#C_MULC\n\t" \
+            "ldrsh %[br], [%[bp], #0]\n\t" \
+            "ldm %[ap], {r0,r1}\n\t" \
+            "ldrsh %[bi], [%[bp], #2]\n\t" \
+            "smull %[tt], %[mr], r0, %[br]\n\t" \
+            "smlal %[tt], %[mr], r1, %[bi]\n\t" \
+            "rsb %[bi], %[bi], #0\n\t" \
+            "smull %[br], %[mi], r1, %[br]\n\t" \
+            "mov %[tt], %[tt], lsr #15\n\t" \
+            "smlal %[br], %[mi], r0, %[bi]\n\t" \
+            "orr %[mr], %[tt], %[mr], lsl #17\n\t" \
+            "mov %[br], %[br], lsr #15\n\t" \
+            "orr %[mi], %[br], %[mi], lsl #17\n\t" \
+            : [mr]"=r"((m).r), [mi]"=r"((m).i), \
+              [br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
+            : [ap]"r"(&(a)), [bp]"r"(&(b)) \
+            : "r0", "r1" \
+        ); \
+    } \
+    while(0)
+
+#endif /* FIXED_POINT */
+
+#endif /* KISS_FFT_ARMv4_H */
diff --git a/celt/arm/kiss_fft_armv5e.h b/celt/arm/kiss_fft_armv5e.h
new file mode 100644
index 0000000..9eca183
--- /dev/null
+++ b/celt/arm/kiss_fft_armv5e.h
@@ -0,0 +1,118 @@
+/*Copyright (c) 2013, Xiph.Org Foundation and contributors.
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+       this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.*/
+
+#ifndef KISS_FFT_ARMv5E_H
+#define KISS_FFT_ARMv5E_H
+
+#if !defined(KISS_FFT_GUTS_H)
+#error "This file should only be included from _kiss_fft_guts.h"
+#endif
+
+#ifdef FIXED_POINT
+
+#if defined(__thumb__)||defined(__thumb2__)
+#define LDRD_CONS "Q"
+#else
+#define LDRD_CONS "Uq"
+#endif
+
+#undef C_MUL
+#define C_MUL(m,a,b) \
+    do{ \
+        int mr1__; \
+        int mr2__; \
+        int mi__; \
+        long long aval__; \
+        int bval__; \
+        __asm__( \
+            "#C_MUL\n\t" \
+            "ldrd %[aval], %H[aval], %[ap]\n\t" \
+            "ldr %[bval], %[bp]\n\t" \
+            "smulwb %[mi], %H[aval], %[bval]\n\t" \
+            "smulwb %[mr1], %[aval], %[bval]\n\t" \
+            "smulwt %[mr2], %H[aval], %[bval]\n\t" \
+            "smlawt %[mi], %[aval], %[bval], %[mi]\n\t" \
+            : [mr1]"=r"(mr1__), [mr2]"=r"(mr2__), [mi]"=r"(mi__), \
+              [aval]"=&r"(aval__), [bval]"=r"(bval__) \
+            : [ap]LDRD_CONS(a), [bp]"m"(b) \
+        ); \
+        (m).r = SHL32(SUB32(mr1__, mr2__), 1); \
+        (m).i = SHL32(mi__, 1); \
+    } \
+    while(0)
+
+#undef C_MUL4
+#define C_MUL4(m,a,b) \
+    do{ \
+        int mr1__; \
+        int mr2__; \
+        int mi__; \
+        long long aval__; \
+        int bval__; \
+        __asm__( \
+            "#C_MUL4\n\t" \
+            "ldrd %[aval], %H[aval], %[ap]\n\t" \
+            "ldr %[bval], %[bp]\n\t" \
+            "smulwb %[mi], %H[aval], %[bval]\n\t" \
+            "smulwb %[mr1], %[aval], %[bval]\n\t" \
+            "smulwt %[mr2], %H[aval], %[bval]\n\t" \
+            "smlawt %[mi], %[aval], %[bval], %[mi]\n\t" \
+            : [mr1]"=r"(mr1__), [mr2]"=r"(mr2__), [mi]"=r"(mi__), \
+              [aval]"=&r"(aval__), [bval]"=r"(bval__) \
+            : [ap]LDRD_CONS(a), [bp]"m"(b) \
+        ); \
+        (m).r = SHR32(SUB32(mr1__, mr2__), 1); \
+        (m).i = SHR32(mi__, 1); \
+    } \
+    while(0)
+
+#undef C_MULC
+#define C_MULC(m,a,b) \
+    do{ \
+        int mr__; \
+        int mi1__; \
+        int mi2__; \
+        long long aval__; \
+        int bval__; \
+        __asm__( \
+            "#C_MULC\n\t" \
+            "ldrd %[aval], %H[aval], %[ap]\n\t" \
+            "ldr %[bval], %[bp]\n\t" \
+            "smulwb %[mr], %[aval], %[bval]\n\t" \
+            "smulwb %[mi1], %H[aval], %[bval]\n\t" \
+            "smulwt %[mi2], %[aval], %[bval]\n\t" \
+            "smlawt %[mr], %H[aval], %[bval], %[mr]\n\t" \
+            : [mr]"=r"(mr__), [mi1]"=r"(mi1__), [mi2]"=r"(mi2__), \
+              [aval]"=&r"(aval__), [bval]"=r"(bval__) \
+            : [ap]LDRD_CONS(a), [bp]"m"(b) \
+        ); \
+        (m).r = SHL32(mr__, 1); \
+        (m).i = SHL32(SUB32(mi1__, mi2__), 1); \
+    } \
+    while(0)
+
+#endif /* FIXED_POINT */
+
+#endif /* KISS_FFT_GUTS_H */
diff --git a/celt/bands.c b/celt/bands.c
index 3be543c..93bd0bc 100644
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -40,6 +40,23 @@
 #include "os_support.h"
 #include "mathops.h"
 #include "rate.h"
+#include "quant_bands.h"
+#include "pitch.h"
+
+int hysteresis_decision(opus_val16 val, const opus_val16 *thresholds, const opus_val16 *hysteresis, int N, int prev)
+{
+   int i;
+   for (i=0;i<N;i++)
+   {
+      if (val < thresholds[i])
+         break;
+   }
+   if (i>prev && val < thresholds[prev]+hysteresis[prev])
+      i=prev;
+   if (i<prev && val > thresholds[prev-1]-hysteresis[prev-1])
+      i=prev;
+   return i;
+}
 
 opus_uint32 celt_lcg_rand(opus_uint32 seed)
 {
@@ -172,7 +189,8 @@ void normalise_bands(const CELTMode *m, const celt_sig * OPUS_RESTRICT freq, cel
 #endif /* FIXED_POINT */
 
 /* De-normalise the energy to produce the synthesis from the unit-energy bands */
-void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X, celt_sig * OPUS_RESTRICT freq, const celt_ener *bandE, int end, int C, int M)
+void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
+      celt_sig * OPUS_RESTRICT freq, const opus_val16 *bandLogE, int start, int end, int C, int M)
 {
    int i, c, N;
    const opus_int16 *eBands = m->eBands;
@@ -182,18 +200,39 @@ void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X, cel
       celt_sig * OPUS_RESTRICT f;
       const celt_norm * OPUS_RESTRICT x;
       f = freq+c*N;
-      x = X+c*N;
-      for (i=0;i<end;i++)
+      x = X+c*N+M*eBands[start];
+      for (i=0;i<M*eBands[start];i++)
+         *f++ = 0;
+      for (i=start;i<end;i++)
       {
          int j, band_end;
-         opus_val32 g = SHR32(bandE[i+c*m->nbEBands],1);
+         opus_val16 g;
+         opus_val16 lg;
+#ifdef FIXED_POINT
+         int shift;
+#endif
          j=M*eBands[i];
          band_end = M*eBands[i+1];
+         lg = ADD16(bandLogE[i+c*m->nbEBands], SHL16((opus_val16)eMeans[i],6));
+#ifdef FIXED_POINT
+         /* Handle the integer part of the log energy */
+         shift = 16-(lg>>DB_SHIFT);
+         if (shift>31)
+         {
+            shift=0;
+            g=0;
+         } else {
+            /* Handle the fractional part. */
+            g = celt_exp2_frac(lg&((1<<DB_SHIFT)-1));
+         }
+#else
+         g = celt_exp2(lg);
+#endif
          do {
-            *f++ = SHL32(MULT16_32_Q15(*x, g),2);
-            x++;
+            *f++ = SHR32(MULT16_16(*x++, g), shift);
          } while (++j<band_end);
       }
+      celt_assert(start <= end);
       for (i=M*eBands[end];i<N;i++)
          *f++ = 0;
    } while (++c<C);
@@ -345,11 +384,7 @@ static void stereo_merge(celt_norm *X, celt_norm *Y, opus_val16 mid, int N)
    opus_val32 t, lgain, rgain;
 
    /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
-   for (j=0;j<N;j++)
-   {
-      xp = MAC16_16(xp, X[j], Y[j]);
-      side = MAC16_16(side, Y[j], Y[j]);
-   }
+   dual_inner_prod(Y, X, Y, N, &xp, &side);
    /* Compensating for the mid normalization */
    xp = MULT16_32_Q15(mid, xp);
    /* mid and side are in Q15, not Q14 like X and Y */
@@ -483,50 +518,6 @@ int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
    return decision;
 }
 
-#ifdef MEASURE_NORM_MSE
-
-float MSE[30] = {0};
-int nbMSEBands = 0;
-int MSECount[30] = {0};
-
-void dump_norm_mse(void)
-{
-   int i;
-   for (i=0;i<nbMSEBands;i++)
-   {
-      printf ("%g ", MSE[i]/MSECount[i]);
-   }
-   printf ("\n");
-}
-
-void measure_norm_mse(const CELTMode *m, float *X, float *X0, float *bandE, float *bandE0, int M, int N, int C)
-{
-   static int init = 0;
-   int i;
-   if (!init)
-   {
-      atexit(dump_norm_mse);
-      init = 1;
-   }
-   for (i=0;i<m->nbEBands;i++)
-   {
-      int j;
-      int c;
-      float g;
-      if (bandE0[i]<10 || (C==2 && bandE0[i+m->nbEBands]<1))
-         continue;
-      c=0; do {
-         g = bandE[i+c*m->nbEBands]/(1e-15+bandE0[i+c*m->nbEBands]);
-         for (j=M*m->eBands[i];j<M*m->eBands[i+1];j++)
-            MSE[i] += (g*X[j+c*N]-X0[j+c*N])*(g*X[j+c*N]-X0[j+c*N]);
-      } while (++c<C);
-      MSECount[i]+=C;
-   }
-   nbMSEBands = m->nbEBands;
-}
-
-#endif
-
 /* Indexing table for converting from natural Hadamard to ordery Hadamard
    This is essentially a bit-reversed Gray, on top of which we've added
    an inversion of the order because we want the DC at the end rather than
@@ -629,289 +620,304 @@ static int compute_qn(int N, int b, int offset, int pulse_cap, int stereo)
    return qn;
 }
 
-/* This function is responsible for encoding and decoding a band for both
-   the mono and stereo case. Even in the mono case, it can split the band
-   in two and transmit the energy difference with the two half-bands. It
-   can be called recursively so bands can end up being split in 8 parts. */
-static unsigned quant_band(int encode, const CELTMode *m, int i, celt_norm *X, celt_norm *Y,
-      int N, int b, int spread, int B, int intensity, int tf_change, celt_norm *lowband, ec_ctx *ec,
-      opus_int32 *remaining_bits, int LM, celt_norm *lowband_out, const celt_ener *bandE, int level,
-      opus_uint32 *seed, opus_val16 gain, celt_norm *lowband_scratch, int fill)
+struct band_ctx {
+   int encode;
+   const CELTMode *m;
+   int i;
+   int intensity;
+   int spread;
+   int tf_change;
+   ec_ctx *ec;
+   opus_int32 remaining_bits;
+   const celt_ener *bandE;
+   opus_uint32 seed;
+};
+
+struct split_ctx {
+   int inv;
+   int imid;
+   int iside;
+   int delta;
+   int itheta;
+   int qalloc;
+};
+
+static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx,
+      celt_norm *X, celt_norm *Y, int N, int *b, int B, int B0,
+      int LM,
+      int stereo, int *fill)
 {
-   const unsigned char *cache;
-   int q;
-   int curr_bits;
-   int stereo, split;
-   int imid=0, iside=0;
-   int N0=N;
-   int N_B=N;
-   int N_B0;
-   int B0=B;
-   int time_divide=0;
-   int recombine=0;
-   int inv = 0;
-   opus_val16 mid=0, side=0;
-   int longBlocks;
-   unsigned cm=0;
-#ifdef RESYNTH
-   int resynth = 1;
-#else
-   int resynth = !encode;
-#endif
+   int qn;
+   int itheta=0;
+   int delta;
+   int imid, iside;
+   int qalloc;
+   int pulse_cap;
+   int offset;
+   opus_int32 tell;
+   int inv=0;
+   int encode;
+   const CELTMode *m;
+   int i;
+   int intensity;
+   ec_ctx *ec;
+   const celt_ener *bandE;
+
+   encode = ctx->encode;
+   m = ctx->m;
+   i = ctx->i;
+   intensity = ctx->intensity;
+   ec = ctx->ec;
+   bandE = ctx->bandE;
+
+   /* Decide on the resolution to give to the split parameter theta */
+   pulse_cap = m->logN[i]+LM*(1<<BITRES);
+   offset = (pulse_cap>>1) - (stereo&&N==2 ? QTHETA_OFFSET_TWOPHASE : QTHETA_OFFSET);
+   qn = compute_qn(N, *b, offset, pulse_cap, stereo);
+   if (stereo && i>=intensity)
+      qn = 1;
+   if (encode)
+   {
+      /* theta is the atan() of the ratio between the (normalized)
+         side and mid. With just that parameter, we can re-scale both
+         mid and side because we know that 1) they have unit norm and
+         2) they are orthogonal. */
+      itheta = stereo_itheta(X, Y, stereo, N);
+   }
+   tell = ec_tell_frac(ec);
+   if (qn!=1)
+   {
+      if (encode)
+         itheta = (itheta*qn+8192)>>14;
 
-   longBlocks = B0==1;
+      /* Entropy coding of the angle. We use a uniform pdf for the
+         time split, a step for stereo, and a triangular one for the rest. */
+      if (stereo && N>2)
+      {
+         int p0 = 3;
+         int x = itheta;
+         int x0 = qn/2;
+         int ft = p0*(x0+1) + x0;
+         /* Use a probability of p0 up to itheta=8192 and then use 1 after */
+         if (encode)
+         {
+            ec_encode(ec,x<=x0?p0*x:(x-1-x0)+(x0+1)*p0,x<=x0?p0*(x+1):(x-x0)+(x0+1)*p0,ft);
+         } else {
+            int fs;
+            fs=ec_decode(ec,ft);
+            if (fs<(x0+1)*p0)
+               x=fs/p0;
+            else
+               x=x0+1+(fs-(x0+1)*p0);
+            ec_dec_update(ec,x<=x0?p0*x:(x-1-x0)+(x0+1)*p0,x<=x0?p0*(x+1):(x-x0)+(x0+1)*p0,ft);
+            itheta = x;
+         }
+      } else if (B0>1 || stereo) {
+         /* Uniform pdf */
+         if (encode)
+            ec_enc_uint(ec, itheta, qn+1);
+         else
+            itheta = ec_dec_uint(ec, qn+1);
+      } else {
+         int fs=1, ft;
+         ft = ((qn>>1)+1)*((qn>>1)+1);
+         if (encode)
+         {
+            int fl;
 
-   N_B /= B;
-   N_B0 = N_B;
+            fs = itheta <= (qn>>1) ? itheta + 1 : qn + 1 - itheta;
+            fl = itheta <= (qn>>1) ? itheta*(itheta + 1)>>1 :
+             ft - ((qn + 1 - itheta)*(qn + 2 - itheta)>>1);
 
-   split = stereo = Y != NULL;
+            ec_encode(ec, fl, fl+fs, ft);
+         } else {
+            /* Triangular pdf */
+            int fl=0;
+            int fm;
+            fm = ec_decode(ec, ft);
 
-   /* Special case for one sample */
-   if (N==1)
-   {
-      int c;
-      celt_norm *x = X;
-      c=0; do {
-         int sign=0;
-         if (*remaining_bits>=1<<BITRES)
-         {
-            if (encode)
+            if (fm < ((qn>>1)*((qn>>1) + 1)>>1))
             {
-               sign = x[0]<0;
-               ec_enc_bits(ec, sign, 1);
-            } else {
-               sign = ec_dec_bits(ec, 1);
+               itheta = (isqrt32(8*(opus_uint32)fm + 1) - 1)>>1;
+               fs = itheta + 1;
+               fl = itheta*(itheta + 1)>>1;
+            }
+            else
+            {
+               itheta = (2*(qn + 1)
+                - isqrt32(8*(opus_uint32)(ft - fm - 1) + 1))>>1;
+               fs = qn + 1 - itheta;
+               fl = ft - ((qn + 1 - itheta)*(qn + 2 - itheta)>>1);
             }
-            *remaining_bits -= 1<<BITRES;
-            b-=1<<BITRES;
-         }
-         if (resynth)
-            x[0] = sign ? -NORM_SCALING : NORM_SCALING;
-         x = Y;
-      } while (++c<1+stereo);
-      if (lowband_out)
-         lowband_out[0] = SHR16(X[0],4);
-      return 1;
-   }
-
-   if (!stereo && level == 0)
-   {
-      int k;
-      if (tf_change>0)
-         recombine = tf_change;
-      /* Band recombining to increase frequency resolution */
 
-      if (lowband && (recombine || ((N_B&1) == 0 && tf_change<0) || B0>1))
-      {
-         int j;
-         for (j=0;j<N;j++)
-            lowband_scratch[j] = lowband[j];
-         lowband = lowband_scratch;
+            ec_dec_update(ec, fl, fl+fs, ft);
+         }
       }
-
-      for (k=0;k<recombine;k++)
+      itheta = (opus_int32)itheta*16384/qn;
+      if (encode && stereo)
       {
-         static const unsigned char bit_interleave_table[16]={
-           0,1,1,1,2,3,3,3,2,3,3,3,2,3,3,3
-         };
-         if (encode)
-            haar1(X, N>>k, 1<<k);
-         if (lowband)
-            haar1(lowband, N>>k, 1<<k);
-         fill = bit_interleave_table[fill&0xF]|bit_interleave_table[fill>>4]<<2;
+         if (itheta==0)
+            intensity_stereo(m, X, Y, bandE, i, N);
+         else
+            stereo_split(X, Y, N);
       }
-      B>>=recombine;
-      N_B<<=recombine;
-
-      /* Increasing the time resolution */
-      while ((N_B&1) == 0 && tf_change<0)
+      /* NOTE: Renormalising X and Y *may* help fixed-point a bit at very high rate.
+               Let's do that at higher complexity */
+   } else if (stereo) {
+      if (encode)
       {
-         if (encode)
-            haar1(X, N_B, B);
-         if (lowband)
-            haar1(lowband, N_B, B);
-         fill |= fill<<B;
-         B <<= 1;
-         N_B >>= 1;
-         time_divide++;
-         tf_change++;
+         inv = itheta > 8192;
+         if (inv)
+         {
+            int j;
+            for (j=0;j<N;j++)
+               Y[j] = -Y[j];
+         }
+         intensity_stereo(m, X, Y, bandE, i, N);
       }
-      B0=B;
-      N_B0 = N_B;
-
-      /* Reorganize the samples in time order instead of frequency order */
-      if (B0>1)
+      if (*b>2<<BITRES && ctx->remaining_bits > 2<<BITRES)
       {
          if (encode)
-            deinterleave_hadamard(X, N_B>>recombine, B0<<recombine, longBlocks);
-         if (lowband)
-            deinterleave_hadamard(lowband, N_B>>recombine, B0<<recombine, longBlocks);
-      }
+            ec_enc_bit_logp(ec, inv, 2);
+         else
+            inv = ec_dec_bit_logp(ec, 2);
+      } else
+         inv = 0;
+      itheta = 0;
    }
+   qalloc = ec_tell_frac(ec) - tell;
+   *b -= qalloc;
 
-   /* If we need 1.5 more bit than we can produce, split the band in two. */
-   cache = m->cache.bits + m->cache.index[(LM+1)*m->nbEBands+i];
-   if (!stereo && LM != -1 && b > cache[cache[0]]+12 && N>2)
+   if (itheta == 0)
    {
-      N >>= 1;
-      Y = X+N;
-      split = 1;
-      LM -= 1;
-      if (B==1)
-         fill = (fill&1)|(fill<<1);
-      B = (B+1)>>1;
+      imid = 32767;
+      iside = 0;
+      *fill &= (1<<B)-1;
+      delta = -16384;
+   } else if (itheta == 16384)
+   {
+      imid = 0;
+      iside = 32767;
+      *fill &= ((1<<B)-1)<<B;
+      delta = 16384;
+   } else {
+      imid = bitexact_cos((opus_int16)itheta);
+      iside = bitexact_cos((opus_int16)(16384-itheta));
+      /* This is the mid vs side allocation that minimizes squared error
+         in that band. */
+      delta = FRAC_MUL16((N-1)<<7,bitexact_log2tan(iside,imid));
    }
 
-   if (split)
-   {
-      int qn;
-      int itheta=0;
-      int mbits, sbits, delta;
-      int qalloc;
-      int pulse_cap;
-      int offset;
-      int orig_fill;
-      opus_int32 tell;
+   sctx->inv = inv;
+   sctx->imid = imid;
+   sctx->iside = iside;
+   sctx->delta = delta;
+   sctx->itheta = itheta;
+   sctx->qalloc = qalloc;
+}
+static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, int b,
+      celt_norm *lowband_out)
+{
+#ifdef RESYNTH
+   int resynth = 1;
+#else
+   int resynth = !ctx->encode;
+#endif
+   int c;
+   int stereo;
+   celt_norm *x = X;
+   int encode;
+   ec_ctx *ec;
 
-      /* Decide on the resolution to give to the split parameter theta */
-      pulse_cap = m->logN[i]+LM*(1<<BITRES);
-      offset = (pulse_cap>>1) - (stereo&&N==2 ? QTHETA_OFFSET_TWOPHASE : QTHETA_OFFSET);
-      qn = compute_qn(N, b, offset, pulse_cap, stereo);
-      if (stereo && i>=intensity)
-         qn = 1;
-      if (encode)
-      {
-         /* theta is the atan() of the ratio between the (normalized)
-            side and mid. With just that parameter, we can re-scale both
-            mid and side because we know that 1) they have unit norm and
-            2) they are orthogonal. */
-         itheta = stereo_itheta(X, Y, stereo, N);
-      }
-      tell = ec_tell_frac(ec);
-      if (qn!=1)
+   encode = ctx->encode;
+   ec = ctx->ec;
+
+   stereo = Y != NULL;
+   c=0; do {
+      int sign=0;
+      if (ctx->remaining_bits>=1<<BITRES)
       {
          if (encode)
-            itheta = (itheta*qn+8192)>>14;
-
-         /* Entropy coding of the angle. We use a uniform pdf for the
-            time split, a step for stereo, and a triangular one for the rest. */
-         if (stereo && N>2)
          {
-            int p0 = 3;
-            int x = itheta;
-            int x0 = qn/2;
-            int ft = p0*(x0+1) + x0;
-            /* Use a probability of p0 up to itheta=8192 and then use 1 after */
-            if (encode)
-            {
-               ec_encode(ec,x<=x0?p0*x:(x-1-x0)+(x0+1)*p0,x<=x0?p0*(x+1):(x-x0)+(x0+1)*p0,ft);
-            } else {
-               int fs;
-               fs=ec_decode(ec,ft);
-               if (fs<(x0+1)*p0)
-                  x=fs/p0;
-               else
-                  x=x0+1+(fs-(x0+1)*p0);
-               ec_dec_update(ec,x<=x0?p0*x:(x-1-x0)+(x0+1)*p0,x<=x0?p0*(x+1):(x-x0)+(x0+1)*p0,ft);
-               itheta = x;
-            }
-         } else if (B0>1 || stereo) {
-            /* Uniform pdf */
-            if (encode)
-               ec_enc_uint(ec, itheta, qn+1);
-            else
-               itheta = ec_dec_uint(ec, qn+1);
+            sign = x[0]<0;
+            ec_enc_bits(ec, sign, 1);
          } else {
-            int fs=1, ft;
-            ft = ((qn>>1)+1)*((qn>>1)+1);
-            if (encode)
-            {
-               int fl;
+            sign = ec_dec_bits(ec, 1);
+         }
+         ctx->remaining_bits -= 1<<BITRES;
+         b-=1<<BITRES;
+      }
+      if (resynth)
+         x[0] = sign ? -NORM_SCALING : NORM_SCALING;
+      x = Y;
+   } while (++c<1+stereo);
+   if (lowband_out)
+      lowband_out[0] = SHR16(X[0],4);
+   return 1;
+}
 
-               fs = itheta <= (qn>>1) ? itheta + 1 : qn + 1 - itheta;
-               fl = itheta <= (qn>>1) ? itheta*(itheta + 1)>>1 :
-                ft - ((qn + 1 - itheta)*(qn + 2 - itheta)>>1);
+/* This function is responsible for encoding and decoding a mono partition.
+   It can split the band in two and transmit the energy difference with
+   the two half-bands. It can be called recursively so bands can end up being
+   split in 8 parts. */
+static unsigned quant_partition(struct band_ctx *ctx, celt_norm *X,
+      int N, int b, int B, celt_norm *lowband,
+      int LM,
+      opus_val16 gain, int fill)
+{
+   const unsigned char *cache;
+   int q;
+   int curr_bits;
+   int imid=0, iside=0;
+   int N_B=N;
+   int B0=B;
+   opus_val16 mid=0, side=0;
+   unsigned cm=0;
+#ifdef RESYNTH
+   int resynth = 1;
+#else
+   int resynth = !ctx->encode;
+#endif
+   celt_norm *Y=NULL;
+   int encode;
+   const CELTMode *m;
+   int i;
+   int spread;
+   ec_ctx *ec;
 
-               ec_encode(ec, fl, fl+fs, ft);
-            } else {
-               /* Triangular pdf */
-               int fl=0;
-               int fm;
-               fm = ec_decode(ec, ft);
+   encode = ctx->encode;
+   m = ctx->m;
+   i = ctx->i;
+   spread = ctx->spread;
+   ec = ctx->ec;
 
-               if (fm < ((qn>>1)*((qn>>1) + 1)>>1))
-               {
-                  itheta = (isqrt32(8*(opus_uint32)fm + 1) - 1)>>1;
-                  fs = itheta + 1;
-                  fl = itheta*(itheta + 1)>>1;
-               }
-               else
-               {
-                  itheta = (2*(qn + 1)
-                   - isqrt32(8*(opus_uint32)(ft - fm - 1) + 1))>>1;
-                  fs = qn + 1 - itheta;
-                  fl = ft - ((qn + 1 - itheta)*(qn + 2 - itheta)>>1);
-               }
+   N_B /= B;
 
-               ec_dec_update(ec, fl, fl+fs, ft);
-            }
-         }
-         itheta = (opus_int32)itheta*16384/qn;
-         if (encode && stereo)
-         {
-            if (itheta==0)
-               intensity_stereo(m, X, Y, bandE, i, N);
-            else
-               stereo_split(X, Y, N);
-         }
-         /* NOTE: Renormalising X and Y *may* help fixed-point a bit at very high rate.
-                  Let's do that at higher complexity */
-      } else if (stereo) {
-         if (encode)
-         {
-            inv = itheta > 8192;
-            if (inv)
-            {
-               int j;
-               for (j=0;j<N;j++)
-                  Y[j] = -Y[j];
-            }
-            intensity_stereo(m, X, Y, bandE, i, N);
-         }
-         if (b>2<<BITRES && *remaining_bits > 2<<BITRES)
-         {
-            if (encode)
-               ec_enc_bit_logp(ec, inv, 2);
-            else
-               inv = ec_dec_bit_logp(ec, 2);
-         } else
-            inv = 0;
-         itheta = 0;
-      }
-      qalloc = ec_tell_frac(ec) - tell;
-      b -= qalloc;
+   /* If we need 1.5 more bit than we can produce, split the band in two. */
+   cache = m->cache.bits + m->cache.index[(LM+1)*m->nbEBands+i];
+   if (LM != -1 && b > cache[cache[0]]+12 && N>2)
+   {
+      int mbits, sbits, delta;
+      int itheta;
+      int qalloc;
+      struct split_ctx sctx;
+      celt_norm *next_lowband2=NULL;
+      opus_int32 rebalance;
 
-      orig_fill = fill;
-      if (itheta == 0)
-      {
-         imid = 32767;
-         iside = 0;
-         fill &= (1<<B)-1;
-         delta = -16384;
-      } else if (itheta == 16384)
-      {
-         imid = 0;
-         iside = 32767;
-         fill &= ((1<<B)-1)<<B;
-         delta = 16384;
-      } else {
-         imid = bitexact_cos((opus_int16)itheta);
-         iside = bitexact_cos((opus_int16)(16384-itheta));
-         /* This is the mid vs side allocation that minimizes squared error
-            in that band. */
-         delta = FRAC_MUL16((N-1)<<7,bitexact_log2tan(iside,imid));
-      }
+      N >>= 1;
+      Y = X+N;
+      LM -= 1;
+      if (B==1)
+         fill = (fill&1)|(fill<<1);
+      B = (B+1)>>1;
 
+      compute_theta(ctx, &sctx, X, Y, N, &b, B, B0,
+            LM, 0, &fill);
+      imid = sctx.imid;
+      iside = sctx.iside;
+      delta = sctx.delta;
+      itheta = sctx.itheta;
+      qalloc = sctx.qalloc;
 #ifdef FIXED_POINT
       mid = imid;
       side = iside;
@@ -920,136 +926,59 @@ static unsigned quant_band(int encode, const CELTMode *m, int i, celt_norm *X, c
       side = (1.f/32768)*iside;
 #endif
 
-      /* This is a special case for N=2 that only works for stereo and takes
-         advantage of the fact that mid and side are orthogonal to encode
-         the side with just one bit. */
-      if (N==2 && stereo)
+      /* Give more bits to low-energy MDCTs than they would otherwise deserve */
+      if (B0>1 && (itheta&0x3fff))
       {
-         int c;
-         int sign=0;
-         celt_norm *x2, *y2;
-         mbits = b;
-         sbits = 0;
-         /* Only need one bit for the side */
-         if (itheta != 0 && itheta != 16384)
-            sbits = 1<<BITRES;
-         mbits -= sbits;
-         c = itheta > 8192;
-         *remaining_bits -= qalloc+sbits;
-
-         x2 = c ? Y : X;
-         y2 = c ? X : Y;
-         if (sbits)
-         {
-            if (encode)
-            {
-               /* Here we only need to encode a sign for the side */
-               sign = x2[0]*y2[1] - x2[1]*y2[0] < 0;
-               ec_enc_bits(ec, sign, 1);
-            } else {
-               sign = ec_dec_bits(ec, 1);
-            }
-         }
-         sign = 1-2*sign;
-         /* We use orig_fill here because we want to fold the side, but if
-             itheta==16384, we'll have cleared the low bits of fill. */
-         cm = quant_band(encode, m, i, x2, NULL, N, mbits, spread, B, intensity, tf_change, lowband, ec, remaining_bits, LM, lowband_out, NULL, level, seed, gain, lowband_scratch, orig_fill);
-         /* We don't split N=2 bands, so cm is either 1 or 0 (for a fold-collapse),
-             and there's no need to worry about mixing with the other channel. */
-         y2[0] = -sign*x2[1];
-         y2[1] = sign*x2[0];
-         if (resynth)
-         {
-            celt_norm tmp;
-            X[0] = MULT16_16_Q15(mid, X[0]);
-            X[1] = MULT16_16_Q15(mid, X[1]);
-            Y[0] = MULT16_16_Q15(side, Y[0]);
-            Y[1] = MULT16_16_Q15(side, Y[1]);
-            tmp = X[0];
-            X[0] = SUB16(tmp,Y[0]);
-            Y[0] = ADD16(tmp,Y[0]);
-            tmp = X[1];
-            X[1] = SUB16(tmp,Y[1]);
-            Y[1] = ADD16(tmp,Y[1]);
-         }
-      } else {
-         /* "Normal" split code */
-         celt_norm *next_lowband2=NULL;
-         celt_norm *next_lowband_out1=NULL;
-         int next_level=0;
-         opus_int32 rebalance;
-
-         /* Give more bits to low-energy MDCTs than they would otherwise deserve */
-         if (B0>1 && !stereo && (itheta&0x3fff))
-         {
-            if (itheta > 8192)
-               /* Rough approximation for pre-echo masking */
-               delta -= delta>>(4-LM);
-            else
-               /* Corresponds to a forward-masking slope of 1.5 dB per 10 ms */
-               delta = IMIN(0, delta + (N<<BITRES>>(5-LM)));
-         }
-         mbits = IMAX(0, IMIN(b, (b-delta)/2));
-         sbits = b-mbits;
-         *remaining_bits -= qalloc;
-
-         if (lowband && !stereo)
-            next_lowband2 = lowband+N; /* >32-bit split case */
-
-         /* Only stereo needs to pass on lowband_out. Otherwise, it's
-            handled at the end */
-         if (stereo)
-            next_lowband_out1 = lowband_out;
+         if (itheta > 8192)
+            /* Rough approximation for pre-echo masking */
+            delta -= delta>>(4-LM);
          else
-            next_level = level+1;
-
-         rebalance = *remaining_bits;
-         if (mbits >= sbits)
-         {
-            /* In stereo mode, we do not apply a scaling to the mid because we need the normalized
-               mid for folding later */
-            cm = quant_band(encode, m, i, X, NULL, N, mbits, spread, B, intensity, tf_change,
-                  lowband, ec, remaining_bits, LM, next_lowband_out1,
-                  NULL, next_level, seed, stereo ? Q15ONE : MULT16_16_P15(gain,mid), lowband_scratch, fill);
-            rebalance = mbits - (rebalance-*remaining_bits);
-            if (rebalance > 3<<BITRES && itheta!=0)
-               sbits += rebalance - (3<<BITRES);
-
-            /* For a stereo split, the high bits of fill are always zero, so no
-               folding will be done to the side. */
-            cm |= quant_band(encode, m, i, Y, NULL, N, sbits, spread, B, intensity, tf_change,
-                  next_lowband2, ec, remaining_bits, LM, NULL,
-                  NULL, next_level, seed, MULT16_16_P15(gain,side), NULL, fill>>B)<<((B0>>1)&(stereo-1));
-         } else {
-            /* For a stereo split, the high bits of fill are always zero, so no
-               folding will be done to the side. */
-            cm = quant_band(encode, m, i, Y, NULL, N, sbits, spread, B, intensity, tf_change,
-                  next_lowband2, ec, remaining_bits, LM, NULL,
-                  NULL, next_level, seed, MULT16_16_P15(gain,side), NULL, fill>>B)<<((B0>>1)&(stereo-1));
-            rebalance = sbits - (rebalance-*remaining_bits);
-            if (rebalance > 3<<BITRES && itheta!=16384)
-               mbits += rebalance - (3<<BITRES);
-            /* In stereo mode, we do not apply a scaling to the mid because we need the normalized
-               mid for folding later */
-            cm |= quant_band(encode, m, i, X, NULL, N, mbits, spread, B, intensity, tf_change,
-                  lowband, ec, remaining_bits, LM, next_lowband_out1,
-                  NULL, next_level, seed, stereo ? Q15ONE : MULT16_16_P15(gain,mid), lowband_scratch, fill);
-         }
+            /* Corresponds to a forward-masking slope of 1.5 dB per 10 ms */
+            delta = IMIN(0, delta + (N<<BITRES>>(5-LM)));
       }
+      mbits = IMAX(0, IMIN(b, (b-delta)/2));
+      sbits = b-mbits;
+      ctx->remaining_bits -= qalloc;
+
+      if (lowband)
+         next_lowband2 = lowband+N; /* >32-bit split case */
 
+      rebalance = ctx->remaining_bits;
+      if (mbits >= sbits)
+      {
+         cm = quant_partition(ctx, X, N, mbits, B,
+               lowband, LM,
+               MULT16_16_P15(gain,mid), fill);
+         rebalance = mbits - (rebalance-ctx->remaining_bits);
+         if (rebalance > 3<<BITRES && itheta!=0)
+            sbits += rebalance - (3<<BITRES);
+         cm |= quant_partition(ctx, Y, N, sbits, B,
+               next_lowband2, LM,
+               MULT16_16_P15(gain,side), fill>>B)<<(B0>>1);
+      } else {
+         cm = quant_partition(ctx, Y, N, sbits, B,
+               next_lowband2, LM,
+               MULT16_16_P15(gain,side), fill>>B)<<(B0>>1);
+         rebalance = sbits - (rebalance-ctx->remaining_bits);
+         if (rebalance > 3<<BITRES && itheta!=16384)
+            mbits += rebalance - (3<<BITRES);
+         cm |= quant_partition(ctx, X, N, mbits, B,
+               lowband, LM,
+               MULT16_16_P15(gain,mid), fill);
+      }
    } else {
       /* This is the basic no-split case */
       q = bits2pulses(m, i, LM, b);
       curr_bits = pulses2bits(m, i, LM, q);
-      *remaining_bits -= curr_bits;
+      ctx->remaining_bits -= curr_bits;
 
       /* Ensures we can never bust the budget */
-      while (*remaining_bits < 0 && q > 0)
+      while (ctx->remaining_bits < 0 && q > 0)
       {
-         *remaining_bits += curr_bits;
+         ctx->remaining_bits += curr_bits;
          q--;
          curr_bits = pulses2bits(m, i, LM, q);
-         *remaining_bits -= curr_bits;
+         ctx->remaining_bits -= curr_bits;
       }
 
       if (q!=0)
@@ -1073,7 +1002,7 @@ static unsigned quant_band(int encode, const CELTMode *m, int i, celt_norm *X, c
          if (resynth)
          {
             unsigned cm_mask;
-            /*B can be as large as 16, so this shift might overflow an int on a
+            /* B can be as large as 16, so this shift might overflow an int on a
                16-bit platform; use a long to get defined behavior.*/
             cm_mask = (unsigned)(1UL<<B)-1;
             fill &= cm_mask;
@@ -1087,8 +1016,8 @@ static unsigned quant_band(int encode, const CELTMode *m, int i, celt_norm *X, c
                   /* Noise */
                   for (j=0;j<N;j++)
                   {
-                     *seed = celt_lcg_rand(*seed);
-                     X[j] = (celt_norm)((opus_int32)*seed>>20);
+                     ctx->seed = celt_lcg_rand(ctx->seed);
+                     X[j] = (celt_norm)((opus_int32)ctx->seed>>20);
                   }
                   cm = cm_mask;
                } else {
@@ -1096,10 +1025,10 @@ static unsigned quant_band(int encode, const CELTMode *m, int i, celt_norm *X, c
                   for (j=0;j<N;j++)
                   {
                      opus_val16 tmp;
-                     *seed = celt_lcg_rand(*seed);
+                     ctx->seed = celt_lcg_rand(ctx->seed);
                      /* About 48 dB below the "normal" folding level */
                      tmp = QCONST16(1.0f/256, 10);
-                     tmp = (*seed)&0x8000 ? tmp : -tmp;
+                     tmp = (ctx->seed)&0x8000 ? tmp : -tmp;
                      X[j] = lowband[j]+tmp;
                   }
                   cm = fill;
@@ -1110,64 +1039,307 @@ static unsigned quant_band(int encode, const CELTMode *m, int i, celt_norm *X, c
       }
    }
 
+   return cm;
+}
+
+
+/* This function is responsible for encoding and decoding a band for the mono case. */
+static unsigned quant_band(struct band_ctx *ctx, celt_norm *X,
+      int N, int b, int B, celt_norm *lowband,
+      int LM, celt_norm *lowband_out,
+      opus_val16 gain, celt_norm *lowband_scratch, int fill)
+{
+   int N0=N;
+   int N_B=N;
+   int N_B0;
+   int B0=B;
+   int time_divide=0;
+   int recombine=0;
+   int longBlocks;
+   unsigned cm=0;
+#ifdef RESYNTH
+   int resynth = 1;
+#else
+   int resynth = !ctx->encode;
+#endif
+   int k;
+   int encode;
+   int tf_change;
+
+   encode = ctx->encode;
+   tf_change = ctx->tf_change;
+
+   longBlocks = B0==1;
+
+   N_B /= B;
+   N_B0 = N_B;
+
+   /* Special case for one sample */
+   if (N==1)
+   {
+      return quant_band_n1(ctx, X, NULL, b, lowband_out);
+   }
+
+   if (tf_change>0)
+      recombine = tf_change;
+   /* Band recombining to increase frequency resolution */
+
+   if (lowband_scratch && lowband && (recombine || ((N_B&1) == 0 && tf_change<0) || B0>1))
+   {
+      int j;
+      for (j=0;j<N;j++)
+         lowband_scratch[j] = lowband[j];
+      lowband = lowband_scratch;
+   }
+
+   for (k=0;k<recombine;k++)
+   {
+      static const unsigned char bit_interleave_table[16]={
+            0,1,1,1,2,3,3,3,2,3,3,3,2,3,3,3
+      };
+      if (encode)
+         haar1(X, N>>k, 1<<k);
+      if (lowband)
+         haar1(lowband, N>>k, 1<<k);
+      fill = bit_interleave_table[fill&0xF]|bit_interleave_table[fill>>4]<<2;
+   }
+   B>>=recombine;
+   N_B<<=recombine;
+
+   /* Increasing the time resolution */
+   while ((N_B&1) == 0 && tf_change<0)
+   {
+      if (encode)
+         haar1(X, N_B, B);
+      if (lowband)
+         haar1(lowband, N_B, B);
+      fill |= fill<<B;
+      B <<= 1;
+      N_B >>= 1;
+      time_divide++;
+      tf_change++;
+   }
+   B0=B;
+   N_B0 = N_B;
+
+   /* Reorganize the samples in time order instead of frequency order */
+   if (B0>1)
+   {
+      if (encode)
+         deinterleave_hadamard(X, N_B>>recombine, B0<<recombine, longBlocks);
+      if (lowband)
+         deinterleave_hadamard(lowband, N_B>>recombine, B0<<recombine, longBlocks);
+   }
+
+   cm = quant_partition(ctx, X, N, b, B, lowband,
+         LM, gain, fill);
+
    /* This code is used by the decoder and by the resynthesis-enabled encoder */
    if (resynth)
    {
-      if (stereo)
+      /* Undo the sample reorganization going from time order to frequency order */
+      if (B0>1)
+         interleave_hadamard(X, N_B>>recombine, B0<<recombine, longBlocks);
+
+      /* Undo time-freq changes that we did earlier */
+      N_B = N_B0;
+      B = B0;
+      for (k=0;k<time_divide;k++)
       {
-         if (N!=2)
-            stereo_merge(X, Y, mid, N);
-         if (inv)
-         {
-            int j;
-            for (j=0;j<N;j++)
-               Y[j] = -Y[j];
-         }
-      } else if (level == 0)
+         B >>= 1;
+         N_B <<= 1;
+         cm |= cm>>B;
+         haar1(X, N_B, B);
+      }
+
+      for (k=0;k<recombine;k++)
       {
-         int k;
+         static const unsigned char bit_deinterleave_table[16]={
+               0x00,0x03,0x0C,0x0F,0x30,0x33,0x3C,0x3F,
+               0xC0,0xC3,0xCC,0xCF,0xF0,0xF3,0xFC,0xFF
+         };
+         cm = bit_deinterleave_table[cm];
+         haar1(X, N0>>k, 1<<k);
+      }
+      B<<=recombine;
 
-         /* Undo the sample reorganization going from time order to frequency order */
-         if (B0>1)
-            interleave_hadamard(X, N_B>>recombine, B0<<recombine, longBlocks);
+      /* Scale output for later folding */
+      if (lowband_out)
+      {
+         int j;
+         opus_val16 n;
+         n = celt_sqrt(SHL32(EXTEND32(N0),22));
+         for (j=0;j<N0;j++)
+            lowband_out[j] = MULT16_16_Q15(n,X[j]);
+      }
+      cm &= (1<<B)-1;
+   }
+   return cm;
+}
 
-         /* Undo time-freq changes that we did earlier */
-         N_B = N_B0;
-         B = B0;
-         for (k=0;k<time_divide;k++)
-         {
-            B >>= 1;
-            N_B <<= 1;
-            cm |= cm>>B;
-            haar1(X, N_B, B);
-         }
 
-         for (k=0;k<recombine;k++)
-         {
-            static const unsigned char bit_deinterleave_table[16]={
-              0x00,0x03,0x0C,0x0F,0x30,0x33,0x3C,0x3F,
-              0xC0,0xC3,0xCC,0xCF,0xF0,0xF3,0xFC,0xFF
-            };
-            cm = bit_deinterleave_table[cm];
-            haar1(X, N0>>k, 1<<k);
-         }
-         B<<=recombine;
+/* This function is responsible for encoding and decoding a band for the stereo case. */
+static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
+      int N, int b, int B, celt_norm *lowband,
+      int LM, celt_norm *lowband_out,
+      celt_norm *lowband_scratch, int fill)
+{
+   int imid=0, iside=0;
+   int inv = 0;
+   opus_val16 mid=0, side=0;
+   unsigned cm=0;
+#ifdef RESYNTH
+   int resynth = 1;
+#else
+   int resynth = !ctx->encode;
+#endif
+   int mbits, sbits, delta;
+   int itheta;
+   int qalloc;
+   struct split_ctx sctx;
+   int orig_fill;
+   int encode;
+   ec_ctx *ec;
+
+   encode = ctx->encode;
+   ec = ctx->ec;
+
+   /* Special case for one sample */
+   if (N==1)
+   {
+      return quant_band_n1(ctx, X, Y, b, lowband_out);
+   }
+
+   orig_fill = fill;
+
+   compute_theta(ctx, &sctx, X, Y, N, &b, B, B,
+         LM, 1, &fill);
+   inv = sctx.inv;
+   imid = sctx.imid;
+   iside = sctx.iside;
+   delta = sctx.delta;
+   itheta = sctx.itheta;
+   qalloc = sctx.qalloc;
+#ifdef FIXED_POINT
+   mid = imid;
+   side = iside;
+#else
+   mid = (1.f/32768)*imid;
+   side = (1.f/32768)*iside;
+#endif
 
-         /* Scale output for later folding */
-         if (lowband_out)
+   /* This is a special case for N=2 that only works for stereo and takes
+      advantage of the fact that mid and side are orthogonal to encode
+      the side with just one bit. */
+   if (N==2)
+   {
+      int c;
+      int sign=0;
+      celt_norm *x2, *y2;
+      mbits = b;
+      sbits = 0;
+      /* Only need one bit for the side. */
+      if (itheta != 0 && itheta != 16384)
+         sbits = 1<<BITRES;
+      mbits -= sbits;
+      c = itheta > 8192;
+      ctx->remaining_bits -= qalloc+sbits;
+
+      x2 = c ? Y : X;
+      y2 = c ? X : Y;
+      if (sbits)
+      {
+         if (encode)
          {
-            int j;
-            opus_val16 n;
-            n = celt_sqrt(SHL32(EXTEND32(N0),22));
-            for (j=0;j<N0;j++)
-               lowband_out[j] = MULT16_16_Q15(n,X[j]);
+            /* Here we only need to encode a sign for the side. */
+            sign = x2[0]*y2[1] - x2[1]*y2[0] < 0;
+            ec_enc_bits(ec, sign, 1);
+         } else {
+            sign = ec_dec_bits(ec, 1);
          }
-         cm &= (1<<B)-1;
+      }
+      sign = 1-2*sign;
+      /* We use orig_fill here because we want to fold the side, but if
+         itheta==16384, we'll have cleared the low bits of fill. */
+      cm = quant_band(ctx, x2, N, mbits, B, lowband,
+            LM, lowband_out, Q15ONE, lowband_scratch, orig_fill);
+      /* We don't split N=2 bands, so cm is either 1 or 0 (for a fold-collapse),
+         and there's no need to worry about mixing with the other channel. */
+      y2[0] = -sign*x2[1];
+      y2[1] = sign*x2[0];
+      if (resynth)
+      {
+         celt_norm tmp;
+         X[0] = MULT16_16_Q15(mid, X[0]);
+         X[1] = MULT16_16_Q15(mid, X[1]);
+         Y[0] = MULT16_16_Q15(side, Y[0]);
+         Y[1] = MULT16_16_Q15(side, Y[1]);
+         tmp = X[0];
+         X[0] = SUB16(tmp,Y[0]);
+         Y[0] = ADD16(tmp,Y[0]);
+         tmp = X[1];
+         X[1] = SUB16(tmp,Y[1]);
+         Y[1] = ADD16(tmp,Y[1]);
+      }
+   } else {
+      /* "Normal" split code */
+      opus_int32 rebalance;
+
+      mbits = IMAX(0, IMIN(b, (b-delta)/2));
+      sbits = b-mbits;
+      ctx->remaining_bits -= qalloc;
+
+      rebalance = ctx->remaining_bits;
+      if (mbits >= sbits)
+      {
+         /* In stereo mode, we do not apply a scaling to the mid because we need the normalized
+            mid for folding later. */
+         cm = quant_band(ctx, X, N, mbits, B,
+               lowband, LM, lowband_out,
+               Q15ONE, lowband_scratch, fill);
+         rebalance = mbits - (rebalance-ctx->remaining_bits);
+         if (rebalance > 3<<BITRES && itheta!=0)
+            sbits += rebalance - (3<<BITRES);
+
+         /* For a stereo split, the high bits of fill are always zero, so no
+            folding will be done to the side. */
+         cm |= quant_band(ctx, Y, N, sbits, B,
+               NULL, LM, NULL,
+               side, NULL, fill>>B);
+      } else {
+         /* For a stereo split, the high bits of fill are always zero, so no
+            folding will be done to the side. */
+         cm = quant_band(ctx, Y, N, sbits, B,
+               NULL, LM, NULL,
+               side, NULL, fill>>B);
+         rebalance = sbits - (rebalance-ctx->remaining_bits);
+         if (rebalance > 3<<BITRES && itheta!=16384)
+            mbits += rebalance - (3<<BITRES);
+         /* In stereo mode, we do not apply a scaling to the mid because we need the normalized
+            mid for folding later. */
+         cm |= quant_band(ctx, X, N, mbits, B,
+               lowband, LM, lowband_out,
+               Q15ONE, lowband_scratch, fill);
+      }
+   }
+
+
+   /* This code is used by the decoder and by the resynthesis-enabled encoder */
+   if (resynth)
+   {
+      if (N!=2)
+         stereo_merge(X, Y, mid, N);
+      if (inv)
+      {
+         int j;
+         for (j=0;j<N;j++)
+            Y[j] = -Y[j];
       }
    }
    return cm;
 }
 
+
 void quant_all_bands(int encode, const CELTMode *m, int start, int end,
       celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
       int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
@@ -1178,27 +1350,41 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
    const opus_int16 * OPUS_RESTRICT eBands = m->eBands;
    celt_norm * OPUS_RESTRICT norm, * OPUS_RESTRICT norm2;
    VARDECL(celt_norm, _norm);
-   VARDECL(celt_norm, lowband_scratch);
+   celt_norm *lowband_scratch;
    int B;
    int M;
    int lowband_offset;
    int update_lowband = 1;
    int C = Y_ != NULL ? 2 : 1;
+   int norm_offset;
 #ifdef RESYNTH
    int resynth = 1;
 #else
    int resynth = !encode;
 #endif
+   struct band_ctx ctx;
    SAVE_STACK;
 
    M = 1<<LM;
    B = shortBlocks ? M : 1;
-   ALLOC(_norm, C*M*eBands[m->nbEBands], celt_norm);
-   ALLOC(lowband_scratch, M*(eBands[m->nbEBands]-eBands[m->nbEBands-1]), celt_norm);
+   norm_offset = M*eBands[start];
+   /* No need to allocate norm for the last band because we don't need an
+      output in that band. */
+   ALLOC(_norm, C*(M*eBands[m->nbEBands-1]-norm_offset), celt_norm);
    norm = _norm;
-   norm2 = norm + M*eBands[m->nbEBands];
+   norm2 = norm + M*eBands[m->nbEBands-1]-norm_offset;
+   /* We can use the last band as scratch space because we don't need that
+      scratch space for the last band. */
+   lowband_scratch = X_+M*eBands[m->nbEBands-1];
 
    lowband_offset = 0;
+   ctx.bandE = bandE;
+   ctx.ec = ec;
+   ctx.encode = encode;
+   ctx.intensity = intensity;
+   ctx.m = m;
+   ctx.seed = *seed;
+   ctx.spread = spread;
    for (i=start;i<end;i++)
    {
       opus_int32 tell;
@@ -1210,6 +1396,10 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
       int tf_change=0;
       unsigned x_cm;
       unsigned y_cm;
+      int last;
+
+      ctx.i = i;
+      last = (i==end-1);
 
       X = X_+M*eBands[i];
       if (Y_!=NULL)
@@ -1223,6 +1413,7 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
       if (i != start)
          balance -= tell;
       remaining_bits = total_bits-tell-1;
+      ctx.remaining_bits = remaining_bits;
       if (i <= codedBands-1)
       {
          curr_balance = balance / IMIN(3, codedBands-i);
@@ -1235,26 +1426,30 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
             lowband_offset = i;
 
       tf_change = tf_res[i];
+      ctx.tf_change = tf_change;
       if (i>=m->effEBands)
       {
          X=norm;
          if (Y_!=NULL)
             Y = norm;
+         lowband_scratch = NULL;
       }
+      if (i==end-1)
+         lowband_scratch = NULL;
 
       /* Get a conservative estimate of the collapse_mask's for the bands we're
-          going to be folding from. */
+         going to be folding from. */
       if (lowband_offset != 0 && (spread!=SPREAD_AGGRESSIVE || B>1 || tf_change<0))
       {
          int fold_start;
          int fold_end;
          int fold_i;
          /* This ensures we never repeat spectral content within one band */
-         effective_lowband = IMAX(M*eBands[start], M*eBands[lowband_offset]-N);
+         effective_lowband = IMAX(0, M*eBands[lowband_offset]-norm_offset-N);
          fold_start = lowband_offset;
-         while(M*eBands[--fold_start] > effective_lowband);
+         while(M*eBands[--fold_start] > effective_lowband+norm_offset);
          fold_end = lowband_offset-1;
-         while(M*eBands[++fold_end] < effective_lowband+N);
+         while(M*eBands[++fold_end] < effective_lowband+norm_offset+N);
          x_cm = y_cm = 0;
          fold_i = fold_start; do {
            x_cm |= collapse_masks[fold_i*C+0];
@@ -1262,7 +1457,7 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
          } while (++fold_i<fold_end);
       }
       /* Otherwise, we'll be using the LCG to fold, so all blocks will (almost
-          always) be non-zero.*/
+         always) be non-zero. */
       else
          x_cm = y_cm = (1<<B)-1;
 
@@ -1270,33 +1465,42 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
       {
          int j;
 
-         /* Switch off dual stereo to do intensity */
+         /* Switch off dual stereo to do intensity. */
          dual_stereo = 0;
          if (resynth)
-            for (j=M*eBands[start];j<M*eBands[i];j++)
+            for (j=0;j<M*eBands[i]-norm_offset;j++)
                norm[j] = HALF32(norm[j]+norm2[j]);
       }
       if (dual_stereo)
       {
-         x_cm = quant_band(encode, m, i, X, NULL, N, b/2, spread, B, intensity, tf_change,
-               effective_lowband != -1 ? norm+effective_lowband : NULL, ec, &remaining_bits, LM,
-               norm+M*eBands[i], bandE, 0, seed, Q15ONE, lowband_scratch, x_cm);
-         y_cm = quant_band(encode, m, i, Y, NULL, N, b/2, spread, B, intensity, tf_change,
-               effective_lowband != -1 ? norm2+effective_lowband : NULL, ec, &remaining_bits, LM,
-               norm2+M*eBands[i], bandE, 0, seed, Q15ONE, lowband_scratch, y_cm);
+         x_cm = quant_band(&ctx, X, N, b/2, B,
+               effective_lowband != -1 ? norm+effective_lowband : NULL, LM,
+               last?NULL:norm+M*eBands[i]-norm_offset, Q15ONE, lowband_scratch, x_cm);
+         y_cm = quant_band(&ctx, Y, N, b/2, B,
+               effective_lowband != -1 ? norm2+effective_lowband : NULL, LM,
+               last?NULL:norm2+M*eBands[i]-norm_offset, Q15ONE, lowband_scratch, y_cm);
       } else {
-         x_cm = quant_band(encode, m, i, X, Y, N, b, spread, B, intensity, tf_change,
-               effective_lowband != -1 ? norm+effective_lowband : NULL, ec, &remaining_bits, LM,
-               norm+M*eBands[i], bandE, 0, seed, Q15ONE, lowband_scratch, x_cm|y_cm);
+         if (Y!=NULL)
+         {
+            x_cm = quant_band_stereo(&ctx, X, Y, N, b, B,
+                  effective_lowband != -1 ? norm+effective_lowband : NULL, LM,
+                        last?NULL:norm+M*eBands[i]-norm_offset, lowband_scratch, x_cm|y_cm);
+         } else {
+            x_cm = quant_band(&ctx, X, N, b, B,
+                  effective_lowband != -1 ? norm+effective_lowband : NULL, LM,
+                        last?NULL:norm+M*eBands[i]-norm_offset, Q15ONE, lowband_scratch, x_cm|y_cm);
+         }
          y_cm = x_cm;
       }
       collapse_masks[i*C+0] = (unsigned char)x_cm;
       collapse_masks[i*C+C-1] = (unsigned char)y_cm;
       balance += pulses[i] + tell;
 
-      /* Update the folding position only as long as we have 1 bit/sample depth */
+      /* Update the folding position only as long as we have 1 bit/sample depth. */
       update_lowband = b>(N<<BITRES);
    }
+   *seed = ctx.seed;
+
    RESTORE_STACK;
 }
 
diff --git a/celt/bands.h b/celt/bands.h
index 9ff8ffd..96ba52a 100644
--- a/celt/bands.h
+++ b/celt/bands.h
@@ -39,7 +39,7 @@
 /** Compute the amplitude (sqrt energy) in each of the bands
  * @param m Mode data
  * @param X Spectrum
- * @param bands Square root of the energy for each band (returned)
+ * @param bandE Square root of the energy for each band (returned)
  */
 void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int M);
 
@@ -49,16 +49,17 @@ void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *band
     equal to 1
  * @param m Mode data
  * @param X Spectrum (returned normalised)
- * @param bands Square root of the energy for each band
+ * @param bandE Square root of the energy for each band
  */
 void normalise_bands(const CELTMode *m, const celt_sig * OPUS_RESTRICT freq, celt_norm * OPUS_RESTRICT X, const celt_ener *bandE, int end, int C, int M);
 
 /** Denormalise each band of X to restore full amplitude
  * @param m Mode data
  * @param X Spectrum (returned de-normalised)
- * @param bands Square root of the energy for each band
+ * @param bandE Square root of the energy for each band
  */
-void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X, celt_sig * OPUS_RESTRICT freq, const celt_ener *bandE, int end, int C, int M);
+void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
+      celt_sig * OPUS_RESTRICT freq, const opus_val16 *bandE, int start, int end, int C, int M);
 
 #define SPREAD_NONE       (0)
 #define SPREAD_LIGHT      (1)
@@ -76,14 +77,30 @@ void measure_norm_mse(const CELTMode *m, float *X, float *X0, float *bandE, floa
 void haar1(celt_norm *X, int N0, int stride);
 
 /** Quantisation/encoding of the residual spectrum
+ * @param encode flag that indicates whether we're encoding (1) or decoding (0)
  * @param m Mode data
+ * @param start First band to process
+ * @param end Last band to process + 1
  * @param X Residual (normalised)
+ * @param Y Residual (normalised) for second channel (or NULL for mono)
+ * @param collapse_masks Anti-collapse tracking mask
+ * @param bandE Square root of the energy for each band
+ * @param pulses Bit allocation (per band) for PVQ
+ * @param shortBlocks Zero for long blocks, non-zero for short blocks
+ * @param spread Amount of spreading to use
+ * @param dual_stereo Zero for MS stereo, non-zero for dual stereo
+ * @param intensity First band to use intensity stereo
+ * @param tf_res Time-frequency resolution change
  * @param total_bits Total number of bits that can be used for the frame (including the ones already spent)
- * @param enc Entropy encoder
+ * @param balance Number of unallocated bits
+ * @param en Entropy coder state
+ * @param LM log2() of the number of 2.5 subframes in the frame
+ * @param codedBands Last band to receive bits + 1
+ * @param seed Random generator seed
  */
 void quant_all_bands(int encode, const CELTMode *m, int start, int end,
       celt_norm * X, celt_norm * Y, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
-      int time_domain, int fold, int dual_stereo, int intensity, int *tf_res,
+      int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
       opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed);
 
 void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
@@ -92,4 +109,6 @@ void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_mas
 
 opus_uint32 celt_lcg_rand(opus_uint32 seed);
 
+int hysteresis_decision(opus_val16 val, const opus_val16 *thresholds, const opus_val16 *hysteresis, int N, int prev);
+
 #endif /* BANDS_H */
diff --git a/celt/celt.c b/celt/celt.c
index 9bbe852..3e0ce6e 100644
--- a/celt/celt.c
+++ b/celt/celt.c
@@ -50,62 +50,12 @@
 #include "celt_lpc.h"
 #include "vq.h"
 
-#ifndef OPUS_VERSION
-#define OPUS_VERSION "unknown"
+#ifndef PACKAGE_VERSION
+#define PACKAGE_VERSION "unknown"
 #endif
 
-#ifdef CUSTOM_MODES
-#define OPUS_CUSTOM_NOSTATIC
-#else
-#define OPUS_CUSTOM_NOSTATIC static inline
-#endif
-
-static const unsigned char trim_icdf[11] = {126, 124, 119, 109, 87, 41, 19, 9, 4, 2, 0};
-/* Probs: NONE: 21.875%, LIGHT: 6.25%, NORMAL: 65.625%, AGGRESSIVE: 6.25% */
-static const unsigned char spread_icdf[4] = {25, 23, 2, 0};
-
-static const unsigned char tapset_icdf[3]={2,1,0};
-
-#ifdef CUSTOM_MODES
-static const unsigned char toOpusTable[20] = {
-      0xE0, 0xE8, 0xF0, 0xF8,
-      0xC0, 0xC8, 0xD0, 0xD8,
-      0xA0, 0xA8, 0xB0, 0xB8,
-      0x00, 0x00, 0x00, 0x00,
-      0x80, 0x88, 0x90, 0x98,
-};
-
-static const unsigned char fromOpusTable[16] = {
-      0x80, 0x88, 0x90, 0x98,
-      0x40, 0x48, 0x50, 0x58,
-      0x20, 0x28, 0x30, 0x38,
-      0x00, 0x08, 0x10, 0x18
-};
-
-static inline int toOpus(unsigned char c)
-{
-   int ret=0;
-   if (c<0xA0)
-      ret = toOpusTable[c>>3];
-   if (ret == 0)
-      return -1;
-   else
-      return ret|(c&0x7);
-}
 
-static inline int fromOpus(unsigned char c)
-{
-   if (c<0x80)
-      return -1;
-   else
-      return fromOpusTable[(c>>3)-16] | (c&0x7);
-}
-#endif /* CUSTOM_MODES */
-
-#define COMBFILTER_MAXPERIOD 1024
-#define COMBFILTER_MINPERIOD 15
-
-static int resampling_factor(opus_int32 rate)
+int resampling_factor(opus_int32 rate)
 {
    int ret;
    switch (rate)
@@ -135,658 +85,101 @@ static int resampling_factor(opus_int32 rate)
    return ret;
 }
 
-/** Encoder state
- @brief Encoder state
- */
-struct OpusCustomEncoder {
-   const OpusCustomMode *mode;     /**< Mode used by the encoder */
-   int overlap;
-   int channels;
-   int stream_channels;
-
-   int force_intra;
-   int clip;
-   int disable_pf;
-   int complexity;
-   int upsample;
-   int start, end;
-
-   opus_int32 bitrate;
-   int vbr;
-   int signalling;
-   int constrained_vbr;      /* If zero, VBR can do whatever it likes with the rate */
-   int loss_rate;
-   int lsb_depth;
-
-   /* Everything beyond this point gets cleared on a reset */
-#define ENCODER_RESET_START rng
-
-   opus_uint32 rng;
-   int spread_decision;
-   opus_val32 delayedIntra;
-   int tonal_average;
-   int lastCodedBands;
-   int hf_average;
-   int tapset_decision;
-
-   int prefilter_period;
-   opus_val16 prefilter_gain;
-   int prefilter_tapset;
-#ifdef RESYNTH
-   int prefilter_period_old;
-   opus_val16 prefilter_gain_old;
-   int prefilter_tapset_old;
-#endif
-   int consec_transient;
-
-   opus_val32 preemph_memE[2];
-   opus_val32 preemph_memD[2];
-
-   /* VBR-related parameters */
-   opus_int32 vbr_reservoir;
-   opus_int32 vbr_drift;
-   opus_int32 vbr_offset;
-   opus_int32 vbr_count;
-
-#ifdef RESYNTH
-   celt_sig syn_mem[2][2*MAX_PERIOD];
-#endif
-
-   celt_sig in_mem[1]; /* Size = channels*mode->overlap */
-   /* celt_sig prefilter_mem[],  Size = channels*COMBFILTER_MAXPERIOD */
-   /* opus_val16 oldBandE[],     Size = channels*mode->nbEBands */
-   /* opus_val16 oldLogE[],      Size = channels*mode->nbEBands */
-   /* opus_val16 oldLogE2[],     Size = channels*mode->nbEBands */
-#ifdef RESYNTH
-   /* opus_val16 overlap_mem[],  Size = channels*overlap */
-#endif
-};
-
-int celt_encoder_get_size(int channels)
-{
-   CELTMode *mode = opus_custom_mode_create(48000, 960, NULL);
-   return opus_custom_encoder_get_size(mode, channels);
-}
-
-OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_get_size(const CELTMode *mode, int channels)
-{
-   int size = sizeof(struct CELTEncoder)
-         + (channels*mode->overlap-1)*sizeof(celt_sig)    /* celt_sig in_mem[channels*mode->overlap]; */
-         + channels*COMBFILTER_MAXPERIOD*sizeof(celt_sig) /* celt_sig prefilter_mem[channels*COMBFILTER_MAXPERIOD]; */
-         + 3*channels*mode->nbEBands*sizeof(opus_val16);  /* opus_val16 oldBandE[channels*mode->nbEBands]; */
-                                                          /* opus_val16 oldLogE[channels*mode->nbEBands]; */
-                                                          /* opus_val16 oldLogE2[channels*mode->nbEBands]; */
-#ifdef RESYNTH
-   size += channels*mode->overlap*sizeof(celt_sig);       /* celt_sig overlap_mem[channels*mode->nbEBands]; */
-#endif
-   return size;
-}
-
-#ifdef CUSTOM_MODES
-CELTEncoder *opus_custom_encoder_create(const CELTMode *mode, int channels, int *error)
-{
-   int ret;
-   CELTEncoder *st = (CELTEncoder *)opus_alloc(opus_custom_encoder_get_size(mode, channels));
-   /* init will handle the NULL case */
-   ret = opus_custom_encoder_init(st, mode, channels);
-   if (ret != OPUS_OK)
-   {
-      opus_custom_encoder_destroy(st);
-      st = NULL;
-   }
-   if (error)
-      *error = ret;
-   return st;
-}
-#endif /* CUSTOM_MODES */
-
-int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels)
-{
-   int ret;
-   ret = opus_custom_encoder_init(st, opus_custom_mode_create(48000, 960, NULL), channels);
-   if (ret != OPUS_OK)
-      return ret;
-   st->upsample = resampling_factor(sampling_rate);
-   return OPUS_OK;
-}
-
-OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels)
-{
-   if (channels < 0 || channels > 2)
-      return OPUS_BAD_ARG;
-
-   if (st==NULL || mode==NULL)
-      return OPUS_ALLOC_FAIL;
-
-   OPUS_CLEAR((char*)st, opus_custom_encoder_get_size(mode, channels));
-
-   st->mode = mode;
-   st->overlap = mode->overlap;
-   st->stream_channels = st->channels = channels;
-
-   st->upsample = 1;
-   st->start = 0;
-   st->end = st->mode->effEBands;
-   st->signalling = 1;
-
-   st->constrained_vbr = 1;
-   st->clip = 1;
-
-   st->bitrate = OPUS_BITRATE_MAX;
-   st->vbr = 0;
-   st->force_intra  = 0;
-   st->complexity = 5;
-   st->lsb_depth=24;
-
-   opus_custom_encoder_ctl(st, OPUS_RESET_STATE);
-
-   return OPUS_OK;
-}
-
-#ifdef CUSTOM_MODES
-void opus_custom_encoder_destroy(CELTEncoder *st)
-{
-   opus_free(st);
-}
-#endif /* CUSTOM_MODES */
-
-static inline opus_val16 SIG2WORD16(celt_sig x)
-{
-#ifdef FIXED_POINT
-   x = PSHR32(x, SIG_SHIFT);
-   x = MAX32(x, -32768);
-   x = MIN32(x, 32767);
-   return EXTRACT16(x);
-#else
-   return (opus_val16)x;
-#endif
-}
-
-static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int C,
-                              int overlap)
+#ifndef OVERRIDE_COMB_FILTER_CONST
+static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
 {
+   opus_val32 x0, x1, x2, x3, x4;
    int i;
-   VARDECL(opus_val16, tmp);
-   opus_val32 mem0=0,mem1=0;
-   int is_transient = 0;
-   int block;
-   int N;
-   VARDECL(opus_val16, bins);
-   SAVE_STACK;
-   ALLOC(tmp, len, opus_val16);
-
-   block = overlap/2;
-   N=len/block;
-   ALLOC(bins, N, opus_val16);
-   if (C==1)
-   {
-      for (i=0;i<len;i++)
-         tmp[i] = SHR32(in[i],SIG_SHIFT);
-   } else {
-      for (i=0;i<len;i++)
-         tmp[i] = SHR32(ADD32(in[i],in[i+len]), SIG_SHIFT+1);
-   }
-
-   /* High-pass filter: (1 - 2*z^-1 + z^-2) / (1 - z^-1 + .5*z^-2) */
-   for (i=0;i<len;i++)
-   {
-      opus_val32 x,y;
-      x = tmp[i];
-      y = ADD32(mem0, x);
-#ifdef FIXED_POINT
-      mem0 = mem1 + y - SHL32(x,1);
-      mem1 = x - SHR32(y,1);
-#else
-      mem0 = mem1 + y - 2*x;
-      mem1 = x - .5f*y;
-#endif
-      tmp[i] = EXTRACT16(SHR32(y,2));
-   }
-   /* First few samples are bad because we don't propagate the memory */
-   for (i=0;i<12;i++)
-      tmp[i] = 0;
-
+   x4 = x[-T-2];
+   x3 = x[-T-1];
+   x2 = x[-T];
+   x1 = x[-T+1];
    for (i=0;i<N;i++)
    {
-      int j;
-      opus_val16 max_abs=0;
-      for (j=0;j<block;j++)
-         max_abs = MAX16(max_abs, ABS16(tmp[i*block+j]));
-      bins[i] = max_abs;
-   }
-   for (i=0;i<N;i++)
-   {
-      int j;
-      int conseq=0;
-      opus_val16 t1, t2, t3;
-
-      t1 = MULT16_16_Q15(QCONST16(.15f, 15), bins[i]);
-      t2 = MULT16_16_Q15(QCONST16(.4f, 15), bins[i]);
-      t3 = MULT16_16_Q15(QCONST16(.15f, 15), bins[i]);
-      for (j=0;j<i;j++)
-      {
-         if (bins[j] < t1)
-            conseq++;
-         if (bins[j] < t2)
-            conseq++;
-         else
-            conseq = 0;
-      }
-      if (conseq>=3)
-         is_transient=1;
-      conseq = 0;
-      for (j=i+1;j<N;j++)
-      {
-         if (bins[j] < t3)
-            conseq++;
-         else
-            conseq = 0;
-      }
-      if (conseq>=7)
-         is_transient=1;
+      x0=x[i-T+2];
+      y[i] = x[i]
+               + MULT16_32_Q15(g10,x2)
+               + MULT16_32_Q15(g11,ADD32(x1,x3))
+               + MULT16_32_Q15(g12,ADD32(x0,x4));
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
    }
-   RESTORE_STACK;
-#ifdef FUZZING
-   is_transient = rand()&0x1;
-#endif
-   return is_transient;
-}
 
-/** Apply window and compute the MDCT for all sub-frames and
-    all channels in a frame */
-static void compute_mdcts(const CELTMode *mode, int shortBlocks, celt_sig * OPUS_RESTRICT in, celt_sig * OPUS_RESTRICT out, int C, int LM)
-{
-   if (C==1 && !shortBlocks)
-   {
-      const int overlap = OVERLAP(mode);
-      clt_mdct_forward(&mode->mdct, in, out, mode->window, overlap, mode->maxLM-LM, 1);
-   } else {
-      const int overlap = OVERLAP(mode);
-      int N = mode->shortMdctSize<<LM;
-      int B = 1;
-      int b, c;
-      if (shortBlocks)
-      {
-         N = mode->shortMdctSize;
-         B = shortBlocks;
-      }
-      c=0; do {
-         for (b=0;b<B;b++)
-         {
-            /* Interleaving the sub-frames while doing the MDCTs */
-            clt_mdct_forward(&mode->mdct, in+c*(B*N+overlap)+b*N, &out[b+c*N*B], mode->window, overlap, shortBlocks ? mode->maxLM : mode->maxLM-LM, B);
-         }
-      } while (++c<C);
-   }
-}
-
-/** Compute the IMDCT and apply window for all sub-frames and
-    all channels in a frame */
-static void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X,
-      celt_sig * OPUS_RESTRICT out_mem[],
-      celt_sig * OPUS_RESTRICT overlap_mem[], int C, int LM)
-{
-   int c;
-   const int N = mode->shortMdctSize<<LM;
-   const int overlap = OVERLAP(mode);
-   VARDECL(opus_val32, x);
-   SAVE_STACK;
-
-   ALLOC(x, N+overlap, opus_val32);
-   c=0; do {
-      int j;
-      int b;
-      int N2 = N;
-      int B = 1;
-
-      if (shortBlocks)
-      {
-         N2 = mode->shortMdctSize;
-         B = shortBlocks;
-      }
-      /* Prevents problems from the imdct doing the overlap-add */
-      OPUS_CLEAR(x, overlap);
-
-      for (b=0;b<B;b++)
-      {
-         /* IMDCT on the interleaved the sub-frames */
-         clt_mdct_backward(&mode->mdct, &X[b+c*N2*B], x+N2*b, mode->window, overlap, shortBlocks ? mode->maxLM : mode->maxLM-LM, B);
-      }
-
-      for (j=0;j<overlap;j++)
-         out_mem[c][j] = x[j] + overlap_mem[c][j];
-      for (;j<N;j++)
-         out_mem[c][j] = x[j];
-      for (j=0;j<overlap;j++)
-         overlap_mem[c][j] = x[N+j];
-   } while (++c<C);
-   RESTORE_STACK;
-}
-
-static void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem)
-{
-   int c;
-   int count=0;
-   c=0; do {
-      int j;
-      celt_sig * OPUS_RESTRICT x;
-      opus_val16  * OPUS_RESTRICT y;
-      celt_sig m = mem[c];
-      x =in[c];
-      y = pcm+c;
-      for (j=0;j<N;j++)
-      {
-         celt_sig tmp = *x + m;
-         m = MULT16_32_Q15(coef[0], tmp)
-           - MULT16_32_Q15(coef[1], *x);
-         tmp = SHL32(MULT16_32_Q15(coef[3], tmp), 2);
-         x++;
-         /* Technically the store could be moved outside of the if because
-            the stores we don't want will just be overwritten */
-         if (count==0)
-            *y = SCALEOUT(SIG2WORD16(tmp));
-         if (++count==downsample)
-         {
-            y+=C;
-            count=0;
-         }
-      }
-      mem[c] = m;
-   } while (++c<C);
 }
+#endif
 
-static void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
+void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
       const opus_val16 *window, int overlap)
 {
    int i;
    /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */
    opus_val16 g00, g01, g02, g10, g11, g12;
+   opus_val32 x0, x1, x2, x3, x4;
    static const opus_val16 gains[3][3] = {
          {QCONST16(0.3066406250f, 15), QCONST16(0.2170410156f, 15), QCONST16(0.1296386719f, 15)},
          {QCONST16(0.4638671875f, 15), QCONST16(0.2680664062f, 15), QCONST16(0.f, 15)},
          {QCONST16(0.7998046875f, 15), QCONST16(0.1000976562f, 15), QCONST16(0.f, 15)}};
+
+   if (g0==0 && g1==0)
+   {
+      /* OPT: Happens to work without the OPUS_MOVE(), but only because the current encoder already copies x to y */
+      if (x!=y)
+         OPUS_MOVE(y, x, N);
+      return;
+   }
    g00 = MULT16_16_Q15(g0, gains[tapset0][0]);
    g01 = MULT16_16_Q15(g0, gains[tapset0][1]);
    g02 = MULT16_16_Q15(g0, gains[tapset0][2]);
    g10 = MULT16_16_Q15(g1, gains[tapset1][0]);
    g11 = MULT16_16_Q15(g1, gains[tapset1][1]);
    g12 = MULT16_16_Q15(g1, gains[tapset1][2]);
+   x1 = x[-T1+1];
+   x2 = x[-T1  ];
+   x3 = x[-T1-1];
+   x4 = x[-T1-2];
    for (i=0;i<overlap;i++)
    {
       opus_val16 f;
+      x0=x[i-T1+2];
       f = MULT16_16_Q15(window[i],window[i]);
       y[i] = x[i]
                + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g00),x[i-T0])
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g01),x[i-T0-1])
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g01),x[i-T0+1])
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g02),x[i-T0-2])
-               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g02),x[i-T0+2])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g10),x[i-T1])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g11),x[i-T1-1])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g11),x[i-T1+1])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g12),x[i-T1-2])
-               + MULT16_32_Q15(MULT16_16_Q15(f,g12),x[i-T1+2]);
+               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g01),ADD32(x[i-T0+1],x[i-T0-1]))
+               + MULT16_32_Q15(MULT16_16_Q15((Q15ONE-f),g02),ADD32(x[i-T0+2],x[i-T0-2]))
+               + MULT16_32_Q15(MULT16_16_Q15(f,g10),x2)
+               + MULT16_32_Q15(MULT16_16_Q15(f,g11),ADD32(x1,x3))
+               + MULT16_32_Q15(MULT16_16_Q15(f,g12),ADD32(x0,x4));
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
 
    }
-   for (i=overlap;i<N;i++)
-      y[i] = x[i]
-               + MULT16_32_Q15(g10,x[i-T1])
-               + MULT16_32_Q15(g11,x[i-T1-1])
-               + MULT16_32_Q15(g11,x[i-T1+1])
-               + MULT16_32_Q15(g12,x[i-T1-2])
-               + MULT16_32_Q15(g12,x[i-T1+2]);
+   if (g1==0)
+   {
+      /* OPT: Happens to work without the OPUS_MOVE(), but only because the current encoder already copies x to y */
+      if (x!=y)
+         OPUS_MOVE(y+overlap, x+overlap, N-overlap);
+      return;
+   }
+
+   /* Compute the part with the constant filter. */
+   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);
 }
 
-static const signed char tf_select_table[4][8] = {
+const signed char tf_select_table[4][8] = {
       {0, -1, 0, -1,    0,-1, 0,-1},
       {0, -1, 0, -2,    1, 0, 1,-1},
       {0, -2, 0, -3,    2, 0, 1,-1},
       {0, -2, 0, -3,    3, 0, 1,-1},
 };
 
-static opus_val32 l1_metric(const celt_norm *tmp, int N, int LM, int width)
-{
-   int i, j;
-   static const opus_val16 sqrtM_1[4] = {Q15ONE, QCONST16(.70710678f,15), QCONST16(0.5f,15), QCONST16(0.35355339f,15)};
-   opus_val32 L1;
-   opus_val16 bias;
-   L1=0;
-   for (i=0;i<1<<LM;i++)
-   {
-      opus_val32 L2 = 0;
-      for (j=0;j<N>>LM;j++)
-         L2 = MAC16_16(L2, tmp[(j<<LM)+i], tmp[(j<<LM)+i]);
-      L1 += celt_sqrt(L2);
-   }
-   L1 = MULT16_32_Q15(sqrtM_1[LM], L1);
-   if (width==1)
-      bias = QCONST16(.12f,15)*LM;
-   else if (width==2)
-      bias = QCONST16(.05f,15)*LM;
-   else
-      bias = QCONST16(.02f,15)*LM;
-   L1 = MAC16_32_Q15(L1, bias, L1);
-   return L1;
-}
-
-static int tf_analysis(const CELTMode *m, int len, int C, int isTransient,
-      int *tf_res, int nbCompressedBytes, celt_norm *X, int N0, int LM,
-      int start, int *tf_sum)
-{
-   int i;
-   VARDECL(int, metric);
-   int cost0;
-   int cost1;
-   VARDECL(int, path0);
-   VARDECL(int, path1);
-   VARDECL(celt_norm, tmp);
-   int lambda;
-   int tf_select=0;
-   SAVE_STACK;
-
-   if (nbCompressedBytes<15*C || start!=0)
-   {
-      *tf_sum = 0;
-      for (i=0;i<len;i++)
-         tf_res[i] = isTransient;
-      return 0;
-   }
-   if (nbCompressedBytes<40)
-      lambda = 12;
-   else if (nbCompressedBytes<60)
-      lambda = 6;
-   else if (nbCompressedBytes<100)
-      lambda = 4;
-   else
-      lambda = 3;
-
-   ALLOC(metric, len, int);
-   ALLOC(tmp, (m->eBands[len]-m->eBands[len-1])<<LM, celt_norm);
-   ALLOC(path0, len, int);
-   ALLOC(path1, len, int);
-
-   *tf_sum = 0;
-   for (i=0;i<len;i++)
-   {
-      int j, k, N;
-      opus_val32 L1, best_L1;
-      int best_level=0;
-      N = (m->eBands[i+1]-m->eBands[i])<<LM;
-      for (j=0;j<N;j++)
-         tmp[j] = X[j+(m->eBands[i]<<LM)];
-      /* Just add the right channel if we're in stereo */
-      if (C==2)
-         for (j=0;j<N;j++)
-            tmp[j] = ADD16(SHR16(tmp[j], 1),SHR16(X[N0+j+(m->eBands[i]<<LM)], 1));
-      L1 = l1_metric(tmp, N, isTransient ? LM : 0, N>>LM);
-      best_L1 = L1;
-      /*printf ("%f ", L1);*/
-      for (k=0;k<LM;k++)
-      {
-         int B;
-
-         if (isTransient)
-            B = (LM-k-1);
-         else
-            B = k+1;
-
-         if (isTransient)
-            haar1(tmp, N>>(LM-k), 1<<(LM-k));
-         else
-            haar1(tmp, N>>k, 1<<k);
-
-         L1 = l1_metric(tmp, N, B, N>>LM);
-
-         if (L1 < best_L1)
-         {
-            best_L1 = L1;
-            best_level = k+1;
-         }
-      }
-      /*printf ("%d ", isTransient ? LM-best_level : best_level);*/
-      if (isTransient)
-         metric[i] = best_level;
-      else
-         metric[i] = -best_level;
-      *tf_sum += metric[i];
-   }
-   /*printf("\n");*/
-   /* NOTE: Future optimized implementations could detect extreme transients and set
-      tf_select = 1 but so far we have not found a reliable way of making this useful */
-   tf_select = 0;
-
-   cost0 = 0;
-   cost1 = isTransient ? 0 : lambda;
-   /* Viterbi forward pass */
-   for (i=1;i<len;i++)
-   {
-      int curr0, curr1;
-      int from0, from1;
-
-      from0 = cost0;
-      from1 = cost1 + lambda;
-      if (from0 < from1)
-      {
-         curr0 = from0;
-         path0[i]= 0;
-      } else {
-         curr0 = from1;
-         path0[i]= 1;
-      }
-
-      from0 = cost0 + lambda;
-      from1 = cost1;
-      if (from0 < from1)
-      {
-         curr1 = from0;
-         path1[i]= 0;
-      } else {
-         curr1 = from1;
-         path1[i]= 1;
-      }
-      cost0 = curr0 + abs(metric[i]-tf_select_table[LM][4*isTransient+2*tf_select+0]);
-      cost1 = curr1 + abs(metric[i]-tf_select_table[LM][4*isTransient+2*tf_select+1]);
-   }
-   tf_res[len-1] = cost0 < cost1 ? 0 : 1;
-   /* Viterbi backward pass to check the decisions */
-   for (i=len-2;i>=0;i--)
-   {
-      if (tf_res[i+1] == 1)
-         tf_res[i] = path1[i+1];
-      else
-         tf_res[i] = path0[i+1];
-   }
-   RESTORE_STACK;
-#ifdef FUZZING
-   tf_select = rand()&0x1;
-   tf_res[0] = rand()&0x1;
-   for (i=1;i<len;i++)
-      tf_res[i] = tf_res[i-1] ^ ((rand()&0xF) == 0);
-#endif
-   return tf_select;
-}
-
-static void tf_encode(int start, int end, int isTransient, int *tf_res, int LM, int tf_select, ec_enc *enc)
-{
-   int curr, i;
-   int tf_select_rsv;
-   int tf_changed;
-   int logp;
-   opus_uint32 budget;
-   opus_uint32 tell;
-   budget = enc->storage*8;
-   tell = ec_tell(enc);
-   logp = isTransient ? 2 : 4;
-   /* Reserve space to code the tf_select decision. */
-   tf_select_rsv = LM>0 && tell+logp+1 <= budget;
-   budget -= tf_select_rsv;
-   curr = tf_changed = 0;
-   for (i=start;i<end;i++)
-   {
-      if (tell+logp<=budget)
-      {
-         ec_enc_bit_logp(enc, tf_res[i] ^ curr, logp);
-         tell = ec_tell(enc);
-         curr = tf_res[i];
-         tf_changed |= curr;
-      }
-      else
-         tf_res[i] = curr;
-      logp = isTransient ? 4 : 5;
-   }
-   /* Only code tf_select if it would actually make a difference. */
-   if (tf_select_rsv &&
-         tf_select_table[LM][4*isTransient+0+tf_changed]!=
-         tf_select_table[LM][4*isTransient+2+tf_changed])
-      ec_enc_bit_logp(enc, tf_select, 1);
-   else
-      tf_select = 0;
-   for (i=start;i<end;i++)
-      tf_res[i] = tf_select_table[LM][4*isTransient+2*tf_select+tf_res[i]];
-   /*printf("%d %d ", isTransient, tf_select); for(i=0;i<end;i++)printf("%d ", tf_res[i]);printf("\n");*/
-}
-
-static void tf_decode(int start, int end, int isTransient, int *tf_res, int LM, ec_dec *dec)
-{
-   int i, curr, tf_select;
-   int tf_select_rsv;
-   int tf_changed;
-   int logp;
-   opus_uint32 budget;
-   opus_uint32 tell;
-
-   budget = dec->storage*8;
-   tell = ec_tell(dec);
-   logp = isTransient ? 2 : 4;
-   tf_select_rsv = LM>0 && tell+logp+1<=budget;
-   budget -= tf_select_rsv;
-   tf_changed = curr = 0;
-   for (i=start;i<end;i++)
-   {
-      if (tell+logp<=budget)
-      {
-         curr ^= ec_dec_bit_logp(dec, logp);
-         tell = ec_tell(dec);
-         tf_changed |= curr;
-      }
-      tf_res[i] = curr;
-      logp = isTransient ? 4 : 5;
-   }
-   tf_select = 0;
-   if (tf_select_rsv &&
-     tf_select_table[LM][4*isTransient+0+tf_changed] !=
-     tf_select_table[LM][4*isTransient+2+tf_changed])
-   {
-      tf_select = ec_dec_bit_logp(dec, 1);
-   }
-   for (i=start;i<end;i++)
-   {
-      tf_res[i] = tf_select_table[LM][4*isTransient+2*tf_select+tf_res[i]];
-   }
-}
 
-static void init_caps(const CELTMode *m,int *cap,int LM,int C)
+void init_caps(const CELTMode *m,int *cap,int LM,int C)
 {
    int i;
    for (i=0;i<m->nbEBands;i++)
@@ -797,2082 +190,6 @@ static void init_caps(const CELTMode *m,int *cap,int LM,int C)
    }
 }
 
-static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
-      const opus_val16 *bandLogE, int end, int LM, int C, int N0)
-{
-   int i;
-   opus_val32 diff=0;
-   int c;
-   int trim_index = 5;
-   if (C==2)
-   {
-      opus_val16 sum = 0; /* Q10 */
-      /* Compute inter-channel correlation for low frequencies */
-      for (i=0;i<8;i++)
-      {
-         int j;
-         opus_val32 partial = 0;
-         for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
-            partial = MAC16_16(partial, X[j], X[N0+j]);
-         sum = ADD16(sum, EXTRACT16(SHR32(partial, 18)));
-      }
-      sum = MULT16_16_Q15(QCONST16(1.f/8, 15), sum);
-      /*printf ("%f\n", sum);*/
-      if (sum > QCONST16(.995f,10))
-         trim_index-=4;
-      else if (sum > QCONST16(.92f,10))
-         trim_index-=3;
-      else if (sum > QCONST16(.85f,10))
-         trim_index-=2;
-      else if (sum > QCONST16(.8f,10))
-         trim_index-=1;
-   }
-
-   /* Estimate spectral tilt */
-   c=0; do {
-      for (i=0;i<end-1;i++)
-      {
-         diff += bandLogE[i+c*m->nbEBands]*(opus_int32)(2+2*i-m->nbEBands);
-      }
-   } while (++c<C);
-   /* We divide by two here to avoid making the tilt larger for stereo as a
-      result of a bug in the loop above */
-   diff /= 2*C*(end-1);
-   /*printf("%f\n", diff);*/
-   if (diff > QCONST16(2.f, DB_SHIFT))
-      trim_index--;
-   if (diff > QCONST16(8.f, DB_SHIFT))
-      trim_index--;
-   if (diff < -QCONST16(4.f, DB_SHIFT))
-      trim_index++;
-   if (diff < -QCONST16(10.f, DB_SHIFT))
-      trim_index++;
-
-   if (trim_index<0)
-      trim_index = 0;
-   if (trim_index>10)
-      trim_index = 10;
-#ifdef FUZZING
-   trim_index = rand()%11;
-#endif
-   return trim_index;
-}
-
-static int stereo_analysis(const CELTMode *m, const celt_norm *X,
-      int LM, int N0)
-{
-   int i;
-   int thetas;
-   opus_val32 sumLR = EPSILON, sumMS = EPSILON;
-
-   /* Use the L1 norm to model the entropy of the L/R signal vs the M/S signal */
-   for (i=0;i<13;i++)
-   {
-      int j;
-      for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
-      {
-         opus_val32 L, R, M, S;
-         /* We cast to 32-bit first because of the -32768 case */
-         L = EXTEND32(X[j]);
-         R = EXTEND32(X[N0+j]);
-         M = ADD32(L, R);
-         S = SUB32(L, R);
-         sumLR = ADD32(sumLR, ADD32(ABS32(L), ABS32(R)));
-         sumMS = ADD32(sumMS, ADD32(ABS32(M), ABS32(S)));
-      }
-   }
-   sumMS = MULT16_32_Q15(QCONST16(0.707107f, 15), sumMS);
-   thetas = 13;
-   /* We don't need thetas for lower bands with LM<=1 */
-   if (LM<=1)
-      thetas -= 8;
-   return MULT16_32_Q15((m->eBands[13]<<(LM+1))+thetas, sumMS)
-         > MULT16_32_Q15(m->eBands[13]<<(LM+1), sumLR);
-}
-
-int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc)
-{
-   int i, c, N;
-   opus_int32 bits;
-   ec_enc _enc;
-   VARDECL(celt_sig, in);
-   VARDECL(celt_sig, freq);
-   VARDECL(celt_norm, X);
-   VARDECL(celt_ener, bandE);
-   VARDECL(opus_val16, bandLogE);
-   VARDECL(int, fine_quant);
-   VARDECL(opus_val16, error);
-   VARDECL(int, pulses);
-   VARDECL(int, cap);
-   VARDECL(int, offsets);
-   VARDECL(int, fine_priority);
-   VARDECL(int, tf_res);
-   VARDECL(unsigned char, collapse_masks);
-   celt_sig *prefilter_mem;
-   opus_val16 *oldBandE, *oldLogE, *oldLogE2;
-   int shortBlocks=0;
-   int isTransient=0;
-   const int CC = st->channels;
-   const int C = st->stream_channels;
-   int LM, M;
-   int tf_select;
-   int nbFilledBytes, nbAvailableBytes;
-   int effEnd;
-   int codedBands;
-   int tf_sum;
-   int alloc_trim;
-   int pitch_index=COMBFILTER_MINPERIOD;
-   opus_val16 gain1 = 0;
-   int intensity=0;
-   int dual_stereo=0;
-   int effectiveBytes;
-   opus_val16 pf_threshold;
-   int dynalloc_logp;
-   opus_int32 vbr_rate;
-   opus_int32 total_bits;
-   opus_int32 total_boost;
-   opus_int32 balance;
-   opus_int32 tell;
-   int prefilter_tapset=0;
-   int pf_on;
-   int anti_collapse_rsv;
-   int anti_collapse_on=0;
-   int silence=0;
-   ALLOC_STACK;
-
-   if (nbCompressedBytes<2 || pcm==NULL)
-     return OPUS_BAD_ARG;
-
-   frame_size *= st->upsample;
-   for (LM=0;LM<=st->mode->maxLM;LM++)
-      if (st->mode->shortMdctSize<<LM==frame_size)
-         break;
-   if (LM>st->mode->maxLM)
-      return OPUS_BAD_ARG;
-   M=1<<LM;
-   N = M*st->mode->shortMdctSize;
-
-   prefilter_mem = st->in_mem+CC*(st->overlap);
-   oldBandE = (opus_val16*)(st->in_mem+CC*(st->overlap+COMBFILTER_MAXPERIOD));
-   oldLogE = oldBandE + CC*st->mode->nbEBands;
-   oldLogE2 = oldLogE + CC*st->mode->nbEBands;
-
-   if (enc==NULL)
-   {
-      tell=1;
-      nbFilledBytes=0;
-   } else {
-      tell=ec_tell(enc);
-      nbFilledBytes=(tell+4)>>3;
-   }
-
-#ifdef CUSTOM_MODES
-   if (st->signalling && enc==NULL)
-   {
-      int tmp = (st->mode->effEBands-st->end)>>1;
-      st->end = IMAX(1, st->mode->effEBands-tmp);
-      compressed[0] = tmp<<5;
-      compressed[0] |= LM<<3;
-      compressed[0] |= (C==2)<<2;
-      /* Convert "standard mode" to Opus header */
-      if (st->mode->Fs==48000 && st->mode->shortMdctSize==120)
-      {
-         int c0 = toOpus(compressed[0]);
-         if (c0<0)
-            return OPUS_BAD_ARG;
-         compressed[0] = c0;
-      }
-      compressed++;
-      nbCompressedBytes--;
-   }
-#else
-   celt_assert(st->signalling==0);
-#endif
-
-   /* Can't produce more than 1275 output bytes */
-   nbCompressedBytes = IMIN(nbCompressedBytes,1275);
-   nbAvailableBytes = nbCompressedBytes - nbFilledBytes;
-
-   if (st->vbr && st->bitrate!=OPUS_BITRATE_MAX)
-   {
-      opus_int32 den=st->mode->Fs>>BITRES;
-      vbr_rate=(st->bitrate*frame_size+(den>>1))/den;
-#ifdef CUSTOM_MODES
-      if (st->signalling)
-         vbr_rate -= 8<<BITRES;
-#endif
-      effectiveBytes = vbr_rate>>(3+BITRES);
-   } else {
-      opus_int32 tmp;
-      vbr_rate = 0;
-      tmp = st->bitrate*frame_size;
-      if (tell>1)
-         tmp += tell;
-      if (st->bitrate!=OPUS_BITRATE_MAX)
-         nbCompressedBytes = IMAX(2, IMIN(nbCompressedBytes,
-               (tmp+4*st->mode->Fs)/(8*st->mode->Fs)-!!st->signalling));
-      effectiveBytes = nbCompressedBytes;
-   }
-
-   if (enc==NULL)
-   {
-      ec_enc_init(&_enc, compressed, nbCompressedBytes);
-      enc = &_enc;
-   }
-
-   if (vbr_rate>0)
-   {
-      /* Computes the max bit-rate allowed in VBR mode to avoid violating the
-          target rate and buffering.
-         We must do this up front so that bust-prevention logic triggers
-          correctly if we don't have enough bits. */
-      if (st->constrained_vbr)
-      {
-         opus_int32 vbr_bound;
-         opus_int32 max_allowed;
-         /* We could use any multiple of vbr_rate as bound (depending on the
-             delay).
-            This is clamped to ensure we use at least two bytes if the encoder
-             was entirely empty, but to allow 0 in hybrid mode. */
-         vbr_bound = vbr_rate;
-         max_allowed = IMIN(IMAX(tell==1?2:0,
-               (vbr_rate+vbr_bound-st->vbr_reservoir)>>(BITRES+3)),
-               nbAvailableBytes);
-         if(max_allowed < nbAvailableBytes)
-         {
-            nbCompressedBytes = nbFilledBytes+max_allowed;
-            nbAvailableBytes = max_allowed;
-            ec_enc_shrink(enc, nbCompressedBytes);
-         }
-      }
-   }
-   total_bits = nbCompressedBytes*8;
-
-   effEnd = st->end;
-   if (effEnd > st->mode->effEBands)
-      effEnd = st->mode->effEBands;
-
-   ALLOC(in, CC*(N+st->overlap), celt_sig);
-
-   /* Find pitch period and gain */
-   {
-      VARDECL(celt_sig, _pre);
-      celt_sig *pre[2];
-      SAVE_STACK;
-      ALLOC(_pre, CC*(N+COMBFILTER_MAXPERIOD), celt_sig);
-
-      pre[0] = _pre;
-      pre[1] = _pre + (N+COMBFILTER_MAXPERIOD);
-
-      silence = 1;
-      c=0; do {
-         int count = 0;
-         const opus_val16 * OPUS_RESTRICT pcmp = pcm+c;
-         celt_sig * OPUS_RESTRICT inp = in+c*(N+st->overlap)+st->overlap;
-
-         for (i=0;i<N;i++)
-         {
-            celt_sig x, tmp;
-
-            x = SCALEIN(*pcmp);
-#ifndef FIXED_POINT
-            if (!(x==x))
-               x = 0;
-            if (st->clip)
-               x = MAX32(-65536.f, MIN32(65536.f,x));
-#endif
-            if (++count==st->upsample)
-            {
-               count=0;
-               pcmp+=CC;
-            } else {
-               x = 0;
-            }
-            /* Apply pre-emphasis */
-            tmp = MULT16_16(st->mode->preemph[2], x);
-            *inp = tmp + st->preemph_memE[c];
-            st->preemph_memE[c] = MULT16_32_Q15(st->mode->preemph[1], *inp)
-                                   - MULT16_32_Q15(st->mode->preemph[0], tmp);
-            silence = silence && *inp == 0;
-            inp++;
-         }
-         OPUS_COPY(pre[c], prefilter_mem+c*COMBFILTER_MAXPERIOD, COMBFILTER_MAXPERIOD);
-         OPUS_COPY(pre[c]+COMBFILTER_MAXPERIOD, in+c*(N+st->overlap)+st->overlap, N);
-      } while (++c<CC);
-
-#ifdef FUZZING
-      if ((rand()&0x3F)==0)
-         silence = 1;
-#endif
-      if (tell==1)
-         ec_enc_bit_logp(enc, silence, 15);
-      else
-         silence=0;
-      if (silence)
-      {
-         /*In VBR mode there is no need to send more than the minimum. */
-         if (vbr_rate>0)
-         {
-            effectiveBytes=nbCompressedBytes=IMIN(nbCompressedBytes, nbFilledBytes+2);
-            total_bits=nbCompressedBytes*8;
-            nbAvailableBytes=2;
-            ec_enc_shrink(enc, nbCompressedBytes);
-         }
-         /* Pretend we've filled all the remaining bits with zeros
-            (that's what the initialiser did anyway) */
-         tell = nbCompressedBytes*8;
-         enc->nbits_total+=tell-ec_tell(enc);
-      }
-      if (nbAvailableBytes>12*C && st->start==0 && !silence && !st->disable_pf && st->complexity >= 5)
-      {
-         VARDECL(opus_val16, pitch_buf);
-         ALLOC(pitch_buf, (COMBFILTER_MAXPERIOD+N)>>1, opus_val16);
-
-         pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC);
-         pitch_search(pitch_buf+(COMBFILTER_MAXPERIOD>>1), pitch_buf, N,
-               COMBFILTER_MAXPERIOD-COMBFILTER_MINPERIOD, &pitch_index);
-         pitch_index = COMBFILTER_MAXPERIOD-pitch_index;
-
-         gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD,
-               N, &pitch_index, st->prefilter_period, st->prefilter_gain);
-         if (pitch_index > COMBFILTER_MAXPERIOD-2)
-            pitch_index = COMBFILTER_MAXPERIOD-2;
-         gain1 = MULT16_16_Q15(QCONST16(.7f,15),gain1);
-         if (st->loss_rate>2)
-            gain1 = HALF32(gain1);
-         if (st->loss_rate>4)
-            gain1 = HALF32(gain1);
-         if (st->loss_rate>8)
-            gain1 = 0;
-         prefilter_tapset = st->tapset_decision;
-      } else {
-         gain1 = 0;
-      }
-
-      /* Gain threshold for enabling the prefilter/postfilter */
-      pf_threshold = QCONST16(.2f,15);
-
-      /* Adjusting the threshold based on rate and continuity */
-      if (abs(pitch_index-st->prefilter_period)*10>pitch_index)
-         pf_threshold += QCONST16(.2f,15);
-      if (nbAvailableBytes<25)
-         pf_threshold += QCONST16(.1f,15);
-      if (nbAvailableBytes<35)
-         pf_threshold += QCONST16(.1f,15);
-      if (st->prefilter_gain > QCONST16(.4f,15))
-         pf_threshold -= QCONST16(.1f,15);
-      if (st->prefilter_gain > QCONST16(.55f,15))
-         pf_threshold -= QCONST16(.1f,15);
-
-      /* Hard threshold at 0.2 */
-      pf_threshold = MAX16(pf_threshold, QCONST16(.2f,15));
-      if (gain1<pf_threshold)
-      {
-         if(st->start==0 && tell+16<=total_bits)
-            ec_enc_bit_logp(enc, 0, 1);
-         gain1 = 0;
-         pf_on = 0;
-      } else {
-         /*This block is not gated by a total bits check only because
-           of the nbAvailableBytes check above.*/
-         int qg;
-         int octave;
-
-         if (ABS16(gain1-st->prefilter_gain)<QCONST16(.1f,15))
-            gain1=st->prefilter_gain;
-
-#ifdef FIXED_POINT
-         qg = ((gain1+1536)>>10)/3-1;
-#else
-         qg = (int)floor(.5f+gain1*32/3)-1;
-#endif
-         qg = IMAX(0, IMIN(7, qg));
-         ec_enc_bit_logp(enc, 1, 1);
-         pitch_index += 1;
-         octave = EC_ILOG(pitch_index)-5;
-         ec_enc_uint(enc, octave, 6);
-         ec_enc_bits(enc, pitch_index-(16<<octave), 4+octave);
-         pitch_index -= 1;
-         ec_enc_bits(enc, qg, 3);
-         if (ec_tell(enc)+2<=total_bits)
-            ec_enc_icdf(enc, prefilter_tapset, tapset_icdf, 2);
-         else
-           prefilter_tapset = 0;
-         gain1 = QCONST16(0.09375f,15)*(qg+1);
-         pf_on = 1;
-      }
-      /*printf("%d %f\n", pitch_index, gain1);*/
-
-      c=0; do {
-         int offset = st->mode->shortMdctSize-st->mode->overlap;
-         st->prefilter_period=IMAX(st->prefilter_period, COMBFILTER_MINPERIOD);
-         OPUS_COPY(in+c*(N+st->overlap), st->in_mem+c*(st->overlap), st->overlap);
-         if (offset)
-            comb_filter(in+c*(N+st->overlap)+st->overlap, pre[c]+COMBFILTER_MAXPERIOD,
-                  st->prefilter_period, st->prefilter_period, offset, -st->prefilter_gain, -st->prefilter_gain,
-                  st->prefilter_tapset, st->prefilter_tapset, NULL, 0);
-
-         comb_filter(in+c*(N+st->overlap)+st->overlap+offset, pre[c]+COMBFILTER_MAXPERIOD+offset,
-               st->prefilter_period, pitch_index, N-offset, -st->prefilter_gain, -gain1,
-               st->prefilter_tapset, prefilter_tapset, st->mode->window, st->mode->overlap);
-         OPUS_COPY(st->in_mem+c*(st->overlap), in+c*(N+st->overlap)+N, st->overlap);
-
-         if (N>COMBFILTER_MAXPERIOD)
-         {
-            OPUS_MOVE(prefilter_mem+c*COMBFILTER_MAXPERIOD, pre[c]+N, COMBFILTER_MAXPERIOD);
-         } else {
-            OPUS_MOVE(prefilter_mem+c*COMBFILTER_MAXPERIOD, prefilter_mem+c*COMBFILTER_MAXPERIOD+N, COMBFILTER_MAXPERIOD-N);
-            OPUS_MOVE(prefilter_mem+c*COMBFILTER_MAXPERIOD+COMBFILTER_MAXPERIOD-N, pre[c]+COMBFILTER_MAXPERIOD, N);
-         }
-      } while (++c<CC);
-
-      RESTORE_STACK;
-   }
-
-   isTransient = 0;
-   shortBlocks = 0;
-   if (LM>0 && ec_tell(enc)+3<=total_bits)
-   {
-      if (st->complexity > 1)
-      {
-         isTransient = transient_analysis(in, N+st->overlap, CC,
-                  st->overlap);
-         if (isTransient)
-            shortBlocks = M;
-      }
-      ec_enc_bit_logp(enc, isTransient, 3);
-   }
-
-   ALLOC(freq, CC*N, celt_sig); /**< Interleaved signal MDCTs */
-   ALLOC(bandE,st->mode->nbEBands*CC, celt_ener);
-   ALLOC(bandLogE,st->mode->nbEBands*CC, opus_val16);
-   /* Compute MDCTs */
-   compute_mdcts(st->mode, shortBlocks, in, freq, CC, LM);
-
-   if (CC==2&&C==1)
-   {
-      for (i=0;i<N;i++)
-         freq[i] = ADD32(HALF32(freq[i]), HALF32(freq[N+i]));
-   }
-   if (st->upsample != 1)
-   {
-      c=0; do
-      {
-         int bound = N/st->upsample;
-         for (i=0;i<bound;i++)
-            freq[c*N+i] *= st->upsample;
-         for (;i<N;i++)
-            freq[c*N+i] = 0;
-      } while (++c<C);
-   }
-   ALLOC(X, C*N, celt_norm);         /**< Interleaved normalised MDCTs */
-
-   compute_band_energies(st->mode, freq, bandE, effEnd, C, M);
-
-   amp2Log2(st->mode, effEnd, st->end, bandE, bandLogE, C);
-
-   /* Band normalisation */
-   normalise_bands(st->mode, freq, X, bandE, effEnd, C, M);
-
-   ALLOC(tf_res, st->mode->nbEBands, int);
-   tf_select = tf_analysis(st->mode, effEnd, C, isTransient, tf_res, effectiveBytes, X, N, LM, st->start, &tf_sum);
-   for (i=effEnd;i<st->end;i++)
-      tf_res[i] = tf_res[effEnd-1];
-
-   ALLOC(error, C*st->mode->nbEBands, opus_val16);
-   quant_coarse_energy(st->mode, st->start, st->end, effEnd, bandLogE,
-         oldBandE, total_bits, error, enc,
-         C, LM, nbAvailableBytes, st->force_intra,
-         &st->delayedIntra, st->complexity >= 4, st->loss_rate);
-
-   tf_encode(st->start, st->end, isTransient, tf_res, LM, tf_select, enc);
-
-   if (ec_tell(enc)+4<=total_bits)
-   {
-      if (shortBlocks || st->complexity < 3 
-          || nbAvailableBytes < 10*C || st->start!=0)
-      {
-         if (st->complexity == 0)
-            st->spread_decision = SPREAD_NONE;
-         else
-            st->spread_decision = SPREAD_NORMAL;
-      } else {
-         st->spread_decision = spreading_decision(st->mode, X,
-               &st->tonal_average, st->spread_decision, &st->hf_average,
-               &st->tapset_decision, pf_on&&!shortBlocks, effEnd, C, M);
-      }
-      ec_enc_icdf(enc, st->spread_decision, spread_icdf, 5);
-   }
-
-   ALLOC(cap, st->mode->nbEBands, int);
-   ALLOC(offsets, st->mode->nbEBands, int);
-
-   init_caps(st->mode,cap,LM,C);
-   for (i=0;i<st->mode->nbEBands;i++)
-      offsets[i] = 0;
-   /* Dynamic allocation code */
-   /* Make sure that dynamic allocation can't make us bust the budget */
-   if (effectiveBytes > 50 && LM>=1)
-   {
-      int t1, t2;
-      if (LM <= 1)
-      {
-         t1 = 3;
-         t2 = 5;
-      } else {
-         t1 = 2;
-         t2 = 4;
-      }
-      for (i=st->start+1;i<st->end-1;i++)
-      {
-         opus_val32 d2;
-         d2 = 2*bandLogE[i]-bandLogE[i-1]-bandLogE[i+1];
-         if (C==2)
-            d2 = HALF32(d2 + 2*bandLogE[i+st->mode->nbEBands]-
-                  bandLogE[i-1+st->mode->nbEBands]-bandLogE[i+1+st->mode->nbEBands]);
-#ifdef FUZZING
-         if((rand()&0xF)==0)
-         {
-            offsets[i] += 1;
-            if((rand()&0x3)==0)
-               offsets[i] += 1+(rand()&0x3);
-         }
-#else
-         if (d2 > SHL16(t1,DB_SHIFT))
-            offsets[i] += 1;
-         if (d2 > SHL16(t2,DB_SHIFT))
-            offsets[i] += 1;
-#endif
-      }
-   }
-   dynalloc_logp = 6;
-   total_bits<<=BITRES;
-   total_boost = 0;
-   tell = ec_tell_frac(enc);
-   for (i=st->start;i<st->end;i++)
-   {
-      int width, quanta;
-      int dynalloc_loop_logp;
-      int boost;
-      int j;
-      width = C*(st->mode->eBands[i+1]-st->mode->eBands[i])<<LM;
-      /* quanta is 6 bits, but no more than 1 bit/sample
-         and no less than 1/8 bit/sample */
-      quanta = IMIN(width<<BITRES, IMAX(6<<BITRES, width));
-      dynalloc_loop_logp = dynalloc_logp;
-      boost = 0;
-      for (j = 0; tell+(dynalloc_loop_logp<<BITRES) < total_bits-total_boost
-            && boost < cap[i]; j++)
-      {
-         int flag;
-         flag = j<offsets[i];
-         ec_enc_bit_logp(enc, flag, dynalloc_loop_logp);
-         tell = ec_tell_frac(enc);
-         if (!flag)
-            break;
-         boost += quanta;
-         total_boost += quanta;
-         dynalloc_loop_logp = 1;
-      }
-      /* Making dynalloc more likely */
-      if (j)
-         dynalloc_logp = IMAX(2, dynalloc_logp-1);
-      offsets[i] = boost;
-   }
-   alloc_trim = 5;
-   if (tell+(6<<BITRES) <= total_bits - total_boost)
-   {
-      alloc_trim = alloc_trim_analysis(st->mode, X, bandLogE,
-            st->end, LM, C, N);
-      ec_enc_icdf(enc, alloc_trim, trim_icdf, 7);
-      tell = ec_tell_frac(enc);
-   }
-
-   /* Variable bitrate */
-   if (vbr_rate>0)
-   {
-     opus_val16 alpha;
-     opus_int32 delta;
-     /* The target rate in 8th bits per frame */
-     opus_int32 target;
-     opus_int32 min_allowed;
-     int lm_diff = st->mode->maxLM - LM;
-
-     /* Don't attempt to use more than 510 kb/s, even for frames smaller than 20 ms.
-        The CELT allocator will just not be able to use more than that anyway. */
-     nbCompressedBytes = IMIN(nbCompressedBytes,1275>>(3-LM));
-     target = vbr_rate + (st->vbr_offset>>lm_diff) - ((40*C+20)<<BITRES);
-
-     /* Shortblocks get a large boost in bitrate, but since they
-        are uncommon long blocks are not greatly affected */
-     if (shortBlocks || tf_sum < -2*(st->end-st->start))
-        target = 7*target/4;
-     else if (tf_sum < -(st->end-st->start))
-        target = 3*target/2;
-     else if (M > 1)
-        target-=(target+14)/28;
-
-     /* The current offset is removed from the target and the space used
-        so far is added*/
-     target=target+tell;
-
-     /* In VBR mode the frame size must not be reduced so much that it would
-         result in the encoder running out of bits.
-        The margin of 2 bytes ensures that none of the bust-prevention logic
-         in the decoder will have triggered so far. */
-     min_allowed = ((tell+total_boost+(1<<(BITRES+3))-1)>>(BITRES+3)) + 2 - nbFilledBytes;
-
-     nbAvailableBytes = (target+(1<<(BITRES+2)))>>(BITRES+3);
-     nbAvailableBytes = IMAX(min_allowed,nbAvailableBytes);
-     nbAvailableBytes = IMIN(nbCompressedBytes,nbAvailableBytes+nbFilledBytes) - nbFilledBytes;
-
-     /* By how much did we "miss" the target on that frame */
-     delta = target - vbr_rate;
-
-     target=nbAvailableBytes<<(BITRES+3);
-
-     /*If the frame is silent we don't adjust our drift, otherwise
-       the encoder will shoot to very high rates after hitting a
-       span of silence, but we do allow the bitres to refill.
-       This means that we'll undershoot our target in CVBR/VBR modes
-       on files with lots of silence. */
-     if(silence)
-     {
-       nbAvailableBytes = 2;
-       target = 2*8<<BITRES;
-       delta = 0;
-     }
-
-     if (st->vbr_count < 970)
-     {
-        st->vbr_count++;
-        alpha = celt_rcp(SHL32(EXTEND32(st->vbr_count+20),16));
-     } else
-        alpha = QCONST16(.001f,15);
-     /* How many bits have we used in excess of what we're allowed */
-     if (st->constrained_vbr)
-        st->vbr_reservoir += target - vbr_rate;
-     /*printf ("%d\n", st->vbr_reservoir);*/
-
-     /* Compute the offset we need to apply in order to reach the target */
-     st->vbr_drift += (opus_int32)MULT16_32_Q15(alpha,(delta*(1<<lm_diff))-st->vbr_offset-st->vbr_drift);
-     st->vbr_offset = -st->vbr_drift;
-     /*printf ("%d\n", st->vbr_drift);*/
-
-     if (st->constrained_vbr && st->vbr_reservoir < 0)
-     {
-        /* We're under the min value -- increase rate */
-        int adjust = (-st->vbr_reservoir)/(8<<BITRES);
-        /* Unless we're just coding silence */
-        nbAvailableBytes += silence?0:adjust;
-        st->vbr_reservoir = 0;
-        /*printf ("+%d\n", adjust);*/
-     }
-     nbCompressedBytes = IMIN(nbCompressedBytes,nbAvailableBytes+nbFilledBytes);
-     /* This moves the raw bits to take into account the new compressed size */
-     ec_enc_shrink(enc, nbCompressedBytes);
-   }
-   if (C==2)
-   {
-      int effectiveRate;
-
-      /* Always use MS for 2.5 ms frames until we can do a better analysis */
-      if (LM!=0)
-         dual_stereo = stereo_analysis(st->mode, X, LM, N);
-
-      /* Account for coarse energy */
-      effectiveRate = (8*effectiveBytes - 80)>>LM;
-
-      /* effectiveRate in kb/s */
-      effectiveRate = 2*effectiveRate/5;
-      if (effectiveRate<35)
-         intensity = 8;
-      else if (effectiveRate<50)
-         intensity = 12;
-      else if (effectiveRate<68)
-         intensity = 16;
-      else if (effectiveRate<84)
-         intensity = 18;
-      else if (effectiveRate<102)
-         intensity = 19;
-      else if (effectiveRate<130)
-         intensity = 20;
-      else
-         intensity = 100;
-      intensity = IMIN(st->end,IMAX(st->start, intensity));
-   }
-
-   /* Bit allocation */
-   ALLOC(fine_quant, st->mode->nbEBands, int);
-   ALLOC(pulses, st->mode->nbEBands, int);
-   ALLOC(fine_priority, st->mode->nbEBands, int);
-
-   /* bits =           packet size                    - where we are - safety*/
-   bits = (((opus_int32)nbCompressedBytes*8)<<BITRES) - ec_tell_frac(enc) - 1;
-   anti_collapse_rsv = isTransient&&LM>=2&&bits>=((LM+2)<<BITRES) ? (1<<BITRES) : 0;
-   bits -= anti_collapse_rsv;
-   codedBands = compute_allocation(st->mode, st->start, st->end, offsets, cap,
-         alloc_trim, &intensity, &dual_stereo, bits, &balance, pulses,
-         fine_quant, fine_priority, C, LM, enc, 1, st->lastCodedBands);
-   st->lastCodedBands = codedBands;
-
-   quant_fine_energy(st->mode, st->start, st->end, oldBandE, error, fine_quant, enc, C);
-
-#ifdef MEASURE_NORM_MSE
-   float X0[3000];
-   float bandE0[60];
-   c=0; do
-      for (i=0;i<N;i++)
-         X0[i+c*N] = X[i+c*N];
-   while (++c<C);
-   for (i=0;i<C*st->mode->nbEBands;i++)
-      bandE0[i] = bandE[i];
-#endif
-
-   /* Residual quantisation */
-   ALLOC(collapse_masks, C*st->mode->nbEBands, unsigned char);
-   quant_all_bands(1, st->mode, st->start, st->end, X, C==2 ? X+N : NULL, collapse_masks,
-         bandE, pulses, shortBlocks, st->spread_decision, dual_stereo, intensity, tf_res,
-         nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv, balance, enc, LM, codedBands, &st->rng);
-
-   if (anti_collapse_rsv > 0)
-   {
-      anti_collapse_on = st->consec_transient<2;
-#ifdef FUZZING
-      anti_collapse_on = rand()&0x1;
-#endif
-      ec_enc_bits(enc, anti_collapse_on, 1);
-   }
-   quant_energy_finalise(st->mode, st->start, st->end, oldBandE, error, fine_quant, fine_priority, nbCompressedBytes*8-ec_tell(enc), enc, C);
-
-   if (silence)
-   {
-      for (i=0;i<C*st->mode->nbEBands;i++)
-         oldBandE[i] = -QCONST16(28.f,DB_SHIFT);
-   }
-
-#ifdef RESYNTH
-   /* Re-synthesis of the coded audio if required */
-   {
-      celt_sig *out_mem[2];
-      celt_sig *overlap_mem[2];
-
-      log2Amp(st->mode, st->start, st->end, bandE, oldBandE, C);
-      if (silence)
-      {
-         for (i=0;i<C*st->mode->nbEBands;i++)
-            bandE[i] = 0;
-      }
-
-#ifdef MEASURE_NORM_MSE
-      measure_norm_mse(st->mode, X, X0, bandE, bandE0, M, N, C);
-#endif
-      if (anti_collapse_on)
-      {
-         anti_collapse(st->mode, X, collapse_masks, LM, C, N,
-               st->start, st->end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
-      }
-
-      /* Synthesis */
-      denormalise_bands(st->mode, X, freq, bandE, effEnd, C, M);
-
-      OPUS_MOVE(st->syn_mem[0], st->syn_mem[0]+N, MAX_PERIOD);
-      if (CC==2)
-         OPUS_MOVE(st->syn_mem[1], st->syn_mem[1]+N, MAX_PERIOD);
-
-      c=0; do
-         for (i=0;i<M*st->mode->eBands[st->start];i++)
-            freq[c*N+i] = 0;
-      while (++c<C);
-      c=0; do
-         for (i=M*st->mode->eBands[st->end];i<N;i++)
-            freq[c*N+i] = 0;
-      while (++c<C);
-
-      if (CC==2&&C==1)
-      {
-         for (i=0;i<N;i++)
-            freq[N+i] = freq[i];
-      }
-
-      out_mem[0] = st->syn_mem[0]+MAX_PERIOD;
-      if (CC==2)
-         out_mem[1] = st->syn_mem[1]+MAX_PERIOD;
-
-      overlap_mem[0] = (celt_sig*)(oldLogE2 + CC*st->mode->nbEBands);
-      if (CC==2)
-         overlap_mem[1] = overlap_mem[0] + st->overlap;
-
-      compute_inv_mdcts(st->mode, shortBlocks, freq, out_mem, overlap_mem, CC, LM);
-
-      c=0; do {
-         st->prefilter_period=IMAX(st->prefilter_period, COMBFILTER_MINPERIOD);
-         st->prefilter_period_old=IMAX(st->prefilter_period_old, COMBFILTER_MINPERIOD);
-         comb_filter(out_mem[c], out_mem[c], st->prefilter_period_old, st->prefilter_period, st->mode->shortMdctSize,
-               st->prefilter_gain_old, st->prefilter_gain, st->prefilter_tapset_old, st->prefilter_tapset,
-               st->mode->window, st->overlap);
-         if (LM!=0)
-            comb_filter(out_mem[c]+st->mode->shortMdctSize, out_mem[c]+st->mode->shortMdctSize, st->prefilter_period, pitch_index, N-st->mode->shortMdctSize,
-                  st->prefilter_gain, gain1, st->prefilter_tapset, prefilter_tapset,
-                  st->mode->window, st->mode->overlap);
-      } while (++c<CC);
-
-      deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, st->mode->preemph, st->preemph_memD);
-      st->prefilter_period_old = st->prefilter_period;
-      st->prefilter_gain_old = st->prefilter_gain;
-      st->prefilter_tapset_old = st->prefilter_tapset;
-   }
-#endif
-
-   st->prefilter_period = pitch_index;
-   st->prefilter_gain = gain1;
-   st->prefilter_tapset = prefilter_tapset;
-#ifdef RESYNTH
-   if (LM!=0)
-   {
-      st->prefilter_period_old = st->prefilter_period;
-      st->prefilter_gain_old = st->prefilter_gain;
-      st->prefilter_tapset_old = st->prefilter_tapset;
-   }
-#endif
-
-   if (CC==2&&C==1) {
-      for (i=0;i<st->mode->nbEBands;i++)
-         oldBandE[st->mode->nbEBands+i]=oldBandE[i];
-   }
-
-   if (!isTransient)
-   {
-      for (i=0;i<CC*st->mode->nbEBands;i++)
-         oldLogE2[i] = oldLogE[i];
-      for (i=0;i<CC*st->mode->nbEBands;i++)
-         oldLogE[i] = oldBandE[i];
-   } else {
-      for (i=0;i<CC*st->mode->nbEBands;i++)
-         oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]);
-   }
-   /* In case start or end were to change */
-   c=0; do
-   {
-      for (i=0;i<st->start;i++)
-      {
-         oldBandE[c*st->mode->nbEBands+i]=0;
-         oldLogE[c*st->mode->nbEBands+i]=oldLogE2[c*st->mode->nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
-      }
-      for (i=st->end;i<st->mode->nbEBands;i++)
-      {
-         oldBandE[c*st->mode->nbEBands+i]=0;
-         oldLogE[c*st->mode->nbEBands+i]=oldLogE2[c*st->mode->nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
-      }
-   } while (++c<CC);
-
-   if (isTransient)
-      st->consec_transient++;
-   else
-      st->consec_transient=0;
-   st->rng = enc->rng;
-
-   /* If there's any room left (can only happen for very high rates),
-      it's already filled with zeros */
-   ec_enc_done(enc);
-
-#ifdef CUSTOM_MODES
-   if (st->signalling)
-      nbCompressedBytes++;
-#endif
-
-   RESTORE_STACK;
-   if (ec_get_error(enc))
-      return OPUS_INTERNAL_ERROR;
-   else
-      return nbCompressedBytes;
-}
-
-
-#ifdef CUSTOM_MODES
-
-#ifdef FIXED_POINT
-int opus_custom_encode(CELTEncoder * OPUS_RESTRICT st, const opus_int16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
-{
-   return celt_encode_with_ec(st, pcm, frame_size, compressed, nbCompressedBytes, NULL);
-}
-
-#ifndef DISABLE_FLOAT_API
-int opus_custom_encode_float(CELTEncoder * OPUS_RESTRICT st, const float * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
-{
-   int j, ret, C, N;
-   VARDECL(opus_int16, in);
-   ALLOC_STACK;
-
-   if (pcm==NULL)
-      return OPUS_BAD_ARG;
-
-   C = st->channels;
-   N = frame_size;
-   ALLOC(in, C*N, opus_int16);
-
-   for (j=0;j<C*N;j++)
-     in[j] = FLOAT2INT16(pcm[j]);
-
-   ret=celt_encode_with_ec(st,in,frame_size,compressed,nbCompressedBytes, NULL);
-#ifdef RESYNTH
-   for (j=0;j<C*N;j++)
-      ((float*)pcm)[j]=in[j]*(1.f/32768.f);
-#endif
-   RESTORE_STACK;
-   return ret;
-}
-#endif /* DISABLE_FLOAT_API */
-#else
-
-int opus_custom_encode(CELTEncoder * OPUS_RESTRICT st, const opus_int16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
-{
-   int j, ret, C, N;
-   VARDECL(celt_sig, in);
-   ALLOC_STACK;
-
-   if (pcm==NULL)
-      return OPUS_BAD_ARG;
-
-   C=st->channels;
-   N=frame_size;
-   ALLOC(in, C*N, celt_sig);
-   for (j=0;j<C*N;j++) {
-     in[j] = SCALEOUT(pcm[j]);
-   }
-
-   ret = celt_encode_with_ec(st,in,frame_size,compressed,nbCompressedBytes, NULL);
-#ifdef RESYNTH
-   for (j=0;j<C*N;j++)
-      ((opus_int16*)pcm)[j] = FLOAT2INT16(in[j]);
-#endif
-   RESTORE_STACK;
-   return ret;
-}
-
-int opus_custom_encode_float(CELTEncoder * OPUS_RESTRICT st, const float * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
-{
-   return celt_encode_with_ec(st, pcm, frame_size, compressed, nbCompressedBytes, NULL);
-}
-
-#endif
-
-#endif /* CUSTOM_MODES */
-
-int opus_custom_encoder_ctl(CELTEncoder * OPUS_RESTRICT st, int request, ...)
-{
-   va_list ap;
-
-   va_start(ap, request);
-   switch (request)
-   {
-      case OPUS_SET_COMPLEXITY_REQUEST:
-      {
-         int value = va_arg(ap, opus_int32);
-         if (value<0 || value>10)
-            goto bad_arg;
-         st->complexity = value;
-      }
-      break;
-      case CELT_SET_START_BAND_REQUEST:
-      {
-         opus_int32 value = va_arg(ap, opus_int32);
-         if (value<0 || value>=st->mode->nbEBands)
-            goto bad_arg;
-         st->start = value;
-      }
-      break;
-      case CELT_SET_END_BAND_REQUEST:
-      {
-         opus_int32 value = va_arg(ap, opus_int32);
-         if (value<1 || value>st->mode->nbEBands)
-            goto bad_arg;
-         st->end = value;
-      }
-      break;
-      case CELT_SET_PREDICTION_REQUEST:
-      {
-         int value = va_arg(ap, opus_int32);
-         if (value<0 || value>2)
-            goto bad_arg;
-         st->disable_pf = value<=1;
-         st->force_intra = value==0;
-      }
-      break;
-      case OPUS_SET_PACKET_LOSS_PERC_REQUEST:
-      {
-         int value = va_arg(ap, opus_int32);
-         if (value<0 || value>100)
-            goto bad_arg;
-         st->loss_rate = value;
-      }
-      break;
-      case OPUS_SET_VBR_CONSTRAINT_REQUEST:
-      {
-         opus_int32 value = va_arg(ap, opus_int32);
-         st->constrained_vbr = value;
-      }
-      break;
-      case OPUS_SET_VBR_REQUEST:
-      {
-         opus_int32 value = va_arg(ap, opus_int32);
-         st->vbr = value;
-      }
-      break;
-      case OPUS_SET_BITRATE_REQUEST:
-      {
-         opus_int32 value = va_arg(ap, opus_int32);
-         if (value<=500 && value!=OPUS_BITRATE_MAX)
-            goto bad_arg;
-         value = IMIN(value, 260000*st->channels);
-         st->bitrate = value;
-      }
-      break;
-      case CELT_SET_CHANNELS_REQUEST:
-      {
-         opus_int32 value = va_arg(ap, opus_int32);
-         if (value<1 || value>2)
-            goto bad_arg;
-         st->stream_channels = value;
-      }
-      break;
-      case OPUS_SET_LSB_DEPTH_REQUEST:
-      {
-          opus_int32 value = va_arg(ap, opus_int32);
-          if (value<8 || value>24)
-             goto bad_arg;
-          st->lsb_depth=value;
-      }
-      break;
-      case OPUS_GET_LSB_DEPTH_REQUEST:
-      {
-          opus_int32 *value = va_arg(ap, opus_int32*);
-          *value=st->lsb_depth;
-      }
-      break;
-      case OPUS_RESET_STATE:
-      {
-         int i;
-         opus_val16 *oldBandE, *oldLogE, *oldLogE2;
-         oldBandE = (opus_val16*)(st->in_mem+st->channels*(st->overlap+COMBFILTER_MAXPERIOD));
-         oldLogE = oldBandE + st->channels*st->mode->nbEBands;
-         oldLogE2 = oldLogE + st->channels*st->mode->nbEBands;
-         OPUS_CLEAR((char*)&st->ENCODER_RESET_START,
-               opus_custom_encoder_get_size(st->mode, st->channels)-
-               ((char*)&st->ENCODER_RESET_START - (char*)st));
-         for (i=0;i<st->channels*st->mode->nbEBands;i++)
-            oldLogE[i]=oldLogE2[i]=-QCONST16(28.f,DB_SHIFT);
-         st->vbr_offset = 0;
-         st->delayedIntra = 1;
-         st->spread_decision = SPREAD_NORMAL;
-         st->tonal_average = 256;
-         st->hf_average = 0;
-         st->tapset_decision = 0;
-      }
-      break;
-#ifdef CUSTOM_MODES
-      case CELT_SET_INPUT_CLIPPING_REQUEST:
-      {
-         opus_int32 value = va_arg(ap, opus_int32);
-         st->clip = value;
-      }
-      break;
-#endif
-      case CELT_SET_SIGNALLING_REQUEST:
-      {
-         opus_int32 value = va_arg(ap, opus_int32);
-         st->signalling = value;
-      }
-      break;
-      case CELT_GET_MODE_REQUEST:
-      {
-         const CELTMode ** value = va_arg(ap, const CELTMode**);
-         if (value==0)
-            goto bad_arg;
-         *value=st->mode;
-      }
-      break;
-      case OPUS_GET_FINAL_RANGE_REQUEST:
-      {
-         opus_uint32 * value = va_arg(ap, opus_uint32 *);
-         if (value==0)
-            goto bad_arg;
-         *value=st->rng;
-      }
-      break;
-      default:
-         goto bad_request;
-   }
-   va_end(ap);
-   return OPUS_OK;
-bad_arg:
-   va_end(ap);
-   return OPUS_BAD_ARG;
-bad_request:
-   va_end(ap);
-   return OPUS_UNIMPLEMENTED;
-}
-
-/**********************************************************************/
-/*                                                                    */
-/*                             DECODER                                */
-/*                                                                    */
-/**********************************************************************/
-#define DECODE_BUFFER_SIZE 2048
-
-/** Decoder state
- @brief Decoder state
- */
-struct OpusCustomDecoder {
-   const OpusCustomMode *mode;
-   int overlap;
-   int channels;
-   int stream_channels;
-
-   int downsample;
-   int start, end;
-   int signalling;
-
-   /* Everything beyond this point gets cleared on a reset */
-#define DECODER_RESET_START rng
-
-   opus_uint32 rng;
-   int error;
-   int last_pitch_index;
-   int loss_count;
-   int postfilter_period;
-   int postfilter_period_old;
-   opus_val16 postfilter_gain;
-   opus_val16 postfilter_gain_old;
-   int postfilter_tapset;
-   int postfilter_tapset_old;
-
-   celt_sig preemph_memD[2];
-
-   celt_sig _decode_mem[1]; /* Size = channels*(DECODE_BUFFER_SIZE+mode->overlap) */
-   /* opus_val16 lpc[],  Size = channels*LPC_ORDER */
-   /* opus_val16 oldEBands[], Size = 2*mode->nbEBands */
-   /* opus_val16 oldLogE[], Size = 2*mode->nbEBands */
-   /* opus_val16 oldLogE2[], Size = 2*mode->nbEBands */
-   /* opus_val16 backgroundLogE[], Size = 2*mode->nbEBands */
-};
-
-int celt_decoder_get_size(int channels)
-{
-   const CELTMode *mode = opus_custom_mode_create(48000, 960, NULL);
-   return opus_custom_decoder_get_size(mode, channels);
-}
-
-OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_get_size(const CELTMode *mode, int channels)
-{
-   int size = sizeof(struct CELTDecoder)
-            + (channels*(DECODE_BUFFER_SIZE+mode->overlap)-1)*sizeof(celt_sig)
-            + channels*LPC_ORDER*sizeof(opus_val16)
-            + 4*2*mode->nbEBands*sizeof(opus_val16);
-   return size;
-}
-
-#ifdef CUSTOM_MODES
-CELTDecoder *opus_custom_decoder_create(const CELTMode *mode, int channels, int *error)
-{
-   int ret;
-   CELTDecoder *st = (CELTDecoder *)opus_alloc(opus_custom_decoder_get_size(mode, channels));
-   ret = opus_custom_decoder_init(st, mode, channels);
-   if (ret != OPUS_OK)
-   {
-      opus_custom_decoder_destroy(st);
-      st = NULL;
-   }
-   if (error)
-      *error = ret;
-   return st;
-}
-#endif /* CUSTOM_MODES */
-
-int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels)
-{
-   int ret;
-   ret = opus_custom_decoder_init(st, opus_custom_mode_create(48000, 960, NULL), channels);
-   if (ret != OPUS_OK)
-      return ret;
-   st->downsample = resampling_factor(sampling_rate);
-   if (st->downsample==0)
-      return OPUS_BAD_ARG;
-   else
-      return OPUS_OK;
-}
-
-OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_init(CELTDecoder *st, const CELTMode *mode, int channels)
-{
-   if (channels < 0 || channels > 2)
-      return OPUS_BAD_ARG;
-
-   if (st==NULL)
-      return OPUS_ALLOC_FAIL;
-
-   OPUS_CLEAR((char*)st, opus_custom_decoder_get_size(mode, channels));
-
-   st->mode = mode;
-   st->overlap = mode->overlap;
-   st->stream_channels = st->channels = channels;
-
-   st->downsample = 1;
-   st->start = 0;
-   st->end = st->mode->effEBands;
-   st->signalling = 1;
-
-   st->loss_count = 0;
-
-   opus_custom_decoder_ctl(st, OPUS_RESET_STATE);
-
-   return OPUS_OK;
-}
-
-#ifdef CUSTOM_MODES
-void opus_custom_decoder_destroy(CELTDecoder *st)
-{
-   opus_free(st);
-}
-#endif /* CUSTOM_MODES */
-
-static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_RESTRICT pcm, int N, int LM)
-{
-   int c;
-   int pitch_index;
-   opus_val16 fade = Q15ONE;
-   int i, len;
-   const int C = st->channels;
-   int offset;
-   celt_sig *out_mem[2];
-   celt_sig *decode_mem[2];
-   celt_sig *overlap_mem[2];
-   opus_val16 *lpc;
-   opus_val32 *out_syn[2];
-   opus_val16 *oldBandE, *oldLogE, *oldLogE2, *backgroundLogE;
-   const OpusCustomMode *mode;
-   int nbEBands;
-   int overlap;
-   const opus_int16 *eBands;
-   SAVE_STACK;
-
-   mode = st->mode;
-   nbEBands = mode->nbEBands;
-   overlap = mode->overlap;
-   eBands = mode->eBands;
-
-   c=0; do {
-      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+st->overlap);
-      out_mem[c] = decode_mem[c]+DECODE_BUFFER_SIZE-MAX_PERIOD;
-      overlap_mem[c] = decode_mem[c]+DECODE_BUFFER_SIZE;
-   } while (++c<C);
-   lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+st->overlap)*C);
-   oldBandE = lpc+C*LPC_ORDER;
-   oldLogE = oldBandE + 2*nbEBands;
-   oldLogE2 = oldLogE + 2*nbEBands;
-   backgroundLogE = oldLogE2  + 2*nbEBands;
-
-   c=0; do {
-      out_syn[c] = out_mem[c]+MAX_PERIOD-N;
-   } while (++c<C);
-
-   len = N+overlap;
-
-   if (st->loss_count >= 5 || st->start!=0)
-   {
-      /* Noise-based PLC/CNG */
-      VARDECL(celt_sig, freq);
-      VARDECL(celt_norm, X);
-      VARDECL(celt_ener, bandE);
-      opus_uint32 seed;
-      int effEnd;
-
-      effEnd = st->end;
-      if (effEnd > mode->effEBands)
-         effEnd = mode->effEBands;
-
-      ALLOC(freq, C*N, celt_sig); /**< Interleaved signal MDCTs */
-      ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
-      ALLOC(bandE, nbEBands*C, celt_ener);
-
-      if (st->loss_count >= 5)
-         log2Amp(mode, st->start, st->end, bandE, backgroundLogE, C);
-      else {
-         /* Energy decay */
-         opus_val16 decay = st->loss_count==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
-         c=0; do
-         {
-            for (i=st->start;i<st->end;i++)
-               oldBandE[c*nbEBands+i] -= decay;
-         } while (++c<C);
-         log2Amp(mode, st->start, st->end, bandE, oldBandE, C);
-      }
-      seed = st->rng;
-      for (c=0;c<C;c++)
-      {
-         for (i=0;i<(st->mode->eBands[st->start]<<LM);i++)
-            X[c*N+i] = 0;
-         for (i=st->start;i<mode->effEBands;i++)
-         {
-            int j;
-            int boffs;
-            int blen;
-            boffs = N*c+(eBands[i]<<LM);
-            blen = (eBands[i+1]-eBands[i])<<LM;
-            for (j=0;j<blen;j++)
-            {
-               seed = celt_lcg_rand(seed);
-               X[boffs+j] = (celt_norm)((opus_int32)seed>>20);
-            }
-            renormalise_vector(X+boffs, blen, Q15ONE);
-         }
-         for (i=(st->mode->eBands[st->end]<<LM);i<N;i++)
-            X[c*N+i] = 0;
-      }
-      st->rng = seed;
-
-      denormalise_bands(mode, X, freq, bandE, mode->effEBands, C, 1<<LM);
-
-      c=0; do
-         for (i=0;i<st->mode->eBands[st->start]<<LM;i++)
-            freq[c*N+i] = 0;
-      while (++c<C);
-      c=0; do {
-         int bound = eBands[effEnd]<<LM;
-         if (st->downsample!=1)
-            bound = IMIN(bound, N/st->downsample);
-         for (i=bound;i<N;i++)
-            freq[c*N+i] = 0;
-      } while (++c<C);
-      c=0; do {
-         OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap);
-      } while (++c<C);
-      compute_inv_mdcts(mode, 0, freq, out_syn, overlap_mem, C, LM);
-   } else {
-      /* Pitch-based PLC */
-      VARDECL(opus_val32, etmp);
-
-      if (st->loss_count == 0)
-      {
-         opus_val16 pitch_buf[DECODE_BUFFER_SIZE>>1];
-         /* Corresponds to a min pitch of 67 Hz. It's possible to save CPU in this
-         search by using only part of the decode buffer */
-         int poffset = 720;
-         pitch_downsample(decode_mem, pitch_buf, DECODE_BUFFER_SIZE, C);
-         /* Max pitch is 100 samples (480 Hz) */
-         pitch_search(pitch_buf+((poffset)>>1), pitch_buf, DECODE_BUFFER_SIZE-poffset,
-               poffset-100, &pitch_index);
-         pitch_index = poffset-pitch_index;
-         st->last_pitch_index = pitch_index;
-      } else {
-         pitch_index = st->last_pitch_index;
-         fade = QCONST16(.8f,15);
-      }
-
-      ALLOC(etmp, overlap, opus_val32);
-      c=0; do {
-         opus_val16 exc[MAX_PERIOD];
-         opus_val32 ac[LPC_ORDER+1];
-         opus_val16 decay;
-         opus_val16 attenuation;
-         opus_val32 S1=0;
-         opus_val16 mem[LPC_ORDER]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-         opus_val32 *e = out_syn[c];
-
-
-         offset = MAX_PERIOD-pitch_index;
-         for (i=0;i<MAX_PERIOD;i++)
-            exc[i] = ROUND16(out_mem[c][i], SIG_SHIFT);
-
-         /* Compute LPC coefficients for the last MAX_PERIOD samples before the loss so we can
-            work in the excitation-filter domain */
-         if (st->loss_count == 0)
-         {
-            _celt_autocorr(exc, ac, mode->window, overlap,
-                  LPC_ORDER, MAX_PERIOD);
-
-            /* Noise floor -40 dB */
-#ifdef FIXED_POINT
-            ac[0] += SHR32(ac[0],13);
-#else
-            ac[0] *= 1.0001f;
-#endif
-            /* Lag windowing */
-            for (i=1;i<=LPC_ORDER;i++)
-            {
-               /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/
-#ifdef FIXED_POINT
-               ac[i] -= MULT16_32_Q15(2*i*i, ac[i]);
-#else
-               ac[i] -= ac[i]*(.008f*i)*(.008f*i);
-#endif
-            }
-
-            _celt_lpc(lpc+c*LPC_ORDER, ac, LPC_ORDER);
-         }
-         /* Samples just before the beginning of exc  */
-         for (i=0;i<LPC_ORDER;i++)
-            mem[i] = ROUND16(out_mem[c][-1-i], SIG_SHIFT);
-         /* Compute the excitation for MAX_PERIOD samples before the loss */
-         celt_fir(exc, lpc+c*LPC_ORDER, exc, MAX_PERIOD, LPC_ORDER, mem);
-
-         /* Check if the waveform is decaying (and if so how fast)
-            We do this to avoid adding energy when concealing in a segment
-            with decaying energy */
-         {
-            opus_val32 E1=1, E2=1;
-            int period;
-            if (pitch_index <= MAX_PERIOD/2)
-               period = pitch_index;
-            else
-               period = MAX_PERIOD/2;
-            for (i=0;i<period;i++)
-            {
-               E1 += SHR32(MULT16_16(exc[MAX_PERIOD-period+i],exc[MAX_PERIOD-period+i]),8);
-               E2 += SHR32(MULT16_16(exc[MAX_PERIOD-2*period+i],exc[MAX_PERIOD-2*period+i]),8);
-            }
-            if (E1 > E2)
-               E1 = E2;
-            decay = celt_sqrt(frac_div32(SHR32(E1,1),E2));
-            attenuation = decay;
-         }
-
-         /* Move memory one frame to the left */
-         OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap);
-
-         /* Extrapolate excitation with the right period, taking decay into account */
-         for (i=0;i<len;i++)
-         {
-            opus_val16 tmp;
-            if (offset+i >= MAX_PERIOD)
-            {
-               offset -= pitch_index;
-               attenuation = MULT16_16_Q15(attenuation, decay);
-            }
-            e[i] = SHL32(EXTEND32(MULT16_16_Q15(attenuation, exc[offset+i])), SIG_SHIFT);
-            /* Compute the energy of the previously decoded signal whose
-               excitation we're copying */
-            tmp = ROUND16(out_mem[c][-N+offset+i],SIG_SHIFT);
-            S1 += SHR32(MULT16_16(tmp,tmp),8);
-         }
-
-         /* Copy the last decoded samples (prior to the overlap region) to
-            synthesis filter memory so we can have a continuous signal. */
-         for (i=0;i<LPC_ORDER;i++)
-            mem[i] = ROUND16(out_mem[c][MAX_PERIOD-N-1-i], SIG_SHIFT);
-         /* Apply the fading if not the first loss */
-         for (i=0;i<len;i++)
-            e[i] = MULT16_32_Q15(fade, e[i]);
-         /* Synthesis filter -- back in the signal domain */
-         celt_iir(e, lpc+c*LPC_ORDER, e, len, LPC_ORDER, mem);
-
-         /* Check if the synthesis energy is higher than expected, which can
-            happen with the signal changes during our window. If so, attenuate. */
-         {
-            opus_val32 S2=0;
-            for (i=0;i<len;i++)
-            {
-               opus_val16 tmp = ROUND16(e[i],SIG_SHIFT);
-               S2 += SHR32(MULT16_16(tmp,tmp),8);
-            }
-            /* This checks for an "explosion" in the synthesis */
-#ifdef FIXED_POINT
-            if (!(S1 > SHR32(S2,2)))
-#else
-            /* Float test is written this way to catch NaNs at the same time */
-            if (!(S1 > 0.2f*S2))
-#endif
-            {
-               for (i=0;i<len;i++)
-                  e[i] = 0;
-            } else if (S1 < S2)
-            {
-               opus_val16 ratio = celt_sqrt(frac_div32(SHR32(S1,1)+1,S2+1));
-               for (i=0;i<overlap;i++)
-               {
-                  opus_val16 tmp_g = Q15ONE - MULT16_16_Q15(mode->window[i], Q15ONE-ratio);
-                  e[i] = MULT16_32_Q15(tmp_g, e[i]);
-               }
-               for (i=overlap;i<len;i++)
-                  e[i] = MULT16_32_Q15(ratio, e[i]);
-            }
-         }
-
-         /* Apply pre-filter to the MDCT overlap for the next frame because the
-            post-filter will be re-applied in the decoder after the MDCT overlap */
-         comb_filter(etmp, out_mem[c]+MAX_PERIOD, st->postfilter_period, st->postfilter_period, st->overlap,
-               -st->postfilter_gain, -st->postfilter_gain, st->postfilter_tapset, st->postfilter_tapset,
-               NULL, 0);
-
-         /* Simulate TDAC on the concealed audio so that it blends with the
-            MDCT of next frames. */
-         for (i=0;i<overlap/2;i++)
-         {
-            opus_val32 tmp;
-            tmp = MULT16_32_Q15(mode->window[i],           etmp[overlap-1-i]) +
-                  MULT16_32_Q15(mode->window[overlap-i-1], etmp[i          ]);
-            out_mem[c][MAX_PERIOD+i] = MULT16_32_Q15(mode->window[overlap-i-1], tmp);
-            out_mem[c][MAX_PERIOD+overlap-i-1] = MULT16_32_Q15(mode->window[i], tmp);
-         }
-      } while (++c<C);
-   }
-
-   deemphasis(out_syn, pcm, N, C, st->downsample, mode->preemph, st->preemph_memD);
-
-   st->loss_count++;
-
-   RESTORE_STACK;
-}
-
-int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec)
-{
-   int c, i, N;
-   int spread_decision;
-   opus_int32 bits;
-   ec_dec _dec;
-   VARDECL(celt_sig, freq);
-   VARDECL(celt_norm, X);
-   VARDECL(celt_ener, bandE);
-   VARDECL(int, fine_quant);
-   VARDECL(int, pulses);
-   VARDECL(int, cap);
-   VARDECL(int, offsets);
-   VARDECL(int, fine_priority);
-   VARDECL(int, tf_res);
-   VARDECL(unsigned char, collapse_masks);
-   celt_sig *out_mem[2];
-   celt_sig *decode_mem[2];
-   celt_sig *overlap_mem[2];
-   celt_sig *out_syn[2];
-   opus_val16 *lpc;
-   opus_val16 *oldBandE, *oldLogE, *oldLogE2, *backgroundLogE;
-
-   int shortBlocks;
-   int isTransient;
-   int intra_ener;
-   const int CC = st->channels;
-   int LM, M;
-   int effEnd;
-   int codedBands;
-   int alloc_trim;
-   int postfilter_pitch;
-   opus_val16 postfilter_gain;
-   int intensity=0;
-   int dual_stereo=0;
-   opus_int32 total_bits;
-   opus_int32 balance;
-   opus_int32 tell;
-   int dynalloc_logp;
-   int postfilter_tapset;
-   int anti_collapse_rsv;
-   int anti_collapse_on=0;
-   int silence;
-   int C = st->stream_channels;
-   ALLOC_STACK;
-
-   frame_size *= st->downsample;
-
-   c=0; do {
-      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+st->overlap);
-      out_mem[c] = decode_mem[c]+DECODE_BUFFER_SIZE-MAX_PERIOD;
-      overlap_mem[c] = decode_mem[c]+DECODE_BUFFER_SIZE;
-   } while (++c<CC);
-   lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+st->overlap)*CC);
-   oldBandE = lpc+CC*LPC_ORDER;
-   oldLogE = oldBandE + 2*st->mode->nbEBands;
-   oldLogE2 = oldLogE + 2*st->mode->nbEBands;
-   backgroundLogE = oldLogE2  + 2*st->mode->nbEBands;
-
-#ifdef CUSTOM_MODES
-   if (st->signalling && data!=NULL)
-   {
-      int data0=data[0];
-      /* Convert "standard mode" to Opus header */
-      if (st->mode->Fs==48000 && st->mode->shortMdctSize==120)
-      {
-         data0 = fromOpus(data0);
-         if (data0<0)
-            return OPUS_INVALID_PACKET;
-      }
-      st->end = IMAX(1, st->mode->effEBands-2*(data0>>5));
-      LM = (data0>>3)&0x3;
-      C = 1 + ((data0>>2)&0x1);
-      data++;
-      len--;
-      if (LM>st->mode->maxLM)
-         return OPUS_INVALID_PACKET;
-      if (frame_size < st->mode->shortMdctSize<<LM)
-         return OPUS_BUFFER_TOO_SMALL;
-      else
-         frame_size = st->mode->shortMdctSize<<LM;
-   } else {
-#else
-   {
-#endif
-      for (LM=0;LM<=st->mode->maxLM;LM++)
-         if (st->mode->shortMdctSize<<LM==frame_size)
-            break;
-      if (LM>st->mode->maxLM)
-         return OPUS_BAD_ARG;
-   }
-   M=1<<LM;
-
-   if (len<0 || len>1275 || pcm==NULL)
-      return OPUS_BAD_ARG;
-
-   N = M*st->mode->shortMdctSize;
-
-   effEnd = st->end;
-   if (effEnd > st->mode->effEBands)
-      effEnd = st->mode->effEBands;
-
-   if (data == NULL || len<=1)
-   {
-      celt_decode_lost(st, pcm, N, LM);
-      RESTORE_STACK;
-      return frame_size/st->downsample;
-   }
-
-   ALLOC(freq, IMAX(CC,C)*N, celt_sig); /**< Interleaved signal MDCTs */
-   ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
-   ALLOC(bandE, st->mode->nbEBands*C, celt_ener);
-   c=0; do
-      for (i=0;i<M*st->mode->eBands[st->start];i++)
-         X[c*N+i] = 0;
-   while (++c<C);
-   c=0; do
-      for (i=M*st->mode->eBands[effEnd];i<N;i++)
-         X[c*N+i] = 0;
-   while (++c<C);
-
-   if (dec == NULL)
-   {
-      ec_dec_init(&_dec,(unsigned char*)data,len);
-      dec = &_dec;
-   }
-
-   if (C==1)
-   {
-      for (i=0;i<st->mode->nbEBands;i++)
-         oldBandE[i]=MAX16(oldBandE[i],oldBandE[st->mode->nbEBands+i]);
-   }
-
-   total_bits = len*8;
-   tell = ec_tell(dec);
-
-   if (tell >= total_bits)
-      silence = 1;
-   else if (tell==1)
-      silence = ec_dec_bit_logp(dec, 15);
-   else
-      silence = 0;
-   if (silence)
-   {
-      /* Pretend we've read all the remaining bits */
-      tell = len*8;
-      dec->nbits_total+=tell-ec_tell(dec);
-   }
-
-   postfilter_gain = 0;
-   postfilter_pitch = 0;
-   postfilter_tapset = 0;
-   if (st->start==0 && tell+16 <= total_bits)
-   {
-      if(ec_dec_bit_logp(dec, 1))
-      {
-         int qg, octave;
-         octave = ec_dec_uint(dec, 6);
-         postfilter_pitch = (16<<octave)+ec_dec_bits(dec, 4+octave)-1;
-         qg = ec_dec_bits(dec, 3);
-         if (ec_tell(dec)+2<=total_bits)
-            postfilter_tapset = ec_dec_icdf(dec, tapset_icdf, 2);
-         postfilter_gain = QCONST16(.09375f,15)*(qg+1);
-      }
-      tell = ec_tell(dec);
-   }
-
-   if (LM > 0 && tell+3 <= total_bits)
-   {
-      isTransient = ec_dec_bit_logp(dec, 3);
-      tell = ec_tell(dec);
-   }
-   else
-      isTransient = 0;
-
-   if (isTransient)
-      shortBlocks = M;
-   else
-      shortBlocks = 0;
-
-   /* Decode the global flags (first symbols in the stream) */
-   intra_ener = tell+3<=total_bits ? ec_dec_bit_logp(dec, 3) : 0;
-   /* Get band energies */
-   unquant_coarse_energy(st->mode, st->start, st->end, oldBandE,
-         intra_ener, dec, C, LM);
-
-   ALLOC(tf_res, st->mode->nbEBands, int);
-   tf_decode(st->start, st->end, isTransient, tf_res, LM, dec);
-
-   tell = ec_tell(dec);
-   spread_decision = SPREAD_NORMAL;
-   if (tell+4 <= total_bits)
-      spread_decision = ec_dec_icdf(dec, spread_icdf, 5);
-
-   ALLOC(pulses, st->mode->nbEBands, int);
-   ALLOC(cap, st->mode->nbEBands, int);
-   ALLOC(offsets, st->mode->nbEBands, int);
-   ALLOC(fine_priority, st->mode->nbEBands, int);
-
-   init_caps(st->mode,cap,LM,C);
-
-   dynalloc_logp = 6;
-   total_bits<<=BITRES;
-   tell = ec_tell_frac(dec);
-   for (i=st->start;i<st->end;i++)
-   {
-      int width, quanta;
-      int dynalloc_loop_logp;
-      int boost;
-      width = C*(st->mode->eBands[i+1]-st->mode->eBands[i])<<LM;
-      /* quanta is 6 bits, but no more than 1 bit/sample
-         and no less than 1/8 bit/sample */
-      quanta = IMIN(width<<BITRES, IMAX(6<<BITRES, width));
-      dynalloc_loop_logp = dynalloc_logp;
-      boost = 0;
-      while (tell+(dynalloc_loop_logp<<BITRES) < total_bits && boost < cap[i])
-      {
-         int flag;
-         flag = ec_dec_bit_logp(dec, dynalloc_loop_logp);
-         tell = ec_tell_frac(dec);
-         if (!flag)
-            break;
-         boost += quanta;
-         total_bits -= quanta;
-         dynalloc_loop_logp = 1;
-      }
-      offsets[i] = boost;
-      /* Making dynalloc more likely */
-      if (boost>0)
-         dynalloc_logp = IMAX(2, dynalloc_logp-1);
-   }
-
-   ALLOC(fine_quant, st->mode->nbEBands, int);
-   alloc_trim = tell+(6<<BITRES) <= total_bits ?
-         ec_dec_icdf(dec, trim_icdf, 7) : 5;
-
-   bits = (((opus_int32)len*8)<<BITRES) - ec_tell_frac(dec) - 1;
-   anti_collapse_rsv = isTransient&&LM>=2&&bits>=((LM+2)<<BITRES) ? (1<<BITRES) : 0;
-   bits -= anti_collapse_rsv;
-   codedBands = compute_allocation(st->mode, st->start, st->end, offsets, cap,
-         alloc_trim, &intensity, &dual_stereo, bits, &balance, pulses,
-         fine_quant, fine_priority, C, LM, dec, 0, 0);
-
-   unquant_fine_energy(st->mode, st->start, st->end, oldBandE, fine_quant, dec, C);
-
-   /* Decode fixed codebook */
-   ALLOC(collapse_masks, C*st->mode->nbEBands, unsigned char);
-   quant_all_bands(0, st->mode, st->start, st->end, X, C==2 ? X+N : NULL, collapse_masks,
-         NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res,
-         len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng);
-
-   if (anti_collapse_rsv > 0)
-   {
-      anti_collapse_on = ec_dec_bits(dec, 1);
-   }
-
-   unquant_energy_finalise(st->mode, st->start, st->end, oldBandE,
-         fine_quant, fine_priority, len*8-ec_tell(dec), dec, C);
-
-   if (anti_collapse_on)
-      anti_collapse(st->mode, X, collapse_masks, LM, C, N,
-            st->start, st->end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
-
-   log2Amp(st->mode, st->start, st->end, bandE, oldBandE, C);
-
-   if (silence)
-   {
-      for (i=0;i<C*st->mode->nbEBands;i++)
-      {
-         bandE[i] = 0;
-         oldBandE[i] = -QCONST16(28.f,DB_SHIFT);
-      }
-   }
-   /* Synthesis */
-   denormalise_bands(st->mode, X, freq, bandE, effEnd, C, M);
-
-   OPUS_MOVE(decode_mem[0], decode_mem[0]+N, DECODE_BUFFER_SIZE-N);
-   if (CC==2)
-      OPUS_MOVE(decode_mem[1], decode_mem[1]+N, DECODE_BUFFER_SIZE-N);
-
-   c=0; do
-      for (i=0;i<M*st->mode->eBands[st->start];i++)
-         freq[c*N+i] = 0;
-   while (++c<C);
-   c=0; do {
-      int bound = M*st->mode->eBands[effEnd];
-      if (st->downsample!=1)
-         bound = IMIN(bound, N/st->downsample);
-      for (i=bound;i<N;i++)
-         freq[c*N+i] = 0;
-   } while (++c<C);
-
-   out_syn[0] = out_mem[0]+MAX_PERIOD-N;
-   if (CC==2)
-      out_syn[1] = out_mem[1]+MAX_PERIOD-N;
-
-   if (CC==2&&C==1)
-   {
-      for (i=0;i<N;i++)
-         freq[N+i] = freq[i];
-   }
-   if (CC==1&&C==2)
-   {
-      for (i=0;i<N;i++)
-         freq[i] = HALF32(ADD32(freq[i],freq[N+i]));
-   }
-
-   /* Compute inverse MDCTs */
-   compute_inv_mdcts(st->mode, shortBlocks, freq, out_syn, overlap_mem, CC, LM);
-
-   c=0; do {
-      st->postfilter_period=IMAX(st->postfilter_period, COMBFILTER_MINPERIOD);
-      st->postfilter_period_old=IMAX(st->postfilter_period_old, COMBFILTER_MINPERIOD);
-      comb_filter(out_syn[c], out_syn[c], st->postfilter_period_old, st->postfilter_period, st->mode->shortMdctSize,
-            st->postfilter_gain_old, st->postfilter_gain, st->postfilter_tapset_old, st->postfilter_tapset,
-            st->mode->window, st->overlap);
-      if (LM!=0)
-         comb_filter(out_syn[c]+st->mode->shortMdctSize, out_syn[c]+st->mode->shortMdctSize, st->postfilter_period, postfilter_pitch, N-st->mode->shortMdctSize,
-               st->postfilter_gain, postfilter_gain, st->postfilter_tapset, postfilter_tapset,
-               st->mode->window, st->mode->overlap);
-
-   } while (++c<CC);
-   st->postfilter_period_old = st->postfilter_period;
-   st->postfilter_gain_old = st->postfilter_gain;
-   st->postfilter_tapset_old = st->postfilter_tapset;
-   st->postfilter_period = postfilter_pitch;
-   st->postfilter_gain = postfilter_gain;
-   st->postfilter_tapset = postfilter_tapset;
-   if (LM!=0)
-   {
-      st->postfilter_period_old = st->postfilter_period;
-      st->postfilter_gain_old = st->postfilter_gain;
-      st->postfilter_tapset_old = st->postfilter_tapset;
-   }
-
-   if (C==1) {
-      for (i=0;i<st->mode->nbEBands;i++)
-         oldBandE[st->mode->nbEBands+i]=oldBandE[i];
-   }
-
-   /* In case start or end were to change */
-   if (!isTransient)
-   {
-      for (i=0;i<2*st->mode->nbEBands;i++)
-         oldLogE2[i] = oldLogE[i];
-      for (i=0;i<2*st->mode->nbEBands;i++)
-         oldLogE[i] = oldBandE[i];
-      for (i=0;i<2*st->mode->nbEBands;i++)
-         backgroundLogE[i] = MIN16(backgroundLogE[i] + M*QCONST16(0.001f,DB_SHIFT), oldBandE[i]);
-   } else {
-      for (i=0;i<2*st->mode->nbEBands;i++)
-         oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]);
-   }
-   c=0; do
-   {
-      for (i=0;i<st->start;i++)
-      {
-         oldBandE[c*st->mode->nbEBands+i]=0;
-         oldLogE[c*st->mode->nbEBands+i]=oldLogE2[c*st->mode->nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
-      }
-      for (i=st->end;i<st->mode->nbEBands;i++)
-      {
-         oldBandE[c*st->mode->nbEBands+i]=0;
-         oldLogE[c*st->mode->nbEBands+i]=oldLogE2[c*st->mode->nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
-      }
-   } while (++c<2);
-   st->rng = dec->rng;
-
-   deemphasis(out_syn, pcm, N, CC, st->downsample, st->mode->preemph, st->preemph_memD);
-   st->loss_count = 0;
-   RESTORE_STACK;
-   if (ec_tell(dec) > 8*len)
-      return OPUS_INTERNAL_ERROR;
-   if(ec_get_error(dec))
-      st->error = 1;
-   return frame_size/st->downsample;
-}
-
-
-#ifdef CUSTOM_MODES
-
-#ifdef FIXED_POINT
-int opus_custom_decode(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int16 * OPUS_RESTRICT pcm, int frame_size)
-{
-   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL);
-}
-
-#ifndef DISABLE_FLOAT_API
-int opus_custom_decode_float(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, float * OPUS_RESTRICT pcm, int frame_size)
-{
-   int j, ret, C, N;
-   VARDECL(opus_int16, out);
-   ALLOC_STACK;
-
-   if (pcm==NULL)
-      return OPUS_BAD_ARG;
-
-   C = st->channels;
-   N = frame_size;
-
-   ALLOC(out, C*N, opus_int16);
-   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL);
-   if (ret>0)
-      for (j=0;j<C*ret;j++)
-         pcm[j]=out[j]*(1.f/32768.f);
-
-   RESTORE_STACK;
-   return ret;
-}
-#endif /* DISABLE_FLOAT_API */
-
-#else
-
-int opus_custom_decode_float(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, float * OPUS_RESTRICT pcm, int frame_size)
-{
-   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL);
-}
-
-int opus_custom_decode(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int16 * OPUS_RESTRICT pcm, int frame_size)
-{
-   int j, ret, C, N;
-   VARDECL(celt_sig, out);
-   ALLOC_STACK;
-
-   if (pcm==NULL)
-      return OPUS_BAD_ARG;
-
-   C = st->channels;
-   N = frame_size;
-   ALLOC(out, C*N, celt_sig);
-
-   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL);
-
-   if (ret>0)
-      for (j=0;j<C*ret;j++)
-         pcm[j] = FLOAT2INT16 (out[j]);
-
-   RESTORE_STACK;
-   return ret;
-}
-
-#endif
-#endif /* CUSTOM_MODES */
-
-int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...)
-{
-   va_list ap;
-
-   va_start(ap, request);
-   switch (request)
-   {
-      case CELT_SET_START_BAND_REQUEST:
-      {
-         opus_int32 value = va_arg(ap, opus_int32);
-         if (value<0 || value>=st->mode->nbEBands)
-            goto bad_arg;
-         st->start = value;
-      }
-      break;
-      case CELT_SET_END_BAND_REQUEST:
-      {
-         opus_int32 value = va_arg(ap, opus_int32);
-         if (value<1 || value>st->mode->nbEBands)
-            goto bad_arg;
-         st->end = value;
-      }
-      break;
-      case CELT_SET_CHANNELS_REQUEST:
-      {
-         opus_int32 value = va_arg(ap, opus_int32);
-         if (value<1 || value>2)
-            goto bad_arg;
-         st->stream_channels = value;
-      }
-      break;
-      case CELT_GET_AND_CLEAR_ERROR_REQUEST:
-      {
-         opus_int32 *value = va_arg(ap, opus_int32*);
-         if (value==NULL)
-            goto bad_arg;
-         *value=st->error;
-         st->error = 0;
-      }
-      break;
-      case OPUS_GET_LOOKAHEAD_REQUEST:
-      {
-         opus_int32 *value = va_arg(ap, opus_int32*);
-         if (value==NULL)
-            goto bad_arg;
-         *value = st->overlap/st->downsample;
-      }
-      break;
-      case OPUS_RESET_STATE:
-      {
-         int i;
-         opus_val16 *lpc, *oldBandE, *oldLogE, *oldLogE2;
-         lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+st->overlap)*st->channels);
-         oldBandE = lpc+st->channels*LPC_ORDER;
-         oldLogE = oldBandE + 2*st->mode->nbEBands;
-         oldLogE2 = oldLogE + 2*st->mode->nbEBands;
-         OPUS_CLEAR((char*)&st->DECODER_RESET_START,
-               opus_custom_decoder_get_size(st->mode, st->channels)-
-               ((char*)&st->DECODER_RESET_START - (char*)st));
-         for (i=0;i<2*st->mode->nbEBands;i++)
-            oldLogE[i]=oldLogE2[i]=-QCONST16(28.f,DB_SHIFT);
-      }
-      break;
-      case OPUS_GET_PITCH_REQUEST:
-      {
-         opus_int32 *value = va_arg(ap, opus_int32*);
-         if (value==NULL)
-            goto bad_arg;
-         *value = st->postfilter_period;
-      }
-      break;
-      case CELT_GET_MODE_REQUEST:
-      {
-         const CELTMode ** value = va_arg(ap, const CELTMode**);
-         if (value==0)
-            goto bad_arg;
-         *value=st->mode;
-      }
-      break;
-      case CELT_SET_SIGNALLING_REQUEST:
-      {
-         opus_int32 value = va_arg(ap, opus_int32);
-         st->signalling = value;
-      }
-      break;
-      case OPUS_GET_FINAL_RANGE_REQUEST:
-      {
-         opus_uint32 * value = va_arg(ap, opus_uint32 *);
-         if (value==0)
-            goto bad_arg;
-         *value=st->rng;
-      }
-      break;
-      default:
-         goto bad_request;
-   }
-   va_end(ap);
-   return OPUS_OK;
-bad_arg:
-   va_end(ap);
-   return OPUS_BAD_ARG;
-bad_request:
-      va_end(ap);
-  return OPUS_UNIMPLEMENTED;
-}
-
 
 
 const char *opus_strerror(int error)
@@ -2895,7 +212,7 @@ const char *opus_strerror(int error)
 
 const char *opus_get_version_string(void)
 {
-    return "libopus " OPUS_VERSION
+    return "libopus " PACKAGE_VERSION
 #ifdef FIXED_POINT
           "-fixed"
 #endif
diff --git a/celt/celt.h b/celt/celt.h
index 218cd88..cdb76c8 100644
--- a/celt/celt.h
+++ b/celt/celt.h
@@ -50,7 +50,19 @@ extern "C" {
 #define CELTDecoder OpusCustomDecoder
 #define CELTMode OpusCustomMode
 
-#define _celt_check_mode_ptr_ptr(ptr) ((ptr) + ((ptr) - (const CELTMode**)(ptr)))
+typedef struct {
+   int valid;
+   float tonality;
+   float tonality_slope;
+   float noisiness;
+   float activity;
+   float music_prob;
+   int        bandwidth;
+}AnalysisInfo;
+
+#define __celt_check_mode_ptr_ptr(ptr) ((ptr) + ((ptr) - (const CELTMode**)(ptr)))
+
+#define __celt_check_analysis_ptr(ptr) ((ptr) + ((ptr) - (const AnalysisInfo*)(ptr)))
 
 /* Encoder/decoder Requests */
 
@@ -81,12 +93,24 @@ extern "C" {
 
 #define CELT_GET_MODE_REQUEST    10015
 /** Get the CELTMode used by an encoder or decoder */
-#define CELT_GET_MODE(x) CELT_GET_MODE_REQUEST, _celt_check_mode_ptr_ptr(x)
+#define CELT_GET_MODE(x) CELT_GET_MODE_REQUEST, __celt_check_mode_ptr_ptr(x)
 
 #define CELT_SET_SIGNALLING_REQUEST    10016
 #define CELT_SET_SIGNALLING(x) CELT_SET_SIGNALLING_REQUEST, __opus_check_int(x)
 
+#define CELT_SET_TONALITY_REQUEST    10018
+#define CELT_SET_TONALITY(x) CELT_SET_TONALITY_REQUEST, __opus_check_int(x)
+#define CELT_SET_TONALITY_SLOPE_REQUEST    10020
+#define CELT_SET_TONALITY_SLOPE(x) CELT_SET_TONALITY_SLOPE_REQUEST, __opus_check_int(x)
+
+#define CELT_SET_ANALYSIS_REQUEST    10022
+#define CELT_SET_ANALYSIS(x) CELT_SET_ANALYSIS_REQUEST, __celt_check_analysis_ptr(x)
 
+#define OPUS_SET_LFE_REQUEST    10024
+#define OPUS_SET_LFE(x) OPUS_SET_LFE_REQUEST, __opus_check_int(x)
+
+#define OPUS_SET_ENERGY_MASK_REQUEST    10026
+#define OPUS_SET_ENERGY_MASK(x) OPUS_SET_ENERGY_MASK_REQUEST, __opus_check_val16_ptr(x)
 
 /* Encoder stuff */
 
@@ -110,6 +134,78 @@ int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned cha
 #define celt_encoder_ctl opus_custom_encoder_ctl
 #define celt_decoder_ctl opus_custom_decoder_ctl
 
+
+#ifdef CUSTOM_MODES
+#define OPUS_CUSTOM_NOSTATIC
+#else
+#define OPUS_CUSTOM_NOSTATIC static inline
+#endif
+
+static const unsigned char trim_icdf[11] = {126, 124, 119, 109, 87, 41, 19, 9, 4, 2, 0};
+/* Probs: NONE: 21.875%, LIGHT: 6.25%, NORMAL: 65.625%, AGGRESSIVE: 6.25% */
+static const unsigned char spread_icdf[4] = {25, 23, 2, 0};
+
+static const unsigned char tapset_icdf[3]={2,1,0};
+
+#ifdef CUSTOM_MODES
+static const unsigned char toOpusTable[20] = {
+      0xE0, 0xE8, 0xF0, 0xF8,
+      0xC0, 0xC8, 0xD0, 0xD8,
+      0xA0, 0xA8, 0xB0, 0xB8,
+      0x00, 0x00, 0x00, 0x00,
+      0x80, 0x88, 0x90, 0x98,
+};
+
+static const unsigned char fromOpusTable[16] = {
+      0x80, 0x88, 0x90, 0x98,
+      0x40, 0x48, 0x50, 0x58,
+      0x20, 0x28, 0x30, 0x38,
+      0x00, 0x08, 0x10, 0x18
+};
+
+static inline int toOpus(unsigned char c)
+{
+   int ret=0;
+   if (c<0xA0)
+      ret = toOpusTable[c>>3];
+   if (ret == 0)
+      return -1;
+   else
+      return ret|(c&0x7);
+}
+
+static inline int fromOpus(unsigned char c)
+{
+   if (c<0x80)
+      return -1;
+   else
+      return fromOpusTable[(c>>3)-16] | (c&0x7);
+}
+#endif /* CUSTOM_MODES */
+
+#define COMBFILTER_MAXPERIOD 1024
+#define COMBFILTER_MINPERIOD 15
+
+extern const signed char tf_select_table[4][8];
+
+int resampling_factor(opus_int32 rate);
+
+void preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT inp,
+                        int N, int CC, int upsample, const opus_val16 *coef, celt_sig *mem, int clip);
+
+void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
+      opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
+      const opus_val16 *window, int overlap);
+
+void init_caps(const CELTMode *m,int *cap,int LM,int C);
+
+#ifdef RESYNTH
+void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, celt_sig * OPUS_RESTRICT scratch);
+
+void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X,
+      celt_sig * OPUS_RESTRICT out_mem[], int C, int LM);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
new file mode 100644
index 0000000..4424b97
--- /dev/null
+++ b/celt/celt_decoder.c
@@ -0,0 +1,1195 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2010 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define CELT_DECODER_C
+
+#include "cpu_support.h"
+#include "os_support.h"
+#include "mdct.h"
+#include <math.h>
+#include "celt.h"
+#include "pitch.h"
+#include "bands.h"
+#include "modes.h"
+#include "entcode.h"
+#include "quant_bands.h"
+#include "rate.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "float_cast.h"
+#include <stdarg.h>
+#include "celt_lpc.h"
+#include "vq.h"
+
+/**********************************************************************/
+/*                                                                    */
+/*                             DECODER                                */
+/*                                                                    */
+/**********************************************************************/
+#define DECODE_BUFFER_SIZE 2048
+
+/** Decoder state
+ @brief Decoder state
+ */
+struct OpusCustomDecoder {
+   const OpusCustomMode *mode;
+   int overlap;
+   int channels;
+   int stream_channels;
+
+   int downsample;
+   int start, end;
+   int signalling;
+   int arch;
+
+   /* Everything beyond this point gets cleared on a reset */
+#define DECODER_RESET_START rng
+
+   opus_uint32 rng;
+   int error;
+   int last_pitch_index;
+   int loss_count;
+   int postfilter_period;
+   int postfilter_period_old;
+   opus_val16 postfilter_gain;
+   opus_val16 postfilter_gain_old;
+   int postfilter_tapset;
+   int postfilter_tapset_old;
+
+   celt_sig preemph_memD[2];
+
+   celt_sig _decode_mem[1]; /* Size = channels*(DECODE_BUFFER_SIZE+mode->overlap) */
+   /* opus_val16 lpc[],  Size = channels*LPC_ORDER */
+   /* opus_val16 oldEBands[], Size = 2*mode->nbEBands */
+   /* opus_val16 oldLogE[], Size = 2*mode->nbEBands */
+   /* opus_val16 oldLogE2[], Size = 2*mode->nbEBands */
+   /* opus_val16 backgroundLogE[], Size = 2*mode->nbEBands */
+};
+
+int celt_decoder_get_size(int channels)
+{
+   const CELTMode *mode = opus_custom_mode_create(48000, 960, NULL);
+   return opus_custom_decoder_get_size(mode, channels);
+}
+
+OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_get_size(const CELTMode *mode, int channels)
+{
+   int size = sizeof(struct CELTDecoder)
+            + (channels*(DECODE_BUFFER_SIZE+mode->overlap)-1)*sizeof(celt_sig)
+            + channels*LPC_ORDER*sizeof(opus_val16)
+            + 4*2*mode->nbEBands*sizeof(opus_val16);
+   return size;
+}
+
+#ifdef CUSTOM_MODES
+CELTDecoder *opus_custom_decoder_create(const CELTMode *mode, int channels, int *error)
+{
+   int ret;
+   CELTDecoder *st = (CELTDecoder *)opus_alloc(opus_custom_decoder_get_size(mode, channels));
+   ret = opus_custom_decoder_init(st, mode, channels);
+   if (ret != OPUS_OK)
+   {
+      opus_custom_decoder_destroy(st);
+      st = NULL;
+   }
+   if (error)
+      *error = ret;
+   return st;
+}
+#endif /* CUSTOM_MODES */
+
+int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels)
+{
+   int ret;
+   ret = opus_custom_decoder_init(st, opus_custom_mode_create(48000, 960, NULL), channels);
+   if (ret != OPUS_OK)
+      return ret;
+   st->downsample = resampling_factor(sampling_rate);
+   if (st->downsample==0)
+      return OPUS_BAD_ARG;
+   else
+      return OPUS_OK;
+}
+
+OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_init(CELTDecoder *st, const CELTMode *mode, int channels)
+{
+   if (channels < 0 || channels > 2)
+      return OPUS_BAD_ARG;
+
+   if (st==NULL)
+      return OPUS_ALLOC_FAIL;
+
+   OPUS_CLEAR((char*)st, opus_custom_decoder_get_size(mode, channels));
+
+   st->mode = mode;
+   st->overlap = mode->overlap;
+   st->stream_channels = st->channels = channels;
+
+   st->downsample = 1;
+   st->start = 0;
+   st->end = st->mode->effEBands;
+   st->signalling = 1;
+   st->arch = opus_select_arch();
+
+   st->loss_count = 0;
+
+   opus_custom_decoder_ctl(st, OPUS_RESET_STATE);
+
+   return OPUS_OK;
+}
+
+#ifdef CUSTOM_MODES
+void opus_custom_decoder_destroy(CELTDecoder *st)
+{
+   opus_free(st);
+}
+#endif /* CUSTOM_MODES */
+
+static inline opus_val16 SIG2WORD16(celt_sig x)
+{
+#ifdef FIXED_POINT
+   x = PSHR32(x, SIG_SHIFT);
+   x = MAX32(x, -32768);
+   x = MIN32(x, 32767);
+   return EXTRACT16(x);
+#else
+   return (opus_val16)x;
+#endif
+}
+
+#ifndef RESYNTH
+static
+#endif
+void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, celt_sig * OPUS_RESTRICT scratch)
+{
+   int c;
+   int Nd;
+   int apply_downsampling=0;
+   opus_val16 coef0;
+
+   coef0 = coef[0];
+   Nd = N/downsample;
+   c=0; do {
+      int j;
+      celt_sig * OPUS_RESTRICT x;
+      opus_val16  * OPUS_RESTRICT y;
+      celt_sig m = mem[c];
+      x =in[c];
+      y = pcm+c;
+#ifdef CUSTOM_MODES
+      if (coef[1] != 0)
+      {
+         opus_val16 coef1 = coef[1];
+         opus_val16 coef3 = coef[3];
+         for (j=0;j<N;j++)
+         {
+            celt_sig tmp = x[j] + m;
+            m = MULT16_32_Q15(coef0, tmp)
+                          - MULT16_32_Q15(coef1, x[j]);
+            tmp = SHL32(MULT16_32_Q15(coef3, tmp), 2);
+            scratch[j] = tmp;
+         }
+         apply_downsampling=1;
+      } else
+#endif
+      if (downsample>1)
+      {
+         /* Shortcut for the standard (non-custom modes) case */
+         for (j=0;j<N;j++)
+         {
+            celt_sig tmp = x[j] + m;
+            m = MULT16_32_Q15(coef0, tmp);
+            scratch[j] = tmp;
+         }
+         apply_downsampling=1;
+      } else {
+         /* Shortcut for the standard (non-custom modes) case */
+         for (j=0;j<N;j++)
+         {
+            celt_sig tmp = x[j] + m + VERY_SMALL;
+            m = MULT16_32_Q15(coef0, tmp);
+            y[j*C] = SCALEOUT(SIG2WORD16(tmp));
+         }
+      }
+      mem[c] = m;
+
+      if (apply_downsampling)
+      {
+         /* Perform down-sampling */
+         for (j=0;j<Nd;j++)
+            y[j*C] = SCALEOUT(SIG2WORD16(scratch[j*downsample]));
+      }
+   } while (++c<C);
+}
+
+/** Compute the IMDCT and apply window for all sub-frames and
+    all channels in a frame */
+#ifndef RESYNTH
+static
+#endif
+void compute_inv_mdcts(const CELTMode *mode, int shortBlocks, celt_sig *X,
+      celt_sig * OPUS_RESTRICT out_mem[], int C, int LM)
+{
+   int b, c;
+   int B;
+   int N;
+   int shift;
+   const int overlap = OVERLAP(mode);
+
+   if (shortBlocks)
+   {
+      B = shortBlocks;
+      N = mode->shortMdctSize;
+      shift = mode->maxLM;
+   } else {
+      B = 1;
+      N = mode->shortMdctSize<<LM;
+      shift = mode->maxLM-LM;
+   }
+   c=0; do {
+      /* IMDCT on the interleaved the sub-frames, overlap-add is performed by the IMDCT */
+      for (b=0;b<B;b++)
+         clt_mdct_backward(&mode->mdct, &X[b+c*N*B], out_mem[c]+N*b, mode->window, overlap, shift, B);
+   } while (++c<C);
+}
+
+static void tf_decode(int start, int end, int isTransient, int *tf_res, int LM, ec_dec *dec)
+{
+   int i, curr, tf_select;
+   int tf_select_rsv;
+   int tf_changed;
+   int logp;
+   opus_uint32 budget;
+   opus_uint32 tell;
+
+   budget = dec->storage*8;
+   tell = ec_tell(dec);
+   logp = isTransient ? 2 : 4;
+   tf_select_rsv = LM>0 && tell+logp+1<=budget;
+   budget -= tf_select_rsv;
+   tf_changed = curr = 0;
+   for (i=start;i<end;i++)
+   {
+      if (tell+logp<=budget)
+      {
+         curr ^= ec_dec_bit_logp(dec, logp);
+         tell = ec_tell(dec);
+         tf_changed |= curr;
+      }
+      tf_res[i] = curr;
+      logp = isTransient ? 4 : 5;
+   }
+   tf_select = 0;
+   if (tf_select_rsv &&
+     tf_select_table[LM][4*isTransient+0+tf_changed] !=
+     tf_select_table[LM][4*isTransient+2+tf_changed])
+   {
+      tf_select = ec_dec_bit_logp(dec, 1);
+   }
+   for (i=start;i<end;i++)
+   {
+      tf_res[i] = tf_select_table[LM][4*isTransient+2*tf_select+tf_res[i]];
+   }
+}
+
+/* The maximum pitch lag to allow in the pitch-based PLC. It's possible to save
+   CPU time in the PLC pitch search by making this smaller than MAX_PERIOD. The
+   current value corresponds to a pitch of 66.67 Hz. */
+#define PLC_PITCH_LAG_MAX (720)
+/* The minimum pitch lag to allow in the pitch-based PLC. This corresponds to a
+   pitch of 480 Hz. */
+#define PLC_PITCH_LAG_MIN (100)
+
+static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_RESTRICT pcm, int N, int LM)
+{
+   int c;
+   int i;
+   const int C = st->channels;
+   celt_sig *decode_mem[2];
+   celt_sig *out_syn[2];
+   opus_val16 *lpc;
+   opus_val16 *oldBandE, *oldLogE, *oldLogE2, *backgroundLogE;
+   const OpusCustomMode *mode;
+   int nbEBands;
+   int overlap;
+   int start;
+   int downsample;
+   int loss_count;
+   int noise_based;
+   const opus_int16 *eBands;
+   VARDECL(celt_sig, scratch);
+   SAVE_STACK;
+
+   mode = st->mode;
+   nbEBands = mode->nbEBands;
+   overlap = mode->overlap;
+   eBands = mode->eBands;
+
+   c=0; do {
+      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap);
+      out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N;
+   } while (++c<C);
+   lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*C);
+   oldBandE = lpc+C*LPC_ORDER;
+   oldLogE = oldBandE + 2*nbEBands;
+   oldLogE2 = oldLogE + 2*nbEBands;
+   backgroundLogE = oldLogE2  + 2*nbEBands;
+
+   loss_count = st->loss_count;
+   start = st->start;
+   downsample = st->downsample;
+   noise_based = loss_count >= 5 || start != 0;
+   ALLOC(scratch, noise_based?N*C:N, celt_sig);
+   if (noise_based)
+   {
+      /* Noise-based PLC/CNG */
+      celt_sig *freq;
+      VARDECL(celt_norm, X);
+      opus_uint32 seed;
+      opus_val16 *plcLogE;
+      int end;
+      int effEnd;
+
+      end = st->end;
+      effEnd = IMAX(start, IMIN(end, mode->effEBands));
+
+      /* Share the interleaved signal MDCT coefficient buffer with the
+         deemphasis scratch buffer. */
+      freq = scratch;
+      ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
+
+      if (loss_count >= 5)
+         plcLogE = backgroundLogE;
+      else {
+         /* Energy decay */
+         opus_val16 decay = loss_count==0 ?
+               QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
+         c=0; do
+         {
+            for (i=start;i<end;i++)
+               oldBandE[c*nbEBands+i] -= decay;
+         } while (++c<C);
+         plcLogE = oldBandE;
+      }
+      seed = st->rng;
+      for (c=0;c<C;c++)
+      {
+         for (i=start;i<effEnd;i++)
+         {
+            int j;
+            int boffs;
+            int blen;
+            boffs = N*c+(eBands[i]<<LM);
+            blen = (eBands[i+1]-eBands[i])<<LM;
+            for (j=0;j<blen;j++)
+            {
+               seed = celt_lcg_rand(seed);
+               X[boffs+j] = (celt_norm)((opus_int32)seed>>20);
+            }
+            renormalise_vector(X+boffs, blen, Q15ONE);
+         }
+      }
+      st->rng = seed;
+
+      denormalise_bands(mode, X, freq, plcLogE, start, effEnd, C, 1<<LM);
+
+      c=0; do {
+         int bound = eBands[effEnd]<<LM;
+         if (downsample!=1)
+            bound = IMIN(bound, N/downsample);
+         for (i=bound;i<N;i++)
+            freq[c*N+i] = 0;
+      } while (++c<C);
+      c=0; do {
+         OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
+               DECODE_BUFFER_SIZE-N+(overlap>>1));
+      } while (++c<C);
+      compute_inv_mdcts(mode, 0, freq, out_syn, C, LM);
+   } else {
+      /* Pitch-based PLC */
+      const opus_val16 *window;
+      opus_val16 fade = Q15ONE;
+      int pitch_index;
+      VARDECL(opus_val32, etmp);
+      VARDECL(opus_val16, exc);
+
+      if (loss_count == 0)
+      {
+         VARDECL( opus_val16, lp_pitch_buf );
+         ALLOC( lp_pitch_buf, DECODE_BUFFER_SIZE>>1, opus_val16 );
+         pitch_downsample(decode_mem, lp_pitch_buf, DECODE_BUFFER_SIZE, C);
+         pitch_search(lp_pitch_buf+(PLC_PITCH_LAG_MAX>>1), lp_pitch_buf,
+               DECODE_BUFFER_SIZE-PLC_PITCH_LAG_MAX,
+               PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index);
+         pitch_index = PLC_PITCH_LAG_MAX-pitch_index;
+         st->last_pitch_index = pitch_index;
+      } else {
+         pitch_index = st->last_pitch_index;
+         fade = QCONST16(.8f,15);
+      }
+
+      ALLOC(etmp, overlap, opus_val32);
+      ALLOC(exc, MAX_PERIOD, opus_val16);
+      window = mode->window;
+      c=0; do {
+         opus_val16 decay;
+         opus_val16 attenuation;
+         opus_val32 S1=0;
+         celt_sig *buf;
+         int extrapolation_offset;
+         int extrapolation_len;
+         int exc_length;
+         int j;
+
+         buf = decode_mem[c];
+         for (i=0;i<MAX_PERIOD;i++) {
+            exc[i] = ROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD+i], SIG_SHIFT);
+         }
+
+         if (loss_count == 0)
+         {
+            opus_val32 ac[LPC_ORDER+1];
+            /* Compute LPC coefficients for the last MAX_PERIOD samples before
+               the first loss so we can work in the excitation-filter domain. */
+            _celt_autocorr(exc, ac, window, overlap, LPC_ORDER, MAX_PERIOD);
+            /* Add a noise floor of -40 dB. */
+#ifdef FIXED_POINT
+            ac[0] += SHR32(ac[0],13);
+#else
+            ac[0] *= 1.0001f;
+#endif
+            /* Use lag windowing to stabilize the Levinson-Durbin recursion. */
+            for (i=1;i<=LPC_ORDER;i++)
+            {
+               /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/
+#ifdef FIXED_POINT
+               ac[i] -= MULT16_32_Q15(2*i*i, ac[i]);
+#else
+               ac[i] -= ac[i]*(0.008f*0.008f)*i*i;
+#endif
+            }
+            _celt_lpc(lpc+c*LPC_ORDER, ac, LPC_ORDER);
+         }
+         /* We want the excitation for 2 pitch periods in order to look for a
+            decaying signal, but we can't get more than MAX_PERIOD. */
+         exc_length = IMIN(2*pitch_index, MAX_PERIOD);
+         /* Initialize the LPC history with the samples just before the start
+            of the region for which we're computing the excitation. */
+         {
+            opus_val16 lpc_mem[LPC_ORDER];
+            for (i=0;i<LPC_ORDER;i++)
+            {
+               lpc_mem[i] =
+                     ROUND16(buf[DECODE_BUFFER_SIZE-exc_length-1-i], SIG_SHIFT);
+            }
+            /* Compute the excitation for exc_length samples before the loss. */
+            celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
+                  exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem);
+         }
+
+         /* Check if the waveform is decaying, and if so how fast.
+            We do this to avoid adding energy when concealing in a segment
+            with decaying energy. */
+         {
+            opus_val32 E1=1, E2=1;
+            int decay_length;
+#ifdef FIXED_POINT
+            int shift = IMAX(0,2*celt_zlog2(celt_maxabs16(&exc[MAX_PERIOD-exc_length], exc_length))-20);
+#endif
+            decay_length = exc_length>>1;
+            for (i=0;i<decay_length;i++)
+            {
+               opus_val16 e;
+               e = exc[MAX_PERIOD-decay_length+i];
+               E1 += SHR32(MULT16_16(e, e), shift);
+               e = exc[MAX_PERIOD-2*decay_length+i];
+               E2 += SHR32(MULT16_16(e, e), shift);
+            }
+            E1 = MIN32(E1, E2);
+            decay = celt_sqrt(frac_div32(SHR32(E1, 1), E2));
+         }
+
+         /* Move the decoder memory one frame to the left to give us room to
+            add the data for the new frame. We ignore the overlap that extends
+            past the end of the buffer, because we aren't going to use it. */
+         OPUS_MOVE(buf, buf+N, DECODE_BUFFER_SIZE-N);
+
+         /* Extrapolate from the end of the excitation with a period of
+            "pitch_index", scaling down each period by an additional factor of
+            "decay". */
+         extrapolation_offset = MAX_PERIOD-pitch_index;
+         /* We need to extrapolate enough samples to cover a complete MDCT
+            window (including overlap/2 samples on both sides). */
+         extrapolation_len = N+overlap;
+         /* We also apply fading if this is not the first loss. */
+         attenuation = MULT16_16_Q15(fade, decay);
+         for (i=j=0;i<extrapolation_len;i++,j++)
+         {
+            opus_val16 tmp;
+            if (j >= pitch_index) {
+               j -= pitch_index;
+               attenuation = MULT16_16_Q15(attenuation, decay);
+            }
+            buf[DECODE_BUFFER_SIZE-N+i] =
+                  SHL32(EXTEND32(MULT16_16_Q15(attenuation,
+                        exc[extrapolation_offset+j])), SIG_SHIFT);
+            /* Compute the energy of the previously decoded signal whose
+               excitation we're copying. */
+            tmp = ROUND16(
+                  buf[DECODE_BUFFER_SIZE-MAX_PERIOD-N+extrapolation_offset+j],
+                  SIG_SHIFT);
+            S1 += SHR32(MULT16_16(tmp, tmp), 8);
+         }
+
+         {
+            opus_val16 lpc_mem[LPC_ORDER];
+            /* Copy the last decoded samples (prior to the overlap region) to
+               synthesis filter memory so we can have a continuous signal. */
+            for (i=0;i<LPC_ORDER;i++)
+               lpc_mem[i] = ROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT);
+            /* Apply the synthesis filter to convert the excitation back into
+               the signal domain. */
+            celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER,
+                  buf+DECODE_BUFFER_SIZE-N, extrapolation_len, LPC_ORDER,
+                  lpc_mem);
+         }
+
+         /* Check if the synthesis energy is higher than expected, which can
+            happen with the signal changes during our window. If so,
+            attenuate. */
+         {
+            opus_val32 S2=0;
+            for (i=0;i<extrapolation_len;i++)
+            {
+               opus_val16 tmp = ROUND16(buf[DECODE_BUFFER_SIZE-N+i], SIG_SHIFT);
+               S2 += SHR32(MULT16_16(tmp, tmp), 8);
+            }
+            /* This checks for an "explosion" in the synthesis. */
+#ifdef FIXED_POINT
+            if (!(S1 > SHR32(S2,2)))
+#else
+            /* The float test is written this way to catch NaNs in the output
+               of the IIR filter at the same time. */
+            if (!(S1 > 0.2f*S2))
+#endif
+            {
+               for (i=0;i<extrapolation_len;i++)
+                  buf[DECODE_BUFFER_SIZE-N+i] = 0;
+            } else if (S1 < S2)
+            {
+               opus_val16 ratio = celt_sqrt(frac_div32(SHR32(S1,1)+1,S2+1));
+               for (i=0;i<overlap;i++)
+               {
+                  opus_val16 tmp_g = Q15ONE
+                        - MULT16_16_Q15(window[i], Q15ONE-ratio);
+                  buf[DECODE_BUFFER_SIZE-N+i] =
+                        MULT16_32_Q15(tmp_g, buf[DECODE_BUFFER_SIZE-N+i]);
+               }
+               for (i=overlap;i<extrapolation_len;i++)
+               {
+                  buf[DECODE_BUFFER_SIZE-N+i] =
+                        MULT16_32_Q15(ratio, buf[DECODE_BUFFER_SIZE-N+i]);
+               }
+            }
+         }
+
+         /* Apply the pre-filter to the MDCT overlap for the next frame because
+            the post-filter will be re-applied in the decoder after the MDCT
+            overlap. */
+         comb_filter(etmp, buf+DECODE_BUFFER_SIZE,
+              st->postfilter_period, st->postfilter_period, overlap,
+              -st->postfilter_gain, -st->postfilter_gain,
+              st->postfilter_tapset, st->postfilter_tapset, NULL, 0);
+
+         /* Simulate TDAC on the concealed audio so that it blends with the
+            MDCT of the next frame. */
+         for (i=0;i<overlap/2;i++)
+         {
+            buf[DECODE_BUFFER_SIZE+i] =
+               MULT16_32_Q15(window[i], etmp[overlap-1-i])
+               + MULT16_32_Q15(window[overlap-i-1], etmp[i]);
+         }
+      } while (++c<C);
+   }
+
+   deemphasis(out_syn, pcm, N, C, downsample,
+         mode->preemph, st->preemph_memD, scratch);
+
+   st->loss_count = loss_count+1;
+
+   RESTORE_STACK;
+}
+
+int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_val16 * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec)
+{
+   int c, i, N;
+   int spread_decision;
+   opus_int32 bits;
+   ec_dec _dec;
+   VARDECL(celt_sig, freq);
+   VARDECL(celt_norm, X);
+   VARDECL(int, fine_quant);
+   VARDECL(int, pulses);
+   VARDECL(int, cap);
+   VARDECL(int, offsets);
+   VARDECL(int, fine_priority);
+   VARDECL(int, tf_res);
+   VARDECL(unsigned char, collapse_masks);
+   celt_sig *out_mem[2];
+   celt_sig *decode_mem[2];
+   celt_sig *out_syn[2];
+   opus_val16 *lpc;
+   opus_val16 *oldBandE, *oldLogE, *oldLogE2, *backgroundLogE;
+
+   int shortBlocks;
+   int isTransient;
+   int intra_ener;
+   const int CC = st->channels;
+   int LM, M;
+   int effEnd;
+   int codedBands;
+   int alloc_trim;
+   int postfilter_pitch;
+   opus_val16 postfilter_gain;
+   int intensity=0;
+   int dual_stereo=0;
+   opus_int32 total_bits;
+   opus_int32 balance;
+   opus_int32 tell;
+   int dynalloc_logp;
+   int postfilter_tapset;
+   int anti_collapse_rsv;
+   int anti_collapse_on=0;
+   int silence;
+   int C = st->stream_channels;
+   const OpusCustomMode *mode;
+   int nbEBands;
+   int overlap;
+   const opus_int16 *eBands;
+   ALLOC_STACK;
+
+   mode = st->mode;
+   nbEBands = mode->nbEBands;
+   overlap = mode->overlap;
+   eBands = mode->eBands;
+   frame_size *= st->downsample;
+
+   c=0; do {
+      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap);
+      out_mem[c] = decode_mem[c]+DECODE_BUFFER_SIZE-MAX_PERIOD;
+   } while (++c<CC);
+   lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*CC);
+   oldBandE = lpc+CC*LPC_ORDER;
+   oldLogE = oldBandE + 2*nbEBands;
+   oldLogE2 = oldLogE + 2*nbEBands;
+   backgroundLogE = oldLogE2  + 2*nbEBands;
+
+#ifdef CUSTOM_MODES
+   if (st->signalling && data!=NULL)
+   {
+      int data0=data[0];
+      /* Convert "standard mode" to Opus header */
+      if (mode->Fs==48000 && mode->shortMdctSize==120)
+      {
+         data0 = fromOpus(data0);
+         if (data0<0)
+            return OPUS_INVALID_PACKET;
+      }
+      st->end = IMAX(1, mode->effEBands-2*(data0>>5));
+      LM = (data0>>3)&0x3;
+      C = 1 + ((data0>>2)&0x1);
+      data++;
+      len--;
+      if (LM>mode->maxLM)
+         return OPUS_INVALID_PACKET;
+      if (frame_size < mode->shortMdctSize<<LM)
+         return OPUS_BUFFER_TOO_SMALL;
+      else
+         frame_size = mode->shortMdctSize<<LM;
+   } else {
+#else
+   {
+#endif
+      for (LM=0;LM<=mode->maxLM;LM++)
+         if (mode->shortMdctSize<<LM==frame_size)
+            break;
+      if (LM>mode->maxLM)
+         return OPUS_BAD_ARG;
+   }
+   M=1<<LM;
+
+   if (len<0 || len>1275 || pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   N = M*mode->shortMdctSize;
+
+   effEnd = st->end;
+   if (effEnd > mode->effEBands)
+      effEnd = mode->effEBands;
+
+   if (data == NULL || len<=1)
+   {
+      celt_decode_lost(st, pcm, N, LM);
+      RESTORE_STACK;
+      return frame_size/st->downsample;
+   }
+
+   if (dec == NULL)
+   {
+      ec_dec_init(&_dec,(unsigned char*)data,len);
+      dec = &_dec;
+   }
+
+   if (C==1)
+   {
+      for (i=0;i<nbEBands;i++)
+         oldBandE[i]=MAX16(oldBandE[i],oldBandE[nbEBands+i]);
+   }
+
+   total_bits = len*8;
+   tell = ec_tell(dec);
+
+   if (tell >= total_bits)
+      silence = 1;
+   else if (tell==1)
+      silence = ec_dec_bit_logp(dec, 15);
+   else
+      silence = 0;
+   if (silence)
+   {
+      /* Pretend we've read all the remaining bits */
+      tell = len*8;
+      dec->nbits_total+=tell-ec_tell(dec);
+   }
+
+   postfilter_gain = 0;
+   postfilter_pitch = 0;
+   postfilter_tapset = 0;
+   if (st->start==0 && tell+16 <= total_bits)
+   {
+      if(ec_dec_bit_logp(dec, 1))
+      {
+         int qg, octave;
+         octave = ec_dec_uint(dec, 6);
+         postfilter_pitch = (16<<octave)+ec_dec_bits(dec, 4+octave)-1;
+         qg = ec_dec_bits(dec, 3);
+         if (ec_tell(dec)+2<=total_bits)
+            postfilter_tapset = ec_dec_icdf(dec, tapset_icdf, 2);
+         postfilter_gain = QCONST16(.09375f,15)*(qg+1);
+      }
+      tell = ec_tell(dec);
+   }
+
+   if (LM > 0 && tell+3 <= total_bits)
+   {
+      isTransient = ec_dec_bit_logp(dec, 3);
+      tell = ec_tell(dec);
+   }
+   else
+      isTransient = 0;
+
+   if (isTransient)
+      shortBlocks = M;
+   else
+      shortBlocks = 0;
+
+   /* Decode the global flags (first symbols in the stream) */
+   intra_ener = tell+3<=total_bits ? ec_dec_bit_logp(dec, 3) : 0;
+   /* Get band energies */
+   unquant_coarse_energy(mode, st->start, st->end, oldBandE,
+         intra_ener, dec, C, LM);
+
+   ALLOC(tf_res, nbEBands, int);
+   tf_decode(st->start, st->end, isTransient, tf_res, LM, dec);
+
+   tell = ec_tell(dec);
+   spread_decision = SPREAD_NORMAL;
+   if (tell+4 <= total_bits)
+      spread_decision = ec_dec_icdf(dec, spread_icdf, 5);
+
+   ALLOC(cap, nbEBands, int);
+
+   init_caps(mode,cap,LM,C);
+
+   ALLOC(offsets, nbEBands, int);
+
+   dynalloc_logp = 6;
+   total_bits<<=BITRES;
+   tell = ec_tell_frac(dec);
+   for (i=st->start;i<st->end;i++)
+   {
+      int width, quanta;
+      int dynalloc_loop_logp;
+      int boost;
+      width = C*(eBands[i+1]-eBands[i])<<LM;
+      /* quanta is 6 bits, but no more than 1 bit/sample
+         and no less than 1/8 bit/sample */
+      quanta = IMIN(width<<BITRES, IMAX(6<<BITRES, width));
+      dynalloc_loop_logp = dynalloc_logp;
+      boost = 0;
+      while (tell+(dynalloc_loop_logp<<BITRES) < total_bits && boost < cap[i])
+      {
+         int flag;
+         flag = ec_dec_bit_logp(dec, dynalloc_loop_logp);
+         tell = ec_tell_frac(dec);
+         if (!flag)
+            break;
+         boost += quanta;
+         total_bits -= quanta;
+         dynalloc_loop_logp = 1;
+      }
+      offsets[i] = boost;
+      /* Making dynalloc more likely */
+      if (boost>0)
+         dynalloc_logp = IMAX(2, dynalloc_logp-1);
+   }
+
+   ALLOC(fine_quant, nbEBands, int);
+   alloc_trim = tell+(6<<BITRES) <= total_bits ?
+         ec_dec_icdf(dec, trim_icdf, 7) : 5;
+
+   bits = (((opus_int32)len*8)<<BITRES) - ec_tell_frac(dec) - 1;
+   anti_collapse_rsv = isTransient&&LM>=2&&bits>=((LM+2)<<BITRES) ? (1<<BITRES) : 0;
+   bits -= anti_collapse_rsv;
+
+   ALLOC(pulses, nbEBands, int);
+   ALLOC(fine_priority, nbEBands, int);
+
+   codedBands = compute_allocation(mode, st->start, st->end, offsets, cap,
+         alloc_trim, &intensity, &dual_stereo, bits, &balance, pulses,
+         fine_quant, fine_priority, C, LM, dec, 0, 0, 0);
+
+   unquant_fine_energy(mode, st->start, st->end, oldBandE, fine_quant, dec, C);
+
+   /* Decode fixed codebook */
+   ALLOC(collapse_masks, C*nbEBands, unsigned char);
+   ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
+
+   quant_all_bands(0, mode, st->start, st->end, X, C==2 ? X+N : NULL, collapse_masks,
+         NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res,
+         len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng);
+
+   if (anti_collapse_rsv > 0)
+   {
+      anti_collapse_on = ec_dec_bits(dec, 1);
+   }
+
+   unquant_energy_finalise(mode, st->start, st->end, oldBandE,
+         fine_quant, fine_priority, len*8-ec_tell(dec), dec, C);
+
+   if (anti_collapse_on)
+      anti_collapse(mode, X, collapse_masks, LM, C, N,
+            st->start, st->end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
+
+   ALLOC(freq, IMAX(CC,C)*N, celt_sig); /**< Interleaved signal MDCTs */
+
+   if (silence)
+   {
+      for (i=0;i<C*nbEBands;i++)
+         oldBandE[i] = -QCONST16(28.f,DB_SHIFT);
+      for (i=0;i<C*N;i++)
+         freq[i] = 0;
+   } else {
+      /* Synthesis */
+      denormalise_bands(mode, X, freq, oldBandE, st->start, effEnd, C, M);
+   }
+   c=0; do {
+      OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap/2);
+   } while (++c<CC);
+
+   c=0; do {
+      int bound = M*eBands[effEnd];
+      if (st->downsample!=1)
+         bound = IMIN(bound, N/st->downsample);
+      for (i=bound;i<N;i++)
+         freq[c*N+i] = 0;
+   } while (++c<C);
+
+   c=0; do {
+      out_syn[c] = out_mem[c]+MAX_PERIOD-N;
+   } while (++c<CC);
+
+   if (CC==2&&C==1)
+   {
+      for (i=0;i<N;i++)
+         freq[N+i] = freq[i];
+   }
+   if (CC==1&&C==2)
+   {
+      for (i=0;i<N;i++)
+         freq[i] = HALF32(ADD32(freq[i],freq[N+i]));
+   }
+
+   /* Compute inverse MDCTs */
+   compute_inv_mdcts(mode, shortBlocks, freq, out_syn, CC, LM);
+
+   c=0; do {
+      st->postfilter_period=IMAX(st->postfilter_period, COMBFILTER_MINPERIOD);
+      st->postfilter_period_old=IMAX(st->postfilter_period_old, COMBFILTER_MINPERIOD);
+      comb_filter(out_syn[c], out_syn[c], st->postfilter_period_old, st->postfilter_period, mode->shortMdctSize,
+            st->postfilter_gain_old, st->postfilter_gain, st->postfilter_tapset_old, st->postfilter_tapset,
+            mode->window, overlap);
+      if (LM!=0)
+         comb_filter(out_syn[c]+mode->shortMdctSize, out_syn[c]+mode->shortMdctSize, st->postfilter_period, postfilter_pitch, N-mode->shortMdctSize,
+               st->postfilter_gain, postfilter_gain, st->postfilter_tapset, postfilter_tapset,
+               mode->window, overlap);
+
+   } while (++c<CC);
+   st->postfilter_period_old = st->postfilter_period;
+   st->postfilter_gain_old = st->postfilter_gain;
+   st->postfilter_tapset_old = st->postfilter_tapset;
+   st->postfilter_period = postfilter_pitch;
+   st->postfilter_gain = postfilter_gain;
+   st->postfilter_tapset = postfilter_tapset;
+   if (LM!=0)
+   {
+      st->postfilter_period_old = st->postfilter_period;
+      st->postfilter_gain_old = st->postfilter_gain;
+      st->postfilter_tapset_old = st->postfilter_tapset;
+   }
+
+   if (C==1) {
+      for (i=0;i<nbEBands;i++)
+         oldBandE[nbEBands+i]=oldBandE[i];
+   }
+
+   /* In case start or end were to change */
+   if (!isTransient)
+   {
+      for (i=0;i<2*nbEBands;i++)
+         oldLogE2[i] = oldLogE[i];
+      for (i=0;i<2*nbEBands;i++)
+         oldLogE[i] = oldBandE[i];
+      for (i=0;i<2*nbEBands;i++)
+         backgroundLogE[i] = MIN16(backgroundLogE[i] + M*QCONST16(0.001f,DB_SHIFT), oldBandE[i]);
+   } else {
+      for (i=0;i<2*nbEBands;i++)
+         oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]);
+   }
+   c=0; do
+   {
+      for (i=0;i<st->start;i++)
+      {
+         oldBandE[c*nbEBands+i]=0;
+         oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
+      }
+      for (i=st->end;i<nbEBands;i++)
+      {
+         oldBandE[c*nbEBands+i]=0;
+         oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
+      }
+   } while (++c<2);
+   st->rng = dec->rng;
+
+   /* We reuse freq[] as scratch space for the de-emphasis */
+   deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, freq);
+   st->loss_count = 0;
+   RESTORE_STACK;
+   if (ec_tell(dec) > 8*len)
+      return OPUS_INTERNAL_ERROR;
+   if(ec_get_error(dec))
+      st->error = 1;
+   return frame_size/st->downsample;
+}
+
+
+#ifdef CUSTOM_MODES
+
+#ifdef FIXED_POINT
+int opus_custom_decode(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int16 * OPUS_RESTRICT pcm, int frame_size)
+{
+   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL);
+}
+
+#ifndef DISABLE_FLOAT_API
+int opus_custom_decode_float(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, float * OPUS_RESTRICT pcm, int frame_size)
+{
+   int j, ret, C, N;
+   VARDECL(opus_int16, out);
+   ALLOC_STACK;
+
+   if (pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   C = st->channels;
+   N = frame_size;
+
+   ALLOC(out, C*N, opus_int16);
+   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL);
+   if (ret>0)
+      for (j=0;j<C*ret;j++)
+         pcm[j]=out[j]*(1.f/32768.f);
+
+   RESTORE_STACK;
+   return ret;
+}
+#endif /* DISABLE_FLOAT_API */
+
+#else
+
+int opus_custom_decode_float(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, float * OPUS_RESTRICT pcm, int frame_size)
+{
+   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL);
+}
+
+int opus_custom_decode(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int16 * OPUS_RESTRICT pcm, int frame_size)
+{
+   int j, ret, C, N;
+   VARDECL(celt_sig, out);
+   ALLOC_STACK;
+
+   if (pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   C = st->channels;
+   N = frame_size;
+   ALLOC(out, C*N, celt_sig);
+
+   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL);
+
+   if (ret>0)
+      for (j=0;j<C*ret;j++)
+         pcm[j] = FLOAT2INT16 (out[j]);
+
+   RESTORE_STACK;
+   return ret;
+}
+
+#endif
+#endif /* CUSTOM_MODES */
+
+int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...)
+{
+   va_list ap;
+
+   va_start(ap, request);
+   switch (request)
+   {
+      case CELT_SET_START_BAND_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<0 || value>=st->mode->nbEBands)
+            goto bad_arg;
+         st->start = value;
+      }
+      break;
+      case CELT_SET_END_BAND_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<1 || value>st->mode->nbEBands)
+            goto bad_arg;
+         st->end = value;
+      }
+      break;
+      case CELT_SET_CHANNELS_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<1 || value>2)
+            goto bad_arg;
+         st->stream_channels = value;
+      }
+      break;
+      case CELT_GET_AND_CLEAR_ERROR_REQUEST:
+      {
+         opus_int32 *value = va_arg(ap, opus_int32*);
+         if (value==NULL)
+            goto bad_arg;
+         *value=st->error;
+         st->error = 0;
+      }
+      break;
+      case OPUS_GET_LOOKAHEAD_REQUEST:
+      {
+         opus_int32 *value = va_arg(ap, opus_int32*);
+         if (value==NULL)
+            goto bad_arg;
+         *value = st->overlap/st->downsample;
+      }
+      break;
+      case OPUS_RESET_STATE:
+      {
+         int i;
+         opus_val16 *lpc, *oldBandE, *oldLogE, *oldLogE2;
+         lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+st->overlap)*st->channels);
+         oldBandE = lpc+st->channels*LPC_ORDER;
+         oldLogE = oldBandE + 2*st->mode->nbEBands;
+         oldLogE2 = oldLogE + 2*st->mode->nbEBands;
+         OPUS_CLEAR((char*)&st->DECODER_RESET_START,
+               opus_custom_decoder_get_size(st->mode, st->channels)-
+               ((char*)&st->DECODER_RESET_START - (char*)st));
+         for (i=0;i<2*st->mode->nbEBands;i++)
+            oldLogE[i]=oldLogE2[i]=-QCONST16(28.f,DB_SHIFT);
+      }
+      break;
+      case OPUS_GET_PITCH_REQUEST:
+      {
+         opus_int32 *value = va_arg(ap, opus_int32*);
+         if (value==NULL)
+            goto bad_arg;
+         *value = st->postfilter_period;
+      }
+      break;
+      case CELT_GET_MODE_REQUEST:
+      {
+         const CELTMode ** value = va_arg(ap, const CELTMode**);
+         if (value==0)
+            goto bad_arg;
+         *value=st->mode;
+      }
+      break;
+      case CELT_SET_SIGNALLING_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->signalling = value;
+      }
+      break;
+      case OPUS_GET_FINAL_RANGE_REQUEST:
+      {
+         opus_uint32 * value = va_arg(ap, opus_uint32 *);
+         if (value==0)
+            goto bad_arg;
+         *value=st->rng;
+      }
+      break;
+      default:
+         goto bad_request;
+   }
+   va_end(ap);
+   return OPUS_OK;
+bad_arg:
+   va_end(ap);
+   return OPUS_BAD_ARG;
+bad_request:
+      va_end(ap);
+  return OPUS_UNIMPLEMENTED;
+}
diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
new file mode 100644
index 0000000..59dcc5c
--- /dev/null
+++ b/celt/celt_encoder.c
@@ -0,0 +1,2331 @@
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2010 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define CELT_ENCODER_C
+
+#include "cpu_support.h"
+#include "os_support.h"
+#include "mdct.h"
+#include <math.h>
+#include "celt.h"
+#include "pitch.h"
+#include "bands.h"
+#include "modes.h"
+#include "entcode.h"
+#include "quant_bands.h"
+#include "rate.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "float_cast.h"
+#include <stdarg.h>
+#include "celt_lpc.h"
+#include "vq.h"
+
+
+/** Encoder state
+ @brief Encoder state
+ */
+struct OpusCustomEncoder {
+   const OpusCustomMode *mode;     /**< Mode used by the encoder */
+   int overlap;
+   int channels;
+   int stream_channels;
+
+   int force_intra;
+   int clip;
+   int disable_pf;
+   int complexity;
+   int upsample;
+   int start, end;
+
+   opus_int32 bitrate;
+   int vbr;
+   int signalling;
+   int constrained_vbr;      /* If zero, VBR can do whatever it likes with the rate */
+   int loss_rate;
+   int lsb_depth;
+   int variable_duration;
+   int lfe;
+   int arch;
+
+   /* Everything beyond this point gets cleared on a reset */
+#define ENCODER_RESET_START rng
+
+   opus_uint32 rng;
+   int spread_decision;
+   opus_val32 delayedIntra;
+   int tonal_average;
+   int lastCodedBands;
+   int hf_average;
+   int tapset_decision;
+
+   int prefilter_period;
+   opus_val16 prefilter_gain;
+   int prefilter_tapset;
+#ifdef RESYNTH
+   int prefilter_period_old;
+   opus_val16 prefilter_gain_old;
+   int prefilter_tapset_old;
+#endif
+   int consec_transient;
+   AnalysisInfo analysis;
+
+   opus_val32 preemph_memE[2];
+   opus_val32 preemph_memD[2];
+
+   /* VBR-related parameters */
+   opus_int32 vbr_reservoir;
+   opus_int32 vbr_drift;
+   opus_int32 vbr_offset;
+   opus_int32 vbr_count;
+   opus_val32 overlap_max;
+   opus_val16 stereo_saving;
+   int intensity;
+   opus_val16 *energy_mask;
+   opus_val16 spec_avg;
+
+#ifdef RESYNTH
+   /* +MAX_PERIOD/2 to make space for overlap */
+   celt_sig syn_mem[2][2*MAX_PERIOD+MAX_PERIOD/2];
+#endif
+
+   celt_sig in_mem[1]; /* Size = channels*mode->overlap */
+   /* celt_sig prefilter_mem[],  Size = channels*COMBFILTER_MAXPERIOD */
+   /* opus_val16 oldBandE[],     Size = channels*mode->nbEBands */
+   /* opus_val16 oldLogE[],      Size = channels*mode->nbEBands */
+   /* opus_val16 oldLogE2[],     Size = channels*mode->nbEBands */
+};
+
+int celt_encoder_get_size(int channels)
+{
+   CELTMode *mode = opus_custom_mode_create(48000, 960, NULL);
+   return opus_custom_encoder_get_size(mode, channels);
+}
+
+OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_get_size(const CELTMode *mode, int channels)
+{
+   int size = sizeof(struct CELTEncoder)
+         + (channels*mode->overlap-1)*sizeof(celt_sig)    /* celt_sig in_mem[channels*mode->overlap]; */
+         + channels*COMBFILTER_MAXPERIOD*sizeof(celt_sig) /* celt_sig prefilter_mem[channels*COMBFILTER_MAXPERIOD]; */
+         + 3*channels*mode->nbEBands*sizeof(opus_val16);  /* opus_val16 oldBandE[channels*mode->nbEBands]; */
+                                                          /* opus_val16 oldLogE[channels*mode->nbEBands]; */
+                                                          /* opus_val16 oldLogE2[channels*mode->nbEBands]; */
+   return size;
+}
+
+#ifdef CUSTOM_MODES
+CELTEncoder *opus_custom_encoder_create(const CELTMode *mode, int channels, int *error)
+{
+   int ret;
+   CELTEncoder *st = (CELTEncoder *)opus_alloc(opus_custom_encoder_get_size(mode, channels));
+   /* init will handle the NULL case */
+   ret = opus_custom_encoder_init(st, mode, channels);
+   if (ret != OPUS_OK)
+   {
+      opus_custom_encoder_destroy(st);
+      st = NULL;
+   }
+   if (error)
+      *error = ret;
+   return st;
+}
+#endif /* CUSTOM_MODES */
+
+int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels)
+{
+   int ret;
+   ret = opus_custom_encoder_init(st, opus_custom_mode_create(48000, 960, NULL), channels);
+   if (ret != OPUS_OK)
+      return ret;
+   st->upsample = resampling_factor(sampling_rate);
+   return OPUS_OK;
+}
+
+OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels)
+{
+   if (channels < 0 || channels > 2)
+      return OPUS_BAD_ARG;
+
+   if (st==NULL || mode==NULL)
+      return OPUS_ALLOC_FAIL;
+
+   OPUS_CLEAR((char*)st, opus_custom_encoder_get_size(mode, channels));
+
+   st->mode = mode;
+   st->overlap = mode->overlap;
+   st->stream_channels = st->channels = channels;
+
+   st->upsample = 1;
+   st->start = 0;
+   st->end = st->mode->effEBands;
+   st->signalling = 1;
+
+   st->arch = opus_select_arch();
+
+   st->constrained_vbr = 1;
+   st->clip = 1;
+
+   st->bitrate = OPUS_BITRATE_MAX;
+   st->vbr = 0;
+   st->force_intra  = 0;
+   st->complexity = 5;
+   st->lsb_depth=24;
+
+   opus_custom_encoder_ctl(st, OPUS_RESET_STATE);
+
+   return OPUS_OK;
+}
+
+#ifdef CUSTOM_MODES
+void opus_custom_encoder_destroy(CELTEncoder *st)
+{
+   opus_free(st);
+}
+#endif /* CUSTOM_MODES */
+
+
+static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int C,
+                              opus_val16 *tf_estimate, int *tf_chan)
+{
+   int i;
+   VARDECL(opus_val16, tmp);
+   opus_val32 mem0,mem1;
+   int is_transient = 0;
+   opus_int32 mask_metric = 0;
+   int c;
+   opus_val16 tf_max;
+   int len2;
+   /* Table of 6*64/x, trained on real data to minimize the average error */
+   static const unsigned char inv_table[128] = {
+         255,255,156,110, 86, 70, 59, 51, 45, 40, 37, 33, 31, 28, 26, 25,
+          23, 22, 21, 20, 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 12, 12,
+          12, 12, 11, 11, 11, 10, 10, 10,  9,  9,  9,  9,  9,  9,  8,  8,
+           8,  8,  8,  7,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,
+           6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,
+           5,  5,  5,  5,  5,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+           4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  3,  3,
+           3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,
+   };
+   SAVE_STACK;
+   ALLOC(tmp, len, opus_val16);
+
+   len2=len/2;
+   tf_max = 0;
+   for (c=0;c<C;c++)
+   {
+      opus_val32 mean;
+      opus_int32 unmask=0;
+      opus_val32 norm;
+      opus_val16 maxE;
+      mem0=0;
+      mem1=0;
+      /* High-pass filter: (1 - 2*z^-1 + z^-2) / (1 - z^-1 + .5*z^-2) */
+      for (i=0;i<len;i++)
+      {
+         opus_val32 x,y;
+         x = SHR32(in[i+c*len],SIG_SHIFT);
+         y = ADD32(mem0, x);
+#ifdef FIXED_POINT
+         mem0 = mem1 + y - SHL32(x,1);
+         mem1 = x - SHR32(y,1);
+#else
+         mem0 = mem1 + y - 2*x;
+         mem1 = x - .5f*y;
+#endif
+         tmp[i] = EXTRACT16(SHR32(y,2));
+         /*printf("%f ", tmp[i]);*/
+      }
+      /*printf("\n");*/
+      /* First few samples are bad because we don't propagate the memory */
+      for (i=0;i<12;i++)
+         tmp[i] = 0;
+
+#ifdef FIXED_POINT
+      /* Normalize tmp to max range */
+      {
+         int shift=0;
+         shift = 14-celt_ilog2(1+celt_maxabs16(tmp, len));
+         if (shift!=0)
+         {
+            for (i=0;i<len;i++)
+               tmp[i] = SHL16(tmp[i], shift);
+         }
+      }
+#endif
+
+      mean=0;
+      mem0=0;
+      /* Grouping by two to reduce complexity */
+      /* Forward pass to compute the post-echo threshold*/
+      for (i=0;i<len2;i++)
+      {
+         opus_val16 x2 = PSHR32(MULT16_16(tmp[2*i],tmp[2*i]) + MULT16_16(tmp[2*i+1],tmp[2*i+1]),16);
+         mean += x2;
+#ifdef FIXED_POINT
+         /* FIXME: Use PSHR16() instead */
+         tmp[i] = mem0 + PSHR32(x2-mem0,4);
+#else
+         tmp[i] = mem0 + MULT16_16_P15(QCONST16(.0625f,15),x2-mem0);
+#endif
+         mem0 = tmp[i];
+      }
+
+      mem0=0;
+      maxE=0;
+      /* Backward pass to compute the pre-echo threshold */
+      for (i=len2-1;i>=0;i--)
+      {
+#ifdef FIXED_POINT
+         /* FIXME: Use PSHR16() instead */
+         tmp[i] = mem0 + PSHR32(tmp[i]-mem0,3);
+#else
+         tmp[i] = mem0 + MULT16_16_P15(QCONST16(0.125f,15),tmp[i]-mem0);
+#endif
+         mem0 = tmp[i];
+         maxE = MAX16(maxE, mem0);
+      }
+      /*for (i=0;i<len2;i++)printf("%f ", tmp[i]/mean);printf("\n");*/
+
+      /* Compute the ratio of the "frame energy" over the harmonic mean of the energy.
+         This essentially corresponds to a bitrate-normalized temporal noise-to-mask
+         ratio */
+
+      /* As a compromise with the old transient detector, frame energy is the
+         geometric mean of the energy and half the max */
+#ifdef FIXED_POINT
+      /* Costs two sqrt() to avoid overflows */
+      mean = MULT16_16(celt_sqrt(mean), celt_sqrt(MULT16_16(maxE,len2>>1)));
+#else
+      mean = celt_sqrt(mean * maxE*.5*len2);
+#endif
+      /* Inverse of the mean energy in Q15+6 */
+      norm = SHL32(EXTEND32(len2),6+14)/ADD32(EPSILON,SHR32(mean,1));
+      /* Compute harmonic mean discarding the unreliable boundaries
+         The data is smooth, so we only take 1/4th of the samples */
+      unmask=0;
+      for (i=12;i<len2-5;i+=4)
+      {
+         int id;
+#ifdef FIXED_POINT
+         id = IMAX(0,IMIN(127,MULT16_32_Q15(tmp[i],norm))); /* Do not round to nearest */
+#else
+         id = IMAX(0,IMIN(127,(int)floor(64*norm*tmp[i]))); /* Do not round to nearest */
+#endif
+         unmask += inv_table[id];
+      }
+      /*printf("%d\n", unmask);*/
+      /* Normalize, compensate for the 1/4th of the sample and the factor of 6 in the inverse table */
+      unmask = 64*unmask*4/(6*(len2-17));
+      if (unmask>mask_metric)
+      {
+         *tf_chan = c;
+         mask_metric = unmask;
+      }
+   }
+   is_transient = mask_metric>200;
+
+   /* Arbitrary metric for VBR boost */
+   tf_max = MAX16(0,celt_sqrt(27*mask_metric)-42);
+   /* *tf_estimate = 1 + MIN16(1, sqrt(MAX16(0, tf_max-30))/20); */
+   *tf_estimate = celt_sqrt(MAX16(0, SHL32(MULT16_16(QCONST16(0.0069,14),MIN16(163,tf_max)),14)-QCONST32(0.139,28)));
+   /*printf("%d %f\n", tf_max, mask_metric);*/
+   RESTORE_STACK;
+#ifdef FUZZING
+   is_transient = rand()&0x1;
+#endif
+   /*printf("%d %f %d\n", is_transient, (float)*tf_estimate, tf_max);*/
+   return is_transient;
+}
+
+/* Looks for sudden increases of energy to decide whether we need to patch
+   the transient decision */
+int patch_transient_decision(opus_val16 *newE, opus_val16 *oldE, int nbEBands,
+      int end, int C)
+{
+   int i, c;
+   opus_val32 mean_diff=0;
+   opus_val16 spread_old[26];
+   /* Apply an aggressive (-6 dB/Bark) spreading function to the old frame to
+      avoid false detection caused by irrelevant bands */
+   if (C==1)
+   {
+      spread_old[0] = oldE[0];
+      for (i=1;i<end;i++)
+         spread_old[i] = MAX16(spread_old[i-1]-QCONST16(1.0f, DB_SHIFT), oldE[i]);
+   } else {
+      spread_old[0] = MAX16(oldE[0],oldE[nbEBands]);
+      for (i=1;i<end;i++)
+         spread_old[i] = MAX16(spread_old[i-1]-QCONST16(1.0f, DB_SHIFT),
+                               MAX16(oldE[i],oldE[i+nbEBands]));
+   }
+   for (i=end-2;i>=0;i--)
+      spread_old[i] = MAX16(spread_old[i], spread_old[i+1]-QCONST16(1.0f, DB_SHIFT));
+   /* Compute mean increase */
+   c=0; do {
+      for (i=2;i<end-1;i++)
+      {
+         opus_val16 x1, x2;
+         x1 = MAX16(0, newE[i]);
+         x2 = MAX16(0, spread_old[i]);
+         mean_diff = ADD32(mean_diff, EXTEND32(MAX16(0, SUB16(x1, x2))));
+      }
+   } while (++c<C);
+   mean_diff = DIV32(mean_diff, C*(end-3));
+   /*printf("%f %f %d\n", mean_diff, max_diff, count);*/
+   return mean_diff > QCONST16(1.f, DB_SHIFT);
+}
+
+/** Apply window and compute the MDCT for all sub-frames and
+    all channels in a frame */
+static void compute_mdcts(const CELTMode *mode, int shortBlocks, celt_sig * OPUS_RESTRICT in,
+                          celt_sig * OPUS_RESTRICT out, int C, int CC, int LM, int upsample)
+{
+   const int overlap = OVERLAP(mode);
+   int N;
+   int B;
+   int shift;
+   int i, b, c;
+   if (shortBlocks)
+   {
+      B = shortBlocks;
+      N = mode->shortMdctSize;
+      shift = mode->maxLM;
+   } else {
+      B = 1;
+      N = mode->shortMdctSize<<LM;
+      shift = mode->maxLM-LM;
+   }
+   c=0; do {
+      for (b=0;b<B;b++)
+      {
+         /* Interleaving the sub-frames while doing the MDCTs */
+         clt_mdct_forward(&mode->mdct, in+c*(B*N+overlap)+b*N, &out[b+c*N*B], mode->window, overlap, shift, B);
+      }
+   } while (++c<CC);
+   if (CC==2&&C==1)
+   {
+      for (i=0;i<B*N;i++)
+         out[i] = ADD32(HALF32(out[i]), HALF32(out[B*N+i]));
+   }
+   if (upsample != 1)
+   {
+      c=0; do
+      {
+         int bound = B*N/upsample;
+         for (i=0;i<bound;i++)
+            out[c*B*N+i] *= upsample;
+         for (;i<B*N;i++)
+            out[c*B*N+i] = 0;
+      } while (++c<C);
+   }
+}
+
+
+void preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT inp,
+                        int N, int CC, int upsample, const opus_val16 *coef, celt_sig *mem, int clip)
+{
+   int i;
+   opus_val16 coef0;
+   celt_sig m;
+   int Nu;
+
+   coef0 = coef[0];
+
+
+   Nu = N/upsample;
+   if (upsample!=1)
+   {
+      for (i=0;i<N;i++)
+         inp[i] = 0;
+   }
+   for (i=0;i<Nu;i++)
+   {
+      celt_sig x;
+
+      x = SCALEIN(pcmp[CC*i]);
+#ifndef FIXED_POINT
+      /* Replace NaNs with zeros */
+      if (!(x==x))
+         x = 0;
+#endif
+      inp[i*upsample] = x;
+   }
+
+#ifndef FIXED_POINT
+   if (clip)
+   {
+      /* Clip input to avoid encoding non-portable files */
+      for (i=0;i<Nu;i++)
+         inp[i*upsample] = MAX32(-65536.f, MIN32(65536.f,inp[i*upsample]));
+   }
+#endif
+   m = *mem;
+#ifdef CUSTOM_MODES
+   if (coef[1] != 0)
+   {
+      opus_val16 coef1 = coef[1];
+      opus_val16 coef2 = coef[2];
+      for (i=0;i<N;i++)
+      {
+         opus_val16 x, tmp;
+         x = inp[i];
+         /* Apply pre-emphasis */
+         tmp = MULT16_16(coef2, x);
+         inp[i] = tmp + m;
+         m = MULT16_32_Q15(coef1, inp[i]) - MULT16_32_Q15(coef0, tmp);
+      }
+   } else
+#endif
+   {
+      for (i=0;i<N;i++)
+      {
+         celt_sig x;
+         x = SHL32(inp[i], SIG_SHIFT);
+         /* Apply pre-emphasis */
+         inp[i] = x + m;
+         m = - MULT16_32_Q15(coef0, x);
+      }
+   }
+   *mem = m;
+}
+
+
+
+static opus_val32 l1_metric(const celt_norm *tmp, int N, int LM, opus_val16 bias)
+{
+   int i;
+   opus_val32 L1;
+   L1 = 0;
+   for (i=0;i<N;i++)
+      L1 += EXTEND32(ABS16(tmp[i]));
+   /* When in doubt, prefer good freq resolution */
+   L1 = MAC16_32_Q15(L1, LM*bias, L1);
+   return L1;
+
+}
+
+static int tf_analysis(const CELTMode *m, int len, int isTransient,
+      int *tf_res, int lambda, celt_norm *X, int N0, int LM,
+      int *tf_sum, opus_val16 tf_estimate, int tf_chan)
+{
+   int i;
+   VARDECL(int, metric);
+   int cost0;
+   int cost1;
+   VARDECL(int, path0);
+   VARDECL(int, path1);
+   VARDECL(celt_norm, tmp);
+   VARDECL(celt_norm, tmp_1);
+   int sel;
+   int selcost[2];
+   int tf_select=0;
+   opus_val16 bias;
+
+   SAVE_STACK;
+   bias = MULT16_16_Q14(QCONST16(.04f,15), MAX16(-QCONST16(.25f,14), QCONST16(.5f,14)-tf_estimate));
+   /*printf("%f ", bias);*/
+
+   ALLOC(metric, len, int);
+   ALLOC(tmp, (m->eBands[len]-m->eBands[len-1])<<LM, celt_norm);
+   ALLOC(tmp_1, (m->eBands[len]-m->eBands[len-1])<<LM, celt_norm);
+   ALLOC(path0, len, int);
+   ALLOC(path1, len, int);
+
+   *tf_sum = 0;
+   for (i=0;i<len;i++)
+   {
+      int j, k, N;
+      int narrow;
+      opus_val32 L1, best_L1;
+      int best_level=0;
+      N = (m->eBands[i+1]-m->eBands[i])<<LM;
+      /* band is too narrow to be split down to LM=-1 */
+      narrow = (m->eBands[i+1]-m->eBands[i])==1;
+      for (j=0;j<N;j++)
+         tmp[j] = X[tf_chan*N0 + j+(m->eBands[i]<<LM)];
+      /* Just add the right channel if we're in stereo */
+      /*if (C==2)
+         for (j=0;j<N;j++)
+            tmp[j] = ADD16(SHR16(tmp[j], 1),SHR16(X[N0+j+(m->eBands[i]<<LM)], 1));*/
+      L1 = l1_metric(tmp, N, isTransient ? LM : 0, bias);
+      best_L1 = L1;
+      /* Check the -1 case for transients */
+      if (isTransient && !narrow)
+      {
+         for (j=0;j<N;j++)
+            tmp_1[j] = tmp[j];
+         haar1(tmp_1, N>>LM, 1<<LM);
+         L1 = l1_metric(tmp_1, N, LM+1, bias);
+         if (L1<best_L1)
+         {
+            best_L1 = L1;
+            best_level = -1;
+         }
+      }
+      /*printf ("%f ", L1);*/
+      for (k=0;k<LM+!(isTransient||narrow);k++)
+      {
+         int B;
+
+         if (isTransient)
+            B = (LM-k-1);
+         else
+            B = k+1;
+
+         haar1(tmp, N>>k, 1<<k);
+
+         L1 = l1_metric(tmp, N, B, bias);
+
+         if (L1 < best_L1)
+         {
+            best_L1 = L1;
+            best_level = k+1;
+         }
+      }
+      /*printf ("%d ", isTransient ? LM-best_level : best_level);*/
+      /* metric is in Q1 to be able to select the mid-point (-0.5) for narrower bands */
+      if (isTransient)
+         metric[i] = 2*best_level;
+      else
+         metric[i] = -2*best_level;
+      *tf_sum += (isTransient ? LM : 0) - metric[i]/2;
+      /* For bands that can't be split to -1, set the metric to the half-way point to avoid
+         biasing the decision */
+      if (narrow && (metric[i]==0 || metric[i]==-2*LM))
+         metric[i]-=1;
+      /*printf("%d ", metric[i]);*/
+   }
+   /*printf("\n");*/
+   /* Search for the optimal tf resolution, including tf_select */
+   tf_select = 0;
+   for (sel=0;sel<2;sel++)
+   {
+      cost0 = 0;
+      cost1 = isTransient ? 0 : lambda;
+      for (i=1;i<len;i++)
+      {
+         int curr0, curr1;
+         curr0 = IMIN(cost0, cost1 + lambda);
+         curr1 = IMIN(cost0 + lambda, cost1);
+         cost0 = curr0 + abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*sel+0]);
+         cost1 = curr1 + abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*sel+1]);
+      }
+      cost0 = IMIN(cost0, cost1);
+      selcost[sel]=cost0;
+   }
+   /* For now, we're conservative and only allow tf_select=1 for transients.
+    * If tests confirm it's useful for non-transients, we could allow it. */
+   if (selcost[1]<selcost[0] && isTransient)
+      tf_select=1;
+   cost0 = 0;
+   cost1 = isTransient ? 0 : lambda;
+   /* Viterbi forward pass */
+   for (i=1;i<len;i++)
+   {
+      int curr0, curr1;
+      int from0, from1;
+
+      from0 = cost0;
+      from1 = cost1 + lambda;
+      if (from0 < from1)
+      {
+         curr0 = from0;
+         path0[i]= 0;
+      } else {
+         curr0 = from1;
+         path0[i]= 1;
+      }
+
+      from0 = cost0 + lambda;
+      from1 = cost1;
+      if (from0 < from1)
+      {
+         curr1 = from0;
+         path1[i]= 0;
+      } else {
+         curr1 = from1;
+         path1[i]= 1;
+      }
+      cost0 = curr0 + abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*tf_select+0]);
+      cost1 = curr1 + abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*tf_select+1]);
+   }
+   tf_res[len-1] = cost0 < cost1 ? 0 : 1;
+   /* Viterbi backward pass to check the decisions */
+   for (i=len-2;i>=0;i--)
+   {
+      if (tf_res[i+1] == 1)
+         tf_res[i] = path1[i+1];
+      else
+         tf_res[i] = path0[i+1];
+   }
+   /*printf("%d %f\n", *tf_sum, tf_estimate);*/
+   RESTORE_STACK;
+#ifdef FUZZING
+   tf_select = rand()&0x1;
+   tf_res[0] = rand()&0x1;
+   for (i=1;i<len;i++)
+      tf_res[i] = tf_res[i-1] ^ ((rand()&0xF) == 0);
+#endif
+   return tf_select;
+}
+
+static void tf_encode(int start, int end, int isTransient, int *tf_res, int LM, int tf_select, ec_enc *enc)
+{
+   int curr, i;
+   int tf_select_rsv;
+   int tf_changed;
+   int logp;
+   opus_uint32 budget;
+   opus_uint32 tell;
+   budget = enc->storage*8;
+   tell = ec_tell(enc);
+   logp = isTransient ? 2 : 4;
+   /* Reserve space to code the tf_select decision. */
+   tf_select_rsv = LM>0 && tell+logp+1 <= budget;
+   budget -= tf_select_rsv;
+   curr = tf_changed = 0;
+   for (i=start;i<end;i++)
+   {
+      if (tell+logp<=budget)
+      {
+         ec_enc_bit_logp(enc, tf_res[i] ^ curr, logp);
+         tell = ec_tell(enc);
+         curr = tf_res[i];
+         tf_changed |= curr;
+      }
+      else
+         tf_res[i] = curr;
+      logp = isTransient ? 4 : 5;
+   }
+   /* Only code tf_select if it would actually make a difference. */
+   if (tf_select_rsv &&
+         tf_select_table[LM][4*isTransient+0+tf_changed]!=
+         tf_select_table[LM][4*isTransient+2+tf_changed])
+      ec_enc_bit_logp(enc, tf_select, 1);
+   else
+      tf_select = 0;
+   for (i=start;i<end;i++)
+      tf_res[i] = tf_select_table[LM][4*isTransient+2*tf_select+tf_res[i]];
+   /*for(i=0;i<end;i++)printf("%d ", isTransient ? tf_res[i] : LM+tf_res[i]);printf("\n");*/
+}
+
+
+static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
+      const opus_val16 *bandLogE, int end, int LM, int C, int N0,
+      AnalysisInfo *analysis, opus_val16 *stereo_saving, opus_val16 tf_estimate,
+      int intensity, opus_val16 surround_trim)
+{
+   int i;
+   opus_val32 diff=0;
+   int c;
+   int trim_index = 5;
+   opus_val16 trim = QCONST16(5.f, 8);
+   opus_val16 logXC, logXC2;
+   if (C==2)
+   {
+      opus_val16 sum = 0; /* Q10 */
+      opus_val16 minXC; /* Q10 */
+      /* Compute inter-channel correlation for low frequencies */
+      for (i=0;i<8;i++)
+      {
+         int j;
+         opus_val32 partial = 0;
+         for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
+            partial = MAC16_16(partial, X[j], X[N0+j]);
+         sum = ADD16(sum, EXTRACT16(SHR32(partial, 18)));
+      }
+      sum = MULT16_16_Q15(QCONST16(1.f/8, 15), sum);
+      sum = MIN16(QCONST16(1.f, 10), ABS16(sum));
+      minXC = sum;
+      for (i=8;i<intensity;i++)
+      {
+         int j;
+         opus_val32 partial = 0;
+         for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
+            partial = MAC16_16(partial, X[j], X[N0+j]);
+         minXC = MIN16(minXC, ABS16(EXTRACT16(SHR32(partial, 18))));
+      }
+      minXC = MIN16(QCONST16(1.f, 10), ABS16(minXC));
+      /*printf ("%f\n", sum);*/
+      if (sum > QCONST16(.995f,10))
+         trim_index-=4;
+      else if (sum > QCONST16(.92f,10))
+         trim_index-=3;
+      else if (sum > QCONST16(.85f,10))
+         trim_index-=2;
+      else if (sum > QCONST16(.8f,10))
+         trim_index-=1;
+      /* mid-side savings estimations based on the LF average*/
+      logXC = celt_log2(QCONST32(1.001f, 20)-MULT16_16(sum, sum));
+      /* mid-side savings estimations based on min correlation */
+      logXC2 = MAX16(HALF16(logXC), celt_log2(QCONST32(1.001f, 20)-MULT16_16(minXC, minXC)));
+#ifdef FIXED_POINT
+      /* Compensate for Q20 vs Q14 input and convert output to Q8 */
+      logXC = PSHR32(logXC-QCONST16(6.f, DB_SHIFT),DB_SHIFT-8);
+      logXC2 = PSHR32(logXC2-QCONST16(6.f, DB_SHIFT),DB_SHIFT-8);
+#endif
+
+      trim += MAX16(-QCONST16(4.f, 8), MULT16_16_Q15(QCONST16(.75f,15),logXC));
+      *stereo_saving = MIN16(*stereo_saving + QCONST16(0.25f, 8), -HALF16(logXC2));
+   }
+
+   /* Estimate spectral tilt */
+   c=0; do {
+      for (i=0;i<end-1;i++)
+      {
+         diff += bandLogE[i+c*m->nbEBands]*(opus_int32)(2+2*i-end);
+      }
+   } while (++c<C);
+   diff /= C*(end-1);
+   /*printf("%f\n", diff);*/
+   if (diff > QCONST16(2.f, DB_SHIFT))
+      trim_index--;
+   if (diff > QCONST16(8.f, DB_SHIFT))
+      trim_index--;
+   if (diff < -QCONST16(4.f, DB_SHIFT))
+      trim_index++;
+   if (diff < -QCONST16(10.f, DB_SHIFT))
+      trim_index++;
+   trim -= MAX16(-QCONST16(2.f, 8), MIN16(QCONST16(2.f, 8), SHR16(diff+QCONST16(1.f, DB_SHIFT),DB_SHIFT-8)/6 ));
+   trim -= SHR16(surround_trim, DB_SHIFT-8);
+   trim -= 2*SHR16(tf_estimate, 14-8);
+#ifndef DISABLE_FLOAT_API
+   if (analysis->valid)
+   {
+      trim -= MAX16(-QCONST16(2.f, 8), MIN16(QCONST16(2.f, 8), QCONST16(2.f, 8)*(analysis->tonality_slope+.05f)));
+   }
+#endif
+
+#ifdef FIXED_POINT
+   trim_index = PSHR32(trim, 8);
+#else
+   trim_index = (int)floor(.5f+trim);
+#endif
+   if (trim_index<0)
+      trim_index = 0;
+   if (trim_index>10)
+      trim_index = 10;
+   /*printf("%d\n", trim_index);*/
+#ifdef FUZZING
+   trim_index = rand()%11;
+#endif
+   return trim_index;
+}
+
+static int stereo_analysis(const CELTMode *m, const celt_norm *X,
+      int LM, int N0)
+{
+   int i;
+   int thetas;
+   opus_val32 sumLR = EPSILON, sumMS = EPSILON;
+
+   /* Use the L1 norm to model the entropy of the L/R signal vs the M/S signal */
+   for (i=0;i<13;i++)
+   {
+      int j;
+      for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
+      {
+         opus_val32 L, R, M, S;
+         /* We cast to 32-bit first because of the -32768 case */
+         L = EXTEND32(X[j]);
+         R = EXTEND32(X[N0+j]);
+         M = ADD32(L, R);
+         S = SUB32(L, R);
+         sumLR = ADD32(sumLR, ADD32(ABS32(L), ABS32(R)));
+         sumMS = ADD32(sumMS, ADD32(ABS32(M), ABS32(S)));
+      }
+   }
+   sumMS = MULT16_32_Q15(QCONST16(0.707107f, 15), sumMS);
+   thetas = 13;
+   /* We don't need thetas for lower bands with LM<=1 */
+   if (LM<=1)
+      thetas -= 8;
+   return MULT16_32_Q15((m->eBands[13]<<(LM+1))+thetas, sumMS)
+         > MULT16_32_Q15(m->eBands[13]<<(LM+1), sumLR);
+}
+
+static opus_val16 dynalloc_analysis(const opus_val16 *bandLogE, const opus_val16 *bandLogE2,
+      int nbEBands, int start, int end, int C, int *offsets, int lsb_depth, const opus_int16 *logN,
+      int isTransient, int vbr, int constrained_vbr, const opus_int16 *eBands, int LM,
+      int effectiveBytes, opus_int32 *tot_boost_, int lfe, opus_val16 *surround_dynalloc)
+{
+   int i, c;
+   opus_int32 tot_boost=0;
+   opus_val16 maxDepth;
+   VARDECL(opus_val16, follower);
+   VARDECL(opus_val16, noise_floor);
+   SAVE_STACK;
+   ALLOC(follower, C*nbEBands, opus_val16);
+   ALLOC(noise_floor, C*nbEBands, opus_val16);
+   for (i=0;i<nbEBands;i++)
+      offsets[i] = 0;
+   /* Dynamic allocation code */
+   maxDepth=-QCONST16(31.9f, DB_SHIFT);
+   for (i=0;i<end;i++)
+   {
+      /* Noise floor must take into account eMeans, the depth, the width of the bands
+         and the preemphasis filter (approx. square of bark band ID) */
+      noise_floor[i] = MULT16_16(QCONST16(0.0625f, DB_SHIFT),logN[i])
+            +QCONST16(.5f,DB_SHIFT)+SHL16(9-lsb_depth,DB_SHIFT)-SHL16(eMeans[i],6)
+            +MULT16_16(QCONST16(.0062,DB_SHIFT),(i+5)*(i+5));
+   }
+   c=0;do
+   {
+      for (i=0;i<end;i++)
+         maxDepth = MAX16(maxDepth, bandLogE[c*nbEBands+i]-noise_floor[i]);
+   } while (++c<C);
+   /* Make sure that dynamic allocation can't make us bust the budget */
+   if (effectiveBytes > 50 && LM>=1 && !lfe)
+   {
+      int last=0;
+      c=0;do
+      {
+         follower[c*nbEBands] = bandLogE2[c*nbEBands];
+         for (i=1;i<end;i++)
+         {
+            /* The last band to be at least 3 dB higher than the previous one
+               is the last we'll consider. Otherwise, we run into problems on
+               bandlimited signals. */
+            if (bandLogE2[c*nbEBands+i] > bandLogE2[c*nbEBands+i-1]+QCONST16(.5f,DB_SHIFT))
+               last=i;
+            follower[c*nbEBands+i] = MIN16(follower[c*nbEBands+i-1]+QCONST16(1.5f,DB_SHIFT), bandLogE2[c*nbEBands+i]);
+         }
+         for (i=last-1;i>=0;i--)
+            follower[c*nbEBands+i] = MIN16(follower[c*nbEBands+i], MIN16(follower[c*nbEBands+i+1]+QCONST16(2.f,DB_SHIFT), bandLogE2[c*nbEBands+i]));
+         for (i=0;i<end;i++)
+            follower[c*nbEBands+i] = MAX16(follower[c*nbEBands+i], noise_floor[i]);
+      } while (++c<C);
+      if (C==2)
+      {
+         for (i=start;i<end;i++)
+         {
+            /* Consider 24 dB "cross-talk" */
+            follower[nbEBands+i] = MAX16(follower[nbEBands+i], follower[         i]-QCONST16(4.f,DB_SHIFT));
+            follower[         i] = MAX16(follower[         i], follower[nbEBands+i]-QCONST16(4.f,DB_SHIFT));
+            follower[i] = HALF16(MAX16(0, bandLogE[i]-follower[i]) + MAX16(0, bandLogE[nbEBands+i]-follower[nbEBands+i]));
+         }
+      } else {
+         for (i=start;i<end;i++)
+         {
+            follower[i] = MAX16(0, bandLogE[i]-follower[i]);
+         }
+      }
+      for (i=start;i<end;i++)
+         follower[i] = MAX16(follower[i], surround_dynalloc[i]);
+      /* For non-transient CBR/CVBR frames, halve the dynalloc contribution */
+      if ((!vbr || constrained_vbr)&&!isTransient)
+      {
+         for (i=start;i<end;i++)
+            follower[i] = HALF16(follower[i]);
+      }
+      for (i=start;i<end;i++)
+      {
+         int width;
+         int boost;
+         int boost_bits;
+
+         if (i<8)
+            follower[i] *= 2;
+         if (i>=12)
+            follower[i] = HALF16(follower[i]);
+         follower[i] = MIN16(follower[i], QCONST16(4, DB_SHIFT));
+
+         width = C*(eBands[i+1]-eBands[i])<<LM;
+         if (width<6)
+         {
+            boost = (int)SHR32(EXTEND32(follower[i]),DB_SHIFT);
+            boost_bits = boost*width<<BITRES;
+         } else if (width > 48) {
+            boost = (int)SHR32(EXTEND32(follower[i])*8,DB_SHIFT);
+            boost_bits = (boost*width<<BITRES)/8;
+         } else {
+            boost = (int)SHR32(EXTEND32(follower[i])*width/6,DB_SHIFT);
+            boost_bits = boost*6<<BITRES;
+         }
+         /* For CBR and non-transient CVBR frames, limit dynalloc to 1/4 of the bits */
+         if ((!vbr || (constrained_vbr&&!isTransient))
+               && (tot_boost+boost_bits)>>BITRES>>3 > effectiveBytes/4)
+         {
+            opus_int32 cap = ((effectiveBytes/4)<<BITRES<<3);
+            offsets[i] = cap-tot_boost;
+            tot_boost = cap;
+            break;
+         } else {
+            offsets[i] = boost;
+            tot_boost += boost_bits;
+         }
+      }
+   }
+   *tot_boost_ = tot_boost;
+   RESTORE_STACK;
+   return maxDepth;
+}
+
+
+static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem, int CC, int N,
+      int prefilter_tapset, int *pitch, opus_val16 *gain, int *qgain, int enabled, int nbAvailableBytes)
+{
+   int c;
+   VARDECL(celt_sig, _pre);
+   celt_sig *pre[2];
+   const CELTMode *mode;
+   int pitch_index;
+   opus_val16 gain1;
+   opus_val16 pf_threshold;
+   int pf_on;
+   int qg;
+   SAVE_STACK;
+
+   mode = st->mode;
+   ALLOC(_pre, CC*(N+COMBFILTER_MAXPERIOD), celt_sig);
+
+   pre[0] = _pre;
+   pre[1] = _pre + (N+COMBFILTER_MAXPERIOD);
+
+
+   c=0; do {
+      OPUS_COPY(pre[c], prefilter_mem+c*COMBFILTER_MAXPERIOD, COMBFILTER_MAXPERIOD);
+      OPUS_COPY(pre[c]+COMBFILTER_MAXPERIOD, in+c*(N+st->overlap)+st->overlap, N);
+   } while (++c<CC);
+
+   if (enabled)
+   {
+      VARDECL(opus_val16, pitch_buf);
+      ALLOC(pitch_buf, (COMBFILTER_MAXPERIOD+N)>>1, opus_val16);
+
+      pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC);
+      /* Don't search for the fir last 1.5 octave of the range because
+         there's too many false-positives due to short-term correlation */
+      pitch_search(pitch_buf+(COMBFILTER_MAXPERIOD>>1), pitch_buf, N,
+            COMBFILTER_MAXPERIOD-3*COMBFILTER_MINPERIOD, &pitch_index);
+      pitch_index = COMBFILTER_MAXPERIOD-pitch_index;
+
+      gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD,
+            N, &pitch_index, st->prefilter_period, st->prefilter_gain);
+      if (pitch_index > COMBFILTER_MAXPERIOD-2)
+         pitch_index = COMBFILTER_MAXPERIOD-2;
+      gain1 = MULT16_16_Q15(QCONST16(.7f,15),gain1);
+      /*printf("%d %d %f %f\n", pitch_change, pitch_index, gain1, st->analysis.tonality);*/
+      if (st->loss_rate>2)
+         gain1 = HALF32(gain1);
+      if (st->loss_rate>4)
+         gain1 = HALF32(gain1);
+      if (st->loss_rate>8)
+         gain1 = 0;
+   } else {
+      gain1 = 0;
+      pitch_index = COMBFILTER_MINPERIOD;
+   }
+
+   /* Gain threshold for enabling the prefilter/postfilter */
+   pf_threshold = QCONST16(.2f,15);
+
+   /* Adjusting the threshold based on rate and continuity */
+   if (abs(pitch_index-st->prefilter_period)*10>pitch_index)
+      pf_threshold += QCONST16(.2f,15);
+   if (nbAvailableBytes<25)
+      pf_threshold += QCONST16(.1f,15);
+   if (nbAvailableBytes<35)
+      pf_threshold += QCONST16(.1f,15);
+   if (st->prefilter_gain > QCONST16(.4f,15))
+      pf_threshold -= QCONST16(.1f,15);
+   if (st->prefilter_gain > QCONST16(.55f,15))
+      pf_threshold -= QCONST16(.1f,15);
+
+   /* Hard threshold at 0.2 */
+   pf_threshold = MAX16(pf_threshold, QCONST16(.2f,15));
+   if (gain1<pf_threshold)
+   {
+      gain1 = 0;
+      pf_on = 0;
+      qg = 0;
+   } else {
+      /*This block is not gated by a total bits check only because
+        of the nbAvailableBytes check above.*/
+      if (ABS16(gain1-st->prefilter_gain)<QCONST16(.1f,15))
+         gain1=st->prefilter_gain;
+
+#ifdef FIXED_POINT
+      qg = ((gain1+1536)>>10)/3-1;
+#else
+      qg = (int)floor(.5f+gain1*32/3)-1;
+#endif
+      qg = IMAX(0, IMIN(7, qg));
+      gain1 = QCONST16(0.09375f,15)*(qg+1);
+      pf_on = 1;
+   }
+   /*printf("%d %f\n", pitch_index, gain1);*/
+
+   c=0; do {
+      int offset = mode->shortMdctSize-st->overlap;
+      st->prefilter_period=IMAX(st->prefilter_period, COMBFILTER_MINPERIOD);
+      OPUS_COPY(in+c*(N+st->overlap), st->in_mem+c*(st->overlap), st->overlap);
+      if (offset)
+         comb_filter(in+c*(N+st->overlap)+st->overlap, pre[c]+COMBFILTER_MAXPERIOD,
+               st->prefilter_period, st->prefilter_period, offset, -st->prefilter_gain, -st->prefilter_gain,
+               st->prefilter_tapset, st->prefilter_tapset, NULL, 0);
+
+      comb_filter(in+c*(N+st->overlap)+st->overlap+offset, pre[c]+COMBFILTER_MAXPERIOD+offset,
+            st->prefilter_period, pitch_index, N-offset, -st->prefilter_gain, -gain1,
+            st->prefilter_tapset, prefilter_tapset, mode->window, st->overlap);
+      OPUS_COPY(st->in_mem+c*(st->overlap), in+c*(N+st->overlap)+N, st->overlap);
+
+      if (N>COMBFILTER_MAXPERIOD)
+      {
+         OPUS_MOVE(prefilter_mem+c*COMBFILTER_MAXPERIOD, pre[c]+N, COMBFILTER_MAXPERIOD);
+      } else {
+         OPUS_MOVE(prefilter_mem+c*COMBFILTER_MAXPERIOD, prefilter_mem+c*COMBFILTER_MAXPERIOD+N, COMBFILTER_MAXPERIOD-N);
+         OPUS_MOVE(prefilter_mem+c*COMBFILTER_MAXPERIOD+COMBFILTER_MAXPERIOD-N, pre[c]+COMBFILTER_MAXPERIOD, N);
+      }
+   } while (++c<CC);
+
+   RESTORE_STACK;
+   *gain = gain1;
+   *pitch = pitch_index;
+   *qgain = qg;
+   return pf_on;
+}
+
+static int compute_vbr(const CELTMode *mode, AnalysisInfo *analysis, opus_int32 base_target,
+      int LM, opus_int32 bitrate, int lastCodedBands, int C, int intensity,
+      int constrained_vbr, opus_val16 stereo_saving, int tot_boost,
+      opus_val16 tf_estimate, int pitch_change, opus_val16 maxDepth,
+      int variable_duration, int lfe, int has_surround_mask, opus_val16 surround_masking,
+      opus_val16 temporal_vbr)
+{
+   /* The target rate in 8th bits per frame */
+   opus_int32 target;
+   int coded_bins;
+   int coded_bands;
+   opus_val16 tf_calibration;
+   int nbEBands;
+   const opus_int16 *eBands;
+
+   nbEBands = mode->nbEBands;
+   eBands = mode->eBands;
+
+   coded_bands = lastCodedBands ? lastCodedBands : nbEBands;
+   coded_bins = eBands[coded_bands]<<LM;
+   if (C==2)
+      coded_bins += eBands[IMIN(intensity, coded_bands)]<<LM;
+
+   target = base_target;
+
+   /*printf("%f %f %f %f %d %d ", st->analysis.activity, st->analysis.tonality, tf_estimate, st->stereo_saving, tot_boost, coded_bands);*/
+#ifndef DISABLE_FLOAT_API
+   if (analysis->valid && analysis->activity<.4)
+      target -= (opus_int32)((coded_bins<<BITRES)*(.4f-analysis->activity));
+#endif
+   /* Stereo savings */
+   if (C==2)
+   {
+      int coded_stereo_bands;
+      int coded_stereo_dof;
+      opus_val16 max_frac;
+      coded_stereo_bands = IMIN(intensity, coded_bands);
+      coded_stereo_dof = (eBands[coded_stereo_bands]<<LM)-coded_stereo_bands;
+      /* Maximum fraction of the bits we can save if the signal is mono. */
+      max_frac = DIV32_16(MULT16_16(QCONST16(0.8f, 15), coded_stereo_dof), coded_bins);
+      /*printf("%d %d %d ", coded_stereo_dof, coded_bins, tot_boost);*/
+      target -= (opus_int32)MIN32(MULT16_32_Q15(max_frac,target),
+                      SHR32(MULT16_16(stereo_saving-QCONST16(0.1f,8),(coded_stereo_dof<<BITRES)),8));
+   }
+   /* Boost the rate according to dynalloc (minus the dynalloc average for calibration). */
+   target += tot_boost-(16<<LM);
+   /* Apply transient boost, compensating for average boost. */
+   tf_calibration = variable_duration==OPUS_FRAMESIZE_VARIABLE ?
+                    QCONST16(0.02f,14) : QCONST16(0.04f,14);
+   target += (opus_int32)SHL32(MULT16_32_Q15(tf_estimate-tf_calibration, target),1);
+
+#ifndef DISABLE_FLOAT_API
+   /* Apply tonality boost */
+   if (analysis->valid && !lfe)
+   {
+      opus_int32 tonal_target;
+      float tonal;
+
+      /* Tonality boost (compensating for the average). */
+      tonal = MAX16(0.f,analysis->tonality-.15f)-0.09f;
+      tonal_target = target + (opus_int32)((coded_bins<<BITRES)*1.2f*tonal);
+      if (pitch_change)
+         tonal_target +=  (opus_int32)((coded_bins<<BITRES)*.8f);
+      /*printf("%f %f ", analysis->tonality, tonal);*/
+      target = tonal_target;
+   }
+#endif
+
+   if (has_surround_mask&&!lfe)
+   {
+      opus_int32 surround_target = target + (opus_int32)SHR32(MULT16_16(surround_masking,coded_bins<<BITRES), DB_SHIFT);
+      /*printf("%f %d %d %d %d %d %d ", surround_masking, coded_bins, st->end, st->intensity, surround_target, target, st->bitrate);*/
+      target = IMAX(target/4, surround_target);
+   }
+
+   {
+      opus_int32 floor_depth;
+      int bins;
+      bins = eBands[nbEBands-2]<<LM;
+      /*floor_depth = SHR32(MULT16_16((C*bins<<BITRES),celt_log2(SHL32(MAX16(1,sample_max),13))), DB_SHIFT);*/
+      floor_depth = (opus_int32)SHR32(MULT16_16((C*bins<<BITRES),maxDepth), DB_SHIFT);
+      floor_depth = IMAX(floor_depth, target>>2);
+      target = IMIN(target, floor_depth);
+      /*printf("%f %d\n", maxDepth, floor_depth);*/
+   }
+
+   if ((!has_surround_mask||lfe) && (constrained_vbr || bitrate<64000))
+   {
+      opus_val16 rate_factor;
+#ifdef FIXED_POINT
+      rate_factor = MAX16(0,(bitrate-32000));
+#else
+      rate_factor = MAX16(0,(1.f/32768)*(bitrate-32000));
+#endif
+      if (constrained_vbr)
+         rate_factor = MIN16(rate_factor, QCONST16(0.67f, 15));
+      target = base_target + (opus_int32)MULT16_32_Q15(rate_factor, target-base_target);
+
+   }
+
+   if (!has_surround_mask && tf_estimate < QCONST16(.2f, 14))
+   {
+      opus_val16 amount;
+      opus_val16 tvbr_factor;
+      amount = MULT16_16_Q15(QCONST16(.0000031f, 30), IMAX(0, IMIN(32000, 96000-bitrate)));
+      tvbr_factor = SHR32(MULT16_16(temporal_vbr, amount), DB_SHIFT);
+      target += (opus_int32)MULT16_32_Q15(tvbr_factor, target);
+   }
+
+   /* Don't allow more than doubling the rate */
+   target = IMIN(2*base_target, target);
+
+   return target;
+}
+
+int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc)
+{
+   int i, c, N;
+   opus_int32 bits;
+   ec_enc _enc;
+   VARDECL(celt_sig, in);
+   VARDECL(celt_sig, freq);
+   VARDECL(celt_norm, X);
+   VARDECL(celt_ener, bandE);
+   VARDECL(opus_val16, bandLogE);
+   VARDECL(opus_val16, bandLogE2);
+   VARDECL(int, fine_quant);
+   VARDECL(opus_val16, error);
+   VARDECL(int, pulses);
+   VARDECL(int, cap);
+   VARDECL(int, offsets);
+   VARDECL(int, fine_priority);
+   VARDECL(int, tf_res);
+   VARDECL(unsigned char, collapse_masks);
+   celt_sig *prefilter_mem;
+   opus_val16 *oldBandE, *oldLogE, *oldLogE2;
+   int shortBlocks=0;
+   int isTransient=0;
+   const int CC = st->channels;
+   const int C = st->stream_channels;
+   int LM, M;
+   int tf_select;
+   int nbFilledBytes, nbAvailableBytes;
+   int effEnd;
+   int codedBands;
+   int tf_sum;
+   int alloc_trim;
+   int pitch_index=COMBFILTER_MINPERIOD;
+   opus_val16 gain1 = 0;
+   int dual_stereo=0;
+   int effectiveBytes;
+   int dynalloc_logp;
+   opus_int32 vbr_rate;
+   opus_int32 total_bits;
+   opus_int32 total_boost;
+   opus_int32 balance;
+   opus_int32 tell;
+   int prefilter_tapset=0;
+   int pf_on;
+   int anti_collapse_rsv;
+   int anti_collapse_on=0;
+   int silence=0;
+   int tf_chan = 0;
+   opus_val16 tf_estimate;
+   int pitch_change=0;
+   opus_int32 tot_boost;
+   opus_val32 sample_max;
+   opus_val16 maxDepth;
+   const OpusCustomMode *mode;
+   int nbEBands;
+   int overlap;
+   const opus_int16 *eBands;
+   int secondMdct;
+   int signalBandwidth;
+   int transient_got_disabled=0;
+   opus_val16 surround_masking=0;
+   opus_val16 temporal_vbr=0;
+   opus_val16 surround_trim = 0;
+   VARDECL(opus_val16, surround_dynalloc);
+   ALLOC_STACK;
+
+   mode = st->mode;
+   nbEBands = mode->nbEBands;
+   overlap = mode->overlap;
+   eBands = mode->eBands;
+   tf_estimate = 0;
+   if (nbCompressedBytes<2 || pcm==NULL)
+     return OPUS_BAD_ARG;
+
+   frame_size *= st->upsample;
+   for (LM=0;LM<=mode->maxLM;LM++)
+      if (mode->shortMdctSize<<LM==frame_size)
+         break;
+   if (LM>mode->maxLM)
+      return OPUS_BAD_ARG;
+   M=1<<LM;
+   N = M*mode->shortMdctSize;
+
+   prefilter_mem = st->in_mem+CC*(st->overlap);
+   oldBandE = (opus_val16*)(st->in_mem+CC*(st->overlap+COMBFILTER_MAXPERIOD));
+   oldLogE = oldBandE + CC*nbEBands;
+   oldLogE2 = oldLogE + CC*nbEBands;
+
+   if (enc==NULL)
+   {
+      tell=1;
+      nbFilledBytes=0;
+   } else {
+      tell=ec_tell(enc);
+      nbFilledBytes=(tell+4)>>3;
+   }
+
+#ifdef CUSTOM_MODES
+   if (st->signalling && enc==NULL)
+   {
+      int tmp = (mode->effEBands-st->end)>>1;
+      st->end = IMAX(1, mode->effEBands-tmp);
+      compressed[0] = tmp<<5;
+      compressed[0] |= LM<<3;
+      compressed[0] |= (C==2)<<2;
+      /* Convert "standard mode" to Opus header */
+      if (mode->Fs==48000 && mode->shortMdctSize==120)
+      {
+         int c0 = toOpus(compressed[0]);
+         if (c0<0)
+            return OPUS_BAD_ARG;
+         compressed[0] = c0;
+      }
+      compressed++;
+      nbCompressedBytes--;
+   }
+#else
+   celt_assert(st->signalling==0);
+#endif
+
+   /* Can't produce more than 1275 output bytes */
+   nbCompressedBytes = IMIN(nbCompressedBytes,1275);
+   nbAvailableBytes = nbCompressedBytes - nbFilledBytes;
+
+   if (st->vbr && st->bitrate!=OPUS_BITRATE_MAX)
+   {
+      opus_int32 den=mode->Fs>>BITRES;
+      vbr_rate=(st->bitrate*frame_size+(den>>1))/den;
+#ifdef CUSTOM_MODES
+      if (st->signalling)
+         vbr_rate -= 8<<BITRES;
+#endif
+      effectiveBytes = vbr_rate>>(3+BITRES);
+   } else {
+      opus_int32 tmp;
+      vbr_rate = 0;
+      tmp = st->bitrate*frame_size;
+      if (tell>1)
+         tmp += tell;
+      if (st->bitrate!=OPUS_BITRATE_MAX)
+         nbCompressedBytes = IMAX(2, IMIN(nbCompressedBytes,
+               (tmp+4*mode->Fs)/(8*mode->Fs)-!!st->signalling));
+      effectiveBytes = nbCompressedBytes;
+   }
+
+   if (enc==NULL)
+   {
+      ec_enc_init(&_enc, compressed, nbCompressedBytes);
+      enc = &_enc;
+   }
+
+   if (vbr_rate>0)
+   {
+      /* Computes the max bit-rate allowed in VBR mode to avoid violating the
+          target rate and buffering.
+         We must do this up front so that bust-prevention logic triggers
+          correctly if we don't have enough bits. */
+      if (st->constrained_vbr)
+      {
+         opus_int32 vbr_bound;
+         opus_int32 max_allowed;
+         /* We could use any multiple of vbr_rate as bound (depending on the
+             delay).
+            This is clamped to ensure we use at least two bytes if the encoder
+             was entirely empty, but to allow 0 in hybrid mode. */
+         vbr_bound = vbr_rate;
+         max_allowed = IMIN(IMAX(tell==1?2:0,
+               (vbr_rate+vbr_bound-st->vbr_reservoir)>>(BITRES+3)),
+               nbAvailableBytes);
+         if(max_allowed < nbAvailableBytes)
+         {
+            nbCompressedBytes = nbFilledBytes+max_allowed;
+            nbAvailableBytes = max_allowed;
+            ec_enc_shrink(enc, nbCompressedBytes);
+         }
+      }
+   }
+   total_bits = nbCompressedBytes*8;
+
+   effEnd = st->end;
+   if (effEnd > mode->effEBands)
+      effEnd = mode->effEBands;
+
+   ALLOC(in, CC*(N+st->overlap), celt_sig);
+
+   sample_max=MAX32(st->overlap_max, celt_maxabs16(pcm, C*(N-overlap)/st->upsample));
+   st->overlap_max=celt_maxabs16(pcm+C*(N-overlap)/st->upsample, C*overlap/st->upsample);
+   sample_max=MAX32(sample_max, st->overlap_max);
+#ifdef FIXED_POINT
+   silence = (sample_max==0);
+#else
+   silence = (sample_max <= (opus_val16)1/(1<<st->lsb_depth));
+#endif
+#ifdef FUZZING
+   if ((rand()&0x3F)==0)
+      silence = 1;
+#endif
+   if (tell==1)
+      ec_enc_bit_logp(enc, silence, 15);
+   else
+      silence=0;
+   if (silence)
+   {
+      /*In VBR mode there is no need to send more than the minimum. */
+      if (vbr_rate>0)
+      {
+         effectiveBytes=nbCompressedBytes=IMIN(nbCompressedBytes, nbFilledBytes+2);
+         total_bits=nbCompressedBytes*8;
+         nbAvailableBytes=2;
+         ec_enc_shrink(enc, nbCompressedBytes);
+      }
+      /* Pretend we've filled all the remaining bits with zeros
+            (that's what the initialiser did anyway) */
+      tell = nbCompressedBytes*8;
+      enc->nbits_total+=tell-ec_tell(enc);
+   }
+   c=0; do {
+      preemphasis(pcm+c, in+c*(N+st->overlap)+st->overlap, N, CC, st->upsample,
+                  mode->preemph, st->preemph_memE+c, st->clip);
+   } while (++c<CC);
+
+
+
+   /* Find pitch period and gain */
+   {
+      int enabled;
+      int qg;
+      enabled = (st->lfe || nbAvailableBytes>12*C) && st->start==0 && !silence && !st->disable_pf
+            && st->complexity >= 5 && !(st->consec_transient && LM!=3 && st->variable_duration==OPUS_FRAMESIZE_VARIABLE);
+
+      prefilter_tapset = st->tapset_decision;
+      pf_on = run_prefilter(st, in, prefilter_mem, CC, N, prefilter_tapset, &pitch_index, &gain1, &qg, enabled, nbAvailableBytes);
+      if ((gain1 > QCONST16(.4f,15) || st->prefilter_gain > QCONST16(.4f,15)) && (!st->analysis.valid || st->analysis.tonality > .3)
+            && (pitch_index > 1.26*st->prefilter_period || pitch_index < .79*st->prefilter_period))
+         pitch_change = 1;
+      if (pf_on==0)
+      {
+         if(st->start==0 && tell+16<=total_bits)
+            ec_enc_bit_logp(enc, 0, 1);
+      } else {
+         /*This block is not gated by a total bits check only because
+           of the nbAvailableBytes check above.*/
+         int octave;
+         ec_enc_bit_logp(enc, 1, 1);
+         pitch_index += 1;
+         octave = EC_ILOG(pitch_index)-5;
+         ec_enc_uint(enc, octave, 6);
+         ec_enc_bits(enc, pitch_index-(16<<octave), 4+octave);
+         pitch_index -= 1;
+         ec_enc_bits(enc, qg, 3);
+         ec_enc_icdf(enc, prefilter_tapset, tapset_icdf, 2);
+      }
+   }
+
+   isTransient = 0;
+   shortBlocks = 0;
+   if (st->complexity >= 1 && !st->lfe)
+   {
+      isTransient = transient_analysis(in, N+st->overlap, CC,
+            &tf_estimate, &tf_chan);
+   }
+   if (LM>0 && ec_tell(enc)+3<=total_bits)
+   {
+      if (isTransient)
+         shortBlocks = M;
+   } else {
+      isTransient = 0;
+      transient_got_disabled=1;
+   }
+
+   ALLOC(freq, CC*N, celt_sig); /**< Interleaved signal MDCTs */
+   ALLOC(bandE,nbEBands*CC, celt_ener);
+   ALLOC(bandLogE,nbEBands*CC, opus_val16);
+
+   secondMdct = shortBlocks && st->complexity>=8;
+   ALLOC(bandLogE2, C*nbEBands, opus_val16);
+   if (secondMdct)
+   {
+      compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample);
+      compute_band_energies(mode, freq, bandE, effEnd, C, M);
+      amp2Log2(mode, effEnd, st->end, bandE, bandLogE2, C);
+      for (i=0;i<C*nbEBands;i++)
+         bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+   }
+
+   compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample);
+   if (CC==2&&C==1)
+      tf_chan = 0;
+   compute_band_energies(mode, freq, bandE, effEnd, C, M);
+
+   if (st->lfe)
+   {
+      for (i=2;i<st->end;i++)
+      {
+         bandE[i] = IMIN(bandE[i], MULT16_32_Q15(QCONST16(1e-4f,15),bandE[0]));
+         bandE[i] = MAX32(bandE[i], EPSILON);
+      }
+   }
+   amp2Log2(mode, effEnd, st->end, bandE, bandLogE, C);
+
+   ALLOC(surround_dynalloc, C*nbEBands, opus_val16);
+   for(i=0;i<st->end;i++)
+      surround_dynalloc[i] = 0;
+   /* This computes how much masking takes place between surround channels */
+   if (st->start==0&&st->energy_mask&&!st->lfe)
+   {
+      int mask_end;
+      int midband;
+      int count_dynalloc;
+      opus_val32 mask_avg=0;
+      opus_val32 diff=0;
+      int count=0;
+      mask_end = IMAX(2,st->lastCodedBands);
+      for (c=0;c<C;c++)
+      {
+         for(i=0;i<mask_end;i++)
+         {
+            opus_val16 mask;
+            mask = MAX16(MIN16(st->energy_mask[nbEBands*c+i],
+                   QCONST16(.25f, DB_SHIFT)), -QCONST16(2.0f, DB_SHIFT));
+            if (mask > 0)
+               mask = HALF16(mask);
+            mask_avg += MULT16_16(mask, eBands[i+1]-eBands[i]);
+            count += eBands[i+1]-eBands[i];
+            diff += MULT16_16(mask, 1+2*i-mask_end);
+         }
+      }
+      mask_avg = DIV32_16(mask_avg,count);
+      mask_avg += QCONST16(.2f, DB_SHIFT);
+      diff = diff*6/(C*(mask_end-1)*(mask_end+1)*mask_end);
+      /* Again, being conservative */
+      diff = HALF32(diff);
+      diff = MAX32(MIN32(diff, QCONST32(.031f, DB_SHIFT)), -QCONST32(.031f, DB_SHIFT));
+      /* Find the band that's in the middle of the coded spectrum */
+      for (midband=0;eBands[midband+1] < eBands[mask_end]/2;midband++);
+      count_dynalloc=0;
+      for(i=0;i<mask_end;i++)
+      {
+         opus_val32 lin;
+         opus_val16 unmask;
+         lin = mask_avg + diff*(i-midband);
+         if (C==2)
+            unmask = MAX16(st->energy_mask[i], st->energy_mask[nbEBands+i]);
+         else
+            unmask = st->energy_mask[i];
+         unmask = MIN16(unmask, QCONST16(.0f, DB_SHIFT));
+         unmask -= lin;
+         if (unmask > QCONST16(.25f, DB_SHIFT))
+         {
+            surround_dynalloc[i] = unmask - QCONST16(.25f, DB_SHIFT);
+            count_dynalloc++;
+         }
+      }
+      if (count_dynalloc>=3)
+      {
+         /* If we need dynalloc in many bands, it's probably because our
+            initial masking rate was too low. */
+         mask_avg += QCONST16(.25f, DB_SHIFT);
+         if (mask_avg>0)
+         {
+            /* Something went really wrong in the original calculations,
+               disabling masking. */
+            mask_avg = 0;
+            diff = 0;
+            for(i=0;i<mask_end;i++)
+               surround_dynalloc[i] = 0;
+         } else {
+            for(i=0;i<mask_end;i++)
+               surround_dynalloc[i] = MAX16(0, surround_dynalloc[i]-QCONST16(.25f, DB_SHIFT));
+         }
+      }
+      mask_avg += QCONST16(.2f, DB_SHIFT);
+      /* Convert to 1/64th units used for the trim */
+      surround_trim = 64*diff;
+      /*printf("%d %d ", mask_avg, surround_trim);*/
+      surround_masking = mask_avg;
+   }
+   /* Temporal VBR (but not for LFE) */
+   if (!st->lfe)
+   {
+      opus_val16 follow=-QCONST16(10.0f,DB_SHIFT);
+      float frame_avg=0;
+      opus_val16 offset = shortBlocks?HALF16(SHL16(LM, DB_SHIFT)):0;
+      for(i=st->start;i<st->end;i++)
+      {
+         follow = MAX16(follow-QCONST16(1.f, DB_SHIFT), bandLogE[i]-offset);
+         if (C==2)
+            follow = MAX16(follow, bandLogE[i+nbEBands]-offset);
+         frame_avg += follow;
+      }
+      frame_avg /= (st->end-st->start);
+      temporal_vbr = SUB16(frame_avg,st->spec_avg);
+      temporal_vbr = MIN16(QCONST16(3.f, DB_SHIFT), MAX16(-QCONST16(1.5f, DB_SHIFT), temporal_vbr));
+      st->spec_avg += MULT16_16_Q15(QCONST16(.02f, 15), temporal_vbr);
+   }
+   /*for (i=0;i<21;i++)
+      printf("%f ", bandLogE[i]);
+   printf("\n");*/
+
+   if (!secondMdct)
+   {
+      for (i=0;i<C*nbEBands;i++)
+         bandLogE2[i] = bandLogE[i];
+   }
+
+   /* Last chance to catch any transient we might have missed in the
+      time-domain analysis */
+   if (LM>0 && ec_tell(enc)+3<=total_bits && !isTransient && st->complexity>=5 && !st->lfe)
+   {
+      if (patch_transient_decision(bandLogE, oldBandE, nbEBands, st->end, C))
+      {
+         isTransient = 1;
+         shortBlocks = M;
+         compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample);
+         compute_band_energies(mode, freq, bandE, effEnd, C, M);
+         amp2Log2(mode, effEnd, st->end, bandE, bandLogE, C);
+         /* Compensate for the scaling of short vs long mdcts */
+         for (i=0;i<C*nbEBands;i++)
+            bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+         tf_estimate = QCONST16(.2f,14);
+      }
+   }
+
+   if (LM>0 && ec_tell(enc)+3<=total_bits)
+      ec_enc_bit_logp(enc, isTransient, 3);
+
+   ALLOC(X, C*N, celt_norm);         /**< Interleaved normalised MDCTs */
+
+   /* Band normalisation */
+   normalise_bands(mode, freq, X, bandE, effEnd, C, M);
+
+   ALLOC(tf_res, nbEBands, int);
+   /* Disable variable tf resolution for hybrid and at very low bitrate */
+   if (effectiveBytes>=15*C && st->start==0 && st->complexity>=2 && !st->lfe)
+   {
+      int lambda;
+      if (effectiveBytes<40)
+         lambda = 12;
+      else if (effectiveBytes<60)
+         lambda = 6;
+      else if (effectiveBytes<100)
+         lambda = 4;
+      else
+         lambda = 3;
+      lambda*=2;
+      tf_select = tf_analysis(mode, effEnd, isTransient, tf_res, lambda, X, N, LM, &tf_sum, tf_estimate, tf_chan);
+      for (i=effEnd;i<st->end;i++)
+         tf_res[i] = tf_res[effEnd-1];
+   } else {
+      tf_sum = 0;
+      for (i=0;i<st->end;i++)
+         tf_res[i] = isTransient;
+      tf_select=0;
+   }
+
+   ALLOC(error, C*nbEBands, opus_val16);
+   quant_coarse_energy(mode, st->start, st->end, effEnd, bandLogE,
+         oldBandE, total_bits, error, enc,
+         C, LM, nbAvailableBytes, st->force_intra,
+         &st->delayedIntra, st->complexity >= 4, st->loss_rate, st->lfe);
+
+   tf_encode(st->start, st->end, isTransient, tf_res, LM, tf_select, enc);
+
+   if (ec_tell(enc)+4<=total_bits)
+   {
+      if (st->lfe)
+      {
+         st->tapset_decision = 0;
+         st->spread_decision = SPREAD_NORMAL;
+      } else if (shortBlocks || st->complexity < 3 || nbAvailableBytes < 10*C || st->start != 0)
+      {
+         if (st->complexity == 0)
+            st->spread_decision = SPREAD_NONE;
+         else
+            st->spread_decision = SPREAD_NORMAL;
+      } else {
+         /* Disable new spreading+tapset estimator until we can show it works
+            better than the old one. So far it seems like spreading_decision()
+            works best. */
+         if (0&&st->analysis.valid)
+         {
+            static const opus_val16 spread_thresholds[3] = {-QCONST16(.6f, 15), -QCONST16(.2f, 15), -QCONST16(.07f, 15)};
+            static const opus_val16 spread_histeresis[3] = {QCONST16(.15f, 15), QCONST16(.07f, 15), QCONST16(.02f, 15)};
+            static const opus_val16 tapset_thresholds[2] = {QCONST16(.0f, 15), QCONST16(.15f, 15)};
+            static const opus_val16 tapset_histeresis[2] = {QCONST16(.1f, 15), QCONST16(.05f, 15)};
+            st->spread_decision = hysteresis_decision(-st->analysis.tonality, spread_thresholds, spread_histeresis, 3, st->spread_decision);
+            st->tapset_decision = hysteresis_decision(st->analysis.tonality_slope, tapset_thresholds, tapset_histeresis, 2, st->tapset_decision);
+         } else {
+            st->spread_decision = spreading_decision(mode, X,
+                  &st->tonal_average, st->spread_decision, &st->hf_average,
+                  &st->tapset_decision, pf_on&&!shortBlocks, effEnd, C, M);
+         }
+         /*printf("%d %d\n", st->tapset_decision, st->spread_decision);*/
+         /*printf("%f %d %f %d\n\n", st->analysis.tonality, st->spread_decision, st->analysis.tonality_slope, st->tapset_decision);*/
+      }
+      ec_enc_icdf(enc, st->spread_decision, spread_icdf, 5);
+   }
+
+   ALLOC(offsets, nbEBands, int);
+
+   maxDepth = dynalloc_analysis(bandLogE, bandLogE2, nbEBands, st->start, st->end, C, offsets,
+         st->lsb_depth, mode->logN, isTransient, st->vbr, st->constrained_vbr,
+         eBands, LM, effectiveBytes, &tot_boost, st->lfe, surround_dynalloc);
+   /* For LFE, everything interesting is in the first band */
+   if (st->lfe)
+      offsets[0] = IMIN(8, effectiveBytes/3);
+   ALLOC(cap, nbEBands, int);
+   init_caps(mode,cap,LM,C);
+
+   dynalloc_logp = 6;
+   total_bits<<=BITRES;
+   total_boost = 0;
+   tell = ec_tell_frac(enc);
+   for (i=st->start;i<st->end;i++)
+   {
+      int width, quanta;
+      int dynalloc_loop_logp;
+      int boost;
+      int j;
+      width = C*(eBands[i+1]-eBands[i])<<LM;
+      /* quanta is 6 bits, but no more than 1 bit/sample
+         and no less than 1/8 bit/sample */
+      quanta = IMIN(width<<BITRES, IMAX(6<<BITRES, width));
+      dynalloc_loop_logp = dynalloc_logp;
+      boost = 0;
+      for (j = 0; tell+(dynalloc_loop_logp<<BITRES) < total_bits-total_boost
+            && boost < cap[i]; j++)
+      {
+         int flag;
+         flag = j<offsets[i];
+         ec_enc_bit_logp(enc, flag, dynalloc_loop_logp);
+         tell = ec_tell_frac(enc);
+         if (!flag)
+            break;
+         boost += quanta;
+         total_boost += quanta;
+         dynalloc_loop_logp = 1;
+      }
+      /* Making dynalloc more likely */
+      if (j)
+         dynalloc_logp = IMAX(2, dynalloc_logp-1);
+      offsets[i] = boost;
+   }
+
+   if (C==2)
+   {
+      int effectiveRate;
+
+      static const opus_val16 intensity_thresholds[21]=
+      /* 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19  20  off*/
+        { 16,21,23,25,27,29,31,33,35,38,42,46,50,54,58,63,68,75,84,102,130};
+      static const opus_val16 intensity_histeresis[21]=
+        {  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 5, 6,  8, 12};
+
+      /* Always use MS for 2.5 ms frames until we can do a better analysis */
+      if (LM!=0)
+         dual_stereo = stereo_analysis(mode, X, LM, N);
+
+      /* Account for coarse energy */
+      effectiveRate = (8*effectiveBytes - 80)>>LM;
+
+      /* effectiveRate in kb/s */
+      effectiveRate = 2*effectiveRate/5;
+
+      st->intensity = hysteresis_decision((opus_val16)effectiveRate, intensity_thresholds, intensity_histeresis, 21, st->intensity);
+      st->intensity = IMIN(st->end,IMAX(st->start, st->intensity));
+   }
+
+   alloc_trim = 5;
+   if (tell+(6<<BITRES) <= total_bits - total_boost)
+   {
+      if (st->lfe)
+         alloc_trim = 5;
+      else
+         alloc_trim = alloc_trim_analysis(mode, X, bandLogE,
+            st->end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate, st->intensity, surround_trim);
+      ec_enc_icdf(enc, alloc_trim, trim_icdf, 7);
+      tell = ec_tell_frac(enc);
+   }
+
+   /* Variable bitrate */
+   if (vbr_rate>0)
+   {
+     opus_val16 alpha;
+     opus_int32 delta;
+     /* The target rate in 8th bits per frame */
+     opus_int32 target, base_target;
+     opus_int32 min_allowed;
+     int lm_diff = mode->maxLM - LM;
+
+     /* Don't attempt to use more than 510 kb/s, even for frames smaller than 20 ms.
+        The CELT allocator will just not be able to use more than that anyway. */
+     nbCompressedBytes = IMIN(nbCompressedBytes,1275>>(3-LM));
+     base_target = vbr_rate - ((40*C+20)<<BITRES);
+
+     if (st->constrained_vbr)
+        base_target += (st->vbr_offset>>lm_diff);
+
+     target = compute_vbr(mode, &st->analysis, base_target, LM, st->bitrate,
+           st->lastCodedBands, C, st->intensity, st->constrained_vbr,
+           st->stereo_saving, tot_boost, tf_estimate, pitch_change, maxDepth,
+           st->variable_duration, st->lfe, st->energy_mask!=NULL, surround_masking,
+           temporal_vbr);
+
+     /* The current offset is removed from the target and the space used
+        so far is added*/
+     target=target+tell;
+     /* In VBR mode the frame size must not be reduced so much that it would
+         result in the encoder running out of bits.
+        The margin of 2 bytes ensures that none of the bust-prevention logic
+         in the decoder will have triggered so far. */
+     min_allowed = ((tell+total_boost+(1<<(BITRES+3))-1)>>(BITRES+3)) + 2 - nbFilledBytes;
+
+     nbAvailableBytes = (target+(1<<(BITRES+2)))>>(BITRES+3);
+     nbAvailableBytes = IMAX(min_allowed,nbAvailableBytes);
+     nbAvailableBytes = IMIN(nbCompressedBytes,nbAvailableBytes+nbFilledBytes) - nbFilledBytes;
+
+     /* By how much did we "miss" the target on that frame */
+     delta = target - vbr_rate;
+
+     target=nbAvailableBytes<<(BITRES+3);
+
+     /*If the frame is silent we don't adjust our drift, otherwise
+       the encoder will shoot to very high rates after hitting a
+       span of silence, but we do allow the bitres to refill.
+       This means that we'll undershoot our target in CVBR/VBR modes
+       on files with lots of silence. */
+     if(silence)
+     {
+       nbAvailableBytes = 2;
+       target = 2*8<<BITRES;
+       delta = 0;
+     }
+
+     if (st->vbr_count < 970)
+     {
+        st->vbr_count++;
+        alpha = celt_rcp(SHL32(EXTEND32(st->vbr_count+20),16));
+     } else
+        alpha = QCONST16(.001f,15);
+     /* How many bits have we used in excess of what we're allowed */
+     if (st->constrained_vbr)
+        st->vbr_reservoir += target - vbr_rate;
+     /*printf ("%d\n", st->vbr_reservoir);*/
+
+     /* Compute the offset we need to apply in order to reach the target */
+     if (st->constrained_vbr)
+     {
+        st->vbr_drift += (opus_int32)MULT16_32_Q15(alpha,(delta*(1<<lm_diff))-st->vbr_offset-st->vbr_drift);
+        st->vbr_offset = -st->vbr_drift;
+     }
+     /*printf ("%d\n", st->vbr_drift);*/
+
+     if (st->constrained_vbr && st->vbr_reservoir < 0)
+     {
+        /* We're under the min value -- increase rate */
+        int adjust = (-st->vbr_reservoir)/(8<<BITRES);
+        /* Unless we're just coding silence */
+        nbAvailableBytes += silence?0:adjust;
+        st->vbr_reservoir = 0;
+        /*printf ("+%d\n", adjust);*/
+     }
+     nbCompressedBytes = IMIN(nbCompressedBytes,nbAvailableBytes+nbFilledBytes);
+     /*printf("%d\n", nbCompressedBytes*50*8);*/
+     /* This moves the raw bits to take into account the new compressed size */
+     ec_enc_shrink(enc, nbCompressedBytes);
+   }
+
+   /* Bit allocation */
+   ALLOC(fine_quant, nbEBands, int);
+   ALLOC(pulses, nbEBands, int);
+   ALLOC(fine_priority, nbEBands, int);
+
+   /* bits =           packet size                    - where we are - safety*/
+   bits = (((opus_int32)nbCompressedBytes*8)<<BITRES) - ec_tell_frac(enc) - 1;
+   anti_collapse_rsv = isTransient&&LM>=2&&bits>=((LM+2)<<BITRES) ? (1<<BITRES) : 0;
+   bits -= anti_collapse_rsv;
+   signalBandwidth = st->end-1;
+#ifndef DISABLE_FLOAT_API
+   if (st->analysis.valid)
+   {
+      int min_bandwidth;
+      if (st->bitrate < (opus_int32)32000*C)
+         min_bandwidth = 13;
+      else if (st->bitrate < (opus_int32)48000*C)
+         min_bandwidth = 16;
+      else if (st->bitrate < (opus_int32)60000*C)
+         min_bandwidth = 18;
+      else  if (st->bitrate < (opus_int32)80000*C)
+         min_bandwidth = 19;
+      else
+         min_bandwidth = 20;
+      signalBandwidth = IMAX(st->analysis.bandwidth, min_bandwidth);
+   }
+#endif
+   if (st->lfe)
+      signalBandwidth = 1;
+   codedBands = compute_allocation(mode, st->start, st->end, offsets, cap,
+         alloc_trim, &st->intensity, &dual_stereo, bits, &balance, pulses,
+         fine_quant, fine_priority, C, LM, enc, 1, st->lastCodedBands, signalBandwidth);
+   if (st->lastCodedBands)
+      st->lastCodedBands = IMIN(st->lastCodedBands+1,IMAX(st->lastCodedBands-1,codedBands));
+   else
+      st->lastCodedBands = codedBands;
+
+   quant_fine_energy(mode, st->start, st->end, oldBandE, error, fine_quant, enc, C);
+
+   /* Residual quantisation */
+   ALLOC(collapse_masks, C*nbEBands, unsigned char);
+   quant_all_bands(1, mode, st->start, st->end, X, C==2 ? X+N : NULL, collapse_masks,
+         bandE, pulses, shortBlocks, st->spread_decision, dual_stereo, st->intensity, tf_res,
+         nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv, balance, enc, LM, codedBands, &st->rng);
+
+   if (anti_collapse_rsv > 0)
+   {
+      anti_collapse_on = st->consec_transient<2;
+#ifdef FUZZING
+      anti_collapse_on = rand()&0x1;
+#endif
+      ec_enc_bits(enc, anti_collapse_on, 1);
+   }
+   quant_energy_finalise(mode, st->start, st->end, oldBandE, error, fine_quant, fine_priority, nbCompressedBytes*8-ec_tell(enc), enc, C);
+
+   if (silence)
+   {
+      for (i=0;i<C*nbEBands;i++)
+         oldBandE[i] = -QCONST16(28.f,DB_SHIFT);
+   }
+
+#ifdef RESYNTH
+   /* Re-synthesis of the coded audio if required */
+   {
+      celt_sig *out_mem[2];
+
+      if (anti_collapse_on)
+      {
+         anti_collapse(mode, X, collapse_masks, LM, C, N,
+               st->start, st->end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
+      }
+
+      if (silence)
+      {
+         for (i=0;i<C*N;i++)
+            freq[i] = 0;
+      } else {
+         /* Synthesis */
+         denormalise_bands(mode, X, freq, oldBandE, st->start, effEnd, C, M);
+      }
+
+      c=0; do {
+         OPUS_MOVE(st->syn_mem[c], st->syn_mem[c]+N, 2*MAX_PERIOD-N+overlap/2);
+      } while (++c<CC);
+
+      if (CC==2&&C==1)
+      {
+         for (i=0;i<N;i++)
+            freq[N+i] = freq[i];
+      }
+
+      c=0; do {
+         out_mem[c] = st->syn_mem[c]+2*MAX_PERIOD-N;
+      } while (++c<CC);
+
+      compute_inv_mdcts(mode, shortBlocks, freq, out_mem, CC, LM);
+
+      c=0; do {
+         st->prefilter_period=IMAX(st->prefilter_period, COMBFILTER_MINPERIOD);
+         st->prefilter_period_old=IMAX(st->prefilter_period_old, COMBFILTER_MINPERIOD);
+         comb_filter(out_mem[c], out_mem[c], st->prefilter_period_old, st->prefilter_period, mode->shortMdctSize,
+               st->prefilter_gain_old, st->prefilter_gain, st->prefilter_tapset_old, st->prefilter_tapset,
+               mode->window, st->overlap);
+         if (LM!=0)
+            comb_filter(out_mem[c]+mode->shortMdctSize, out_mem[c]+mode->shortMdctSize, st->prefilter_period, pitch_index, N-mode->shortMdctSize,
+                  st->prefilter_gain, gain1, st->prefilter_tapset, prefilter_tapset,
+                  mode->window, overlap);
+      } while (++c<CC);
+
+      /* We reuse freq[] as scratch space for the de-emphasis */
+      deemphasis(out_mem, (opus_val16*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD, freq);
+      st->prefilter_period_old = st->prefilter_period;
+      st->prefilter_gain_old = st->prefilter_gain;
+      st->prefilter_tapset_old = st->prefilter_tapset;
+   }
+#endif
+
+   st->prefilter_period = pitch_index;
+   st->prefilter_gain = gain1;
+   st->prefilter_tapset = prefilter_tapset;
+#ifdef RESYNTH
+   if (LM!=0)
+   {
+      st->prefilter_period_old = st->prefilter_period;
+      st->prefilter_gain_old = st->prefilter_gain;
+      st->prefilter_tapset_old = st->prefilter_tapset;
+   }
+#endif
+
+   if (CC==2&&C==1) {
+      for (i=0;i<nbEBands;i++)
+         oldBandE[nbEBands+i]=oldBandE[i];
+   }
+
+   if (!isTransient)
+   {
+      for (i=0;i<CC*nbEBands;i++)
+         oldLogE2[i] = oldLogE[i];
+      for (i=0;i<CC*nbEBands;i++)
+         oldLogE[i] = oldBandE[i];
+   } else {
+      for (i=0;i<CC*nbEBands;i++)
+         oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]);
+   }
+   /* In case start or end were to change */
+   c=0; do
+   {
+      for (i=0;i<st->start;i++)
+      {
+         oldBandE[c*nbEBands+i]=0;
+         oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
+      }
+      for (i=st->end;i<nbEBands;i++)
+      {
+         oldBandE[c*nbEBands+i]=0;
+         oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-QCONST16(28.f,DB_SHIFT);
+      }
+   } while (++c<CC);
+
+   if (isTransient || transient_got_disabled)
+      st->consec_transient++;
+   else
+      st->consec_transient=0;
+   st->rng = enc->rng;
+
+   /* If there's any room left (can only happen for very high rates),
+      it's already filled with zeros */
+   ec_enc_done(enc);
+
+#ifdef CUSTOM_MODES
+   if (st->signalling)
+      nbCompressedBytes++;
+#endif
+
+   RESTORE_STACK;
+   if (ec_get_error(enc))
+      return OPUS_INTERNAL_ERROR;
+   else
+      return nbCompressedBytes;
+}
+
+
+#ifdef CUSTOM_MODES
+
+#ifdef FIXED_POINT
+int opus_custom_encode(CELTEncoder * OPUS_RESTRICT st, const opus_int16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
+{
+   return celt_encode_with_ec(st, pcm, frame_size, compressed, nbCompressedBytes, NULL);
+}
+
+#ifndef DISABLE_FLOAT_API
+int opus_custom_encode_float(CELTEncoder * OPUS_RESTRICT st, const float * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
+{
+   int j, ret, C, N;
+   VARDECL(opus_int16, in);
+   ALLOC_STACK;
+
+   if (pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   C = st->channels;
+   N = frame_size;
+   ALLOC(in, C*N, opus_int16);
+
+   for (j=0;j<C*N;j++)
+     in[j] = FLOAT2INT16(pcm[j]);
+
+   ret=celt_encode_with_ec(st,in,frame_size,compressed,nbCompressedBytes, NULL);
+#ifdef RESYNTH
+   for (j=0;j<C*N;j++)
+      ((float*)pcm)[j]=in[j]*(1.f/32768.f);
+#endif
+   RESTORE_STACK;
+   return ret;
+}
+#endif /* DISABLE_FLOAT_API */
+#else
+
+int opus_custom_encode(CELTEncoder * OPUS_RESTRICT st, const opus_int16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
+{
+   int j, ret, C, N;
+   VARDECL(celt_sig, in);
+   ALLOC_STACK;
+
+   if (pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   C=st->channels;
+   N=frame_size;
+   ALLOC(in, C*N, celt_sig);
+   for (j=0;j<C*N;j++) {
+     in[j] = SCALEOUT(pcm[j]);
+   }
+
+   ret = celt_encode_with_ec(st,in,frame_size,compressed,nbCompressedBytes, NULL);
+#ifdef RESYNTH
+   for (j=0;j<C*N;j++)
+      ((opus_int16*)pcm)[j] = FLOAT2INT16(in[j]);
+#endif
+   RESTORE_STACK;
+   return ret;
+}
+
+int opus_custom_encode_float(CELTEncoder * OPUS_RESTRICT st, const float * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
+{
+   return celt_encode_with_ec(st, pcm, frame_size, compressed, nbCompressedBytes, NULL);
+}
+
+#endif
+
+#endif /* CUSTOM_MODES */
+
+int opus_custom_encoder_ctl(CELTEncoder * OPUS_RESTRICT st, int request, ...)
+{
+   va_list ap;
+
+   va_start(ap, request);
+   switch (request)
+   {
+      case OPUS_SET_COMPLEXITY_REQUEST:
+      {
+         int value = va_arg(ap, opus_int32);
+         if (value<0 || value>10)
+            goto bad_arg;
+         st->complexity = value;
+      }
+      break;
+      case CELT_SET_START_BAND_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<0 || value>=st->mode->nbEBands)
+            goto bad_arg;
+         st->start = value;
+      }
+      break;
+      case CELT_SET_END_BAND_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<1 || value>st->mode->nbEBands)
+            goto bad_arg;
+         st->end = value;
+      }
+      break;
+      case CELT_SET_PREDICTION_REQUEST:
+      {
+         int value = va_arg(ap, opus_int32);
+         if (value<0 || value>2)
+            goto bad_arg;
+         st->disable_pf = value<=1;
+         st->force_intra = value==0;
+      }
+      break;
+      case OPUS_SET_PACKET_LOSS_PERC_REQUEST:
+      {
+         int value = va_arg(ap, opus_int32);
+         if (value<0 || value>100)
+            goto bad_arg;
+         st->loss_rate = value;
+      }
+      break;
+      case OPUS_SET_VBR_CONSTRAINT_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->constrained_vbr = value;
+      }
+      break;
+      case OPUS_SET_VBR_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->vbr = value;
+      }
+      break;
+      case OPUS_SET_BITRATE_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<=500 && value!=OPUS_BITRATE_MAX)
+            goto bad_arg;
+         value = IMIN(value, 260000*st->channels);
+         st->bitrate = value;
+      }
+      break;
+      case CELT_SET_CHANNELS_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<1 || value>2)
+            goto bad_arg;
+         st->stream_channels = value;
+      }
+      break;
+      case OPUS_SET_LSB_DEPTH_REQUEST:
+      {
+          opus_int32 value = va_arg(ap, opus_int32);
+          if (value<8 || value>24)
+             goto bad_arg;
+          st->lsb_depth=value;
+      }
+      break;
+      case OPUS_GET_LSB_DEPTH_REQUEST:
+      {
+          opus_int32 *value = va_arg(ap, opus_int32*);
+          *value=st->lsb_depth;
+      }
+      break;
+      case OPUS_SET_EXPERT_FRAME_DURATION_REQUEST:
+      {
+          opus_int32 value = va_arg(ap, opus_int32);
+          st->variable_duration = value;
+      }
+      break;
+      case OPUS_RESET_STATE:
+      {
+         int i;
+         opus_val16 *oldBandE, *oldLogE, *oldLogE2;
+         oldBandE = (opus_val16*)(st->in_mem+st->channels*(st->overlap+COMBFILTER_MAXPERIOD));
+         oldLogE = oldBandE + st->channels*st->mode->nbEBands;
+         oldLogE2 = oldLogE + st->channels*st->mode->nbEBands;
+         OPUS_CLEAR((char*)&st->ENCODER_RESET_START,
+               opus_custom_encoder_get_size(st->mode, st->channels)-
+               ((char*)&st->ENCODER_RESET_START - (char*)st));
+         for (i=0;i<st->channels*st->mode->nbEBands;i++)
+            oldLogE[i]=oldLogE2[i]=-QCONST16(28.f,DB_SHIFT);
+         st->vbr_offset = 0;
+         st->delayedIntra = 1;
+         st->spread_decision = SPREAD_NORMAL;
+         st->tonal_average = 256;
+         st->hf_average = 0;
+         st->tapset_decision = 0;
+      }
+      break;
+#ifdef CUSTOM_MODES
+      case CELT_SET_INPUT_CLIPPING_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->clip = value;
+      }
+      break;
+#endif
+      case CELT_SET_SIGNALLING_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->signalling = value;
+      }
+      break;
+      case CELT_SET_ANALYSIS_REQUEST:
+      {
+         AnalysisInfo *info = va_arg(ap, AnalysisInfo *);
+         if (info)
+            OPUS_COPY(&st->analysis, info, 1);
+      }
+      break;
+      case CELT_GET_MODE_REQUEST:
+      {
+         const CELTMode ** value = va_arg(ap, const CELTMode**);
+         if (value==0)
+            goto bad_arg;
+         *value=st->mode;
+      }
+      break;
+      case OPUS_GET_FINAL_RANGE_REQUEST:
+      {
+         opus_uint32 * value = va_arg(ap, opus_uint32 *);
+         if (value==0)
+            goto bad_arg;
+         *value=st->rng;
+      }
+      break;
+      case OPUS_SET_LFE_REQUEST:
+      {
+          opus_int32 value = va_arg(ap, opus_int32);
+          st->lfe = value;
+      }
+      break;
+      case OPUS_SET_ENERGY_MASK_REQUEST:
+      {
+          opus_val16 *value = va_arg(ap, opus_val16*);
+          st->energy_mask = value;
+      }
+      break;
+      default:
+         goto bad_request;
+   }
+   va_end(ap);
+   return OPUS_OK;
+bad_arg:
+   va_end(ap);
+   return OPUS_BAD_ARG;
+bad_request:
+   va_end(ap);
+   return OPUS_UNIMPLEMENTED;
+}
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index d2addbf..7ffe90a 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -32,6 +32,7 @@
 #include "celt_lpc.h"
 #include "stack_alloc.h"
 #include "mathops.h"
+#include "pitch.h"
 
 void _celt_lpc(
       opus_val16       *_lpc, /* out: [0...p-1] LPC coefficients      */
@@ -87,42 +88,71 @@ int          p
 #endif
 }
 
-void celt_fir(const opus_val16 *x,
+void celt_fir(const opus_val16 *_x,
          const opus_val16 *num,
-         opus_val16 *y,
+         opus_val16 *_y,
          int N,
          int ord,
          opus_val16 *mem)
 {
    int i,j;
+   VARDECL(opus_val16, rnum);
+   VARDECL(opus_val16, x);
+   SAVE_STACK;
 
+   ALLOC(rnum, ord, opus_val16);
+   ALLOC(x, N+ord, opus_val16);
+   for(i=0;i<ord;i++)
+      rnum[i] = num[ord-i-1];
+   for(i=0;i<ord;i++)
+      x[i] = mem[ord-i-1];
+   for (i=0;i<N;i++)
+      x[i+ord]=_x[i];
+   for(i=0;i<ord;i++)
+      mem[i] = _x[N-i-1];
+#ifdef SMALL_FOOTPRINT
    for (i=0;i<N;i++)
    {
-      opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
+      opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
       for (j=0;j<ord;j++)
       {
-         sum += MULT16_16(num[j],mem[j]);
-      }
-      for (j=ord-1;j>=1;j--)
-      {
-         mem[j]=mem[j-1];
+         sum = MAC16_16(sum,rnum[j],x[i+j]);
       }
-      mem[0] = x[i];
-      y[i] = ROUND16(sum, SIG_SHIFT);
+      _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
    }
+#else
+   for (i=0;i<N-3;i+=4)
+   {
+      opus_val32 sum[4]={0,0,0,0};
+      xcorr_kernel(rnum, x+i, sum, ord);
+      _y[i  ] = SATURATE16(ADD32(EXTEND32(_x[i  ]), PSHR32(sum[0], SIG_SHIFT)));
+      _y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1], SIG_SHIFT)));
+      _y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2], SIG_SHIFT)));
+      _y[i+3] = SATURATE16(ADD32(EXTEND32(_x[i+3]), PSHR32(sum[3], SIG_SHIFT)));
+   }
+   for (;i<N;i++)
+   {
+      opus_val32 sum = 0;
+      for (j=0;j<ord;j++)
+         sum = MAC16_16(sum,rnum[j],x[i+j]);
+      _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
+   }
+#endif
+   RESTORE_STACK;
 }
 
-void celt_iir(const opus_val32 *x,
+void celt_iir(const opus_val32 *_x,
          const opus_val16 *den,
-         opus_val32 *y,
+         opus_val32 *_y,
          int N,
          int ord,
          opus_val16 *mem)
 {
+#ifdef SMALL_FOOTPRINT
    int i,j;
    for (i=0;i<N;i++)
    {
-      opus_val32 sum = x[i];
+      opus_val32 sum = _x[i];
       for (j=0;j<ord;j++)
       {
          sum -= MULT16_16(den[j],mem[j]);
@@ -132,11 +162,65 @@ void celt_iir(const opus_val32 *x,
          mem[j]=mem[j-1];
       }
       mem[0] = ROUND16(sum,SIG_SHIFT);
-      y[i] = sum;
+      _y[i] = sum;
    }
+#else
+   int i,j;
+   VARDECL(opus_val16, rden);
+   VARDECL(opus_val16, y);
+   SAVE_STACK;
+
+   celt_assert((ord&3)==0);
+   ALLOC(rden, ord, opus_val16);
+   ALLOC(y, N+ord, opus_val16);
+   for(i=0;i<ord;i++)
+      rden[i] = den[ord-i-1];
+   for(i=0;i<ord;i++)
+      y[i] = -mem[ord-i-1];
+   for(;i<N+ord;i++)
+      y[i]=0;
+   for (i=0;i<N-3;i+=4)
+   {
+      /* Unroll by 4 as if it were an FIR filter */
+      opus_val32 sum[4];
+      sum[0]=_x[i];
+      sum[1]=_x[i+1];
+      sum[2]=_x[i+2];
+      sum[3]=_x[i+3];
+      xcorr_kernel(rden, y+i, sum, ord);
+
+      /* Patch up the result to compensate for the fact that this is an IIR */
+      y[i+ord  ] = -ROUND16(sum[0],SIG_SHIFT);
+      _y[i  ] = sum[0];
+      sum[1] = MAC16_16(sum[1], y[i+ord  ], den[0]);
+      y[i+ord+1] = -ROUND16(sum[1],SIG_SHIFT);
+      _y[i+1] = sum[1];
+      sum[2] = MAC16_16(sum[2], y[i+ord+1], den[0]);
+      sum[2] = MAC16_16(sum[2], y[i+ord  ], den[1]);
+      y[i+ord+2] = -ROUND16(sum[2],SIG_SHIFT);
+      _y[i+2] = sum[2];
+
+      sum[3] = MAC16_16(sum[3], y[i+ord+2], den[0]);
+      sum[3] = MAC16_16(sum[3], y[i+ord+1], den[1]);
+      sum[3] = MAC16_16(sum[3], y[i+ord  ], den[2]);
+      y[i+ord+3] = -ROUND16(sum[3],SIG_SHIFT);
+      _y[i+3] = sum[3];
+   }
+   for (;i<N;i++)
+   {
+      opus_val32 sum = _x[i];
+      for (j=0;j<ord;j++)
+         sum -= MULT16_16(rden[j],y[i+j]);
+      y[i+ord] = ROUND16(sum,SIG_SHIFT);
+      _y[i] = sum;
+   }
+   for(i=0;i<ord;i++)
+      mem[i] = _y[N-i-1];
+   RESTORE_STACK;
+#endif
 }
 
-void _celt_autocorr(
+int _celt_autocorr(
                    const opus_val16 *x,   /*  in: [0...n-1] samples x   */
                    opus_val32       *ac,  /* out: [0...lag-1] ac values */
                    const opus_val16       *window,
@@ -146,43 +230,79 @@ void _celt_autocorr(
                   )
 {
    opus_val32 d;
-   int i;
+   int i, k;
+   int fastN=n-lag;
+   int shift;
+   const opus_val16 *xptr;
    VARDECL(opus_val16, xx);
    SAVE_STACK;
    ALLOC(xx, n, opus_val16);
    celt_assert(n>0);
    celt_assert(overlap>=0);
-   for (i=0;i<n;i++)
-      xx[i] = x[i];
-   for (i=0;i<overlap;i++)
+   if (overlap == 0)
    {
-      xx[i] = MULT16_16_Q15(x[i],window[i]);
-      xx[n-i-1] = MULT16_16_Q15(x[n-i-1],window[i]);
+      xptr = x;
+   } else {
+      for (i=0;i<n;i++)
+         xx[i] = x[i];
+      for (i=0;i<overlap;i++)
+      {
+         xx[i] = MULT16_16_Q15(x[i],window[i]);
+         xx[n-i-1] = MULT16_16_Q15(x[n-i-1],window[i]);
+      }
+      xptr = xx;
    }
+   shift=0;
 #ifdef FIXED_POINT
    {
-      opus_val32 ac0=0;
-      int shift;
-      for(i=0;i<n;i++)
-         ac0 += SHR32(MULT16_16(xx[i],xx[i]),9);
-      ac0 += 1+n;
+      opus_val32 ac0;
+      ac0 = 1+(n<<7);
+      if (n&1) ac0 += SHR32(MULT16_16(xptr[0],xptr[0]),9);
+      for(i=(n&1);i<n;i+=2)
+      {
+         ac0 += SHR32(MULT16_16(xptr[i],xptr[i]),9);
+         ac0 += SHR32(MULT16_16(xptr[i+1],xptr[i+1]),9);
+      }
 
       shift = celt_ilog2(ac0)-30+10;
-      shift = (shift+1)/2;
-      for(i=0;i<n;i++)
-         xx[i] = VSHR32(xx[i], shift);
+      shift = (shift)/2;
+      if (shift>0)
+      {
+         for(i=0;i<n;i++)
+            xx[i] = PSHR32(xptr[i], shift);
+         xptr = xx;
+      } else
+         shift = 0;
    }
 #endif
-   while (lag>=0)
+   celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1);
+   for (k=0;k<=lag;k++)
    {
-      for (i = lag, d = 0; i < n; i++)
-         d += xx[i] * xx[i-lag];
-      ac[lag] = d;
-      /*printf ("%f ", ac[lag]);*/
-      lag--;
+      for (i = k+fastN, d = 0; i < n; i++)
+         d = MAC16_16(d, xptr[i], xptr[i-k]);
+      ac[k] += d;
    }
-   /*printf ("\n");*/
-   ac[0] += 10;
+#ifdef FIXED_POINT
+   shift = 2*shift;
+   if (shift<=0)
+      ac[0] += SHL32((opus_int32)1, -shift);
+   if (ac[0] < 268435456)
+   {
+      int shift2 = 29 - EC_ILOG(ac[0]);
+      for (i=0;i<=lag;i++)
+         ac[i] = SHL32(ac[i], shift2);
+      shift -= shift2;
+   } else if (ac[0] >= 536870912)
+   {
+      int shift2=1;
+      if (ac[0] >= 1073741824)
+         shift2++;
+      for (i=0;i<=lag;i++)
+         ac[i] = SHR32(ac[i], shift2);
+      shift += shift2;
+   }
+#endif
 
    RESTORE_STACK;
+   return shift;
 }
diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
index 2baa77e..19279a0 100644
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h
@@ -48,6 +48,6 @@ void celt_iir(const opus_val32 *x,
          int ord,
          opus_val16 *mem);
 
-void _celt_autocorr(const opus_val16 *x, opus_val32 *ac, const opus_val16 *window, int overlap, int lag, int n);
+int _celt_autocorr(const opus_val16 *x, opus_val32 *ac, const opus_val16 *window, int overlap, int lag, int n);
 
 #endif /* PLC_H */
diff --git a/celt/cpu_support.h b/celt/cpu_support.h
new file mode 100644
index 0000000..41481fe
--- /dev/null
+++ b/celt/cpu_support.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CPU_SUPPORT_H
+#define CPU_SUPPORT_H
+
+#if defined(OPUS_HAVE_RTCD) && defined(ARMv4_ASM)
+#include "arm/armcpu.h"
+
+/* We currently support 4 ARM variants:
+ * arch[0] -> ARMv4
+ * arch[1] -> ARMv5E
+ * arch[2] -> ARMv6
+ * arch[3] -> NEON
+ */
+#define OPUS_ARCHMASK 3
+
+#else
+#define OPUS_ARCHMASK 0
+
+static inline int opus_select_arch(void)
+{
+  return 0;
+}
+#endif
+
+#endif
diff --git a/celt/cwrs.c b/celt/cwrs.c
index 8edc919..28e6561 100644
--- a/celt/cwrs.c
+++ b/celt/cwrs.c
@@ -71,64 +71,6 @@ int log2_frac(opus_uint32 val, int frac)
 }
 #endif
 
-#ifndef SMALL_FOOTPRINT
-
-#define MASK32 (0xFFFFFFFF)
-
-/*INV_TABLE[i] holds the multiplicative inverse of (2*i+1) mod 2**32.*/
-static const opus_uint32 INV_TABLE[53]={
-  0x00000001,0xAAAAAAAB,0xCCCCCCCD,0xB6DB6DB7,
-  0x38E38E39,0xBA2E8BA3,0xC4EC4EC5,0xEEEEEEEF,
-  0xF0F0F0F1,0x286BCA1B,0x3CF3CF3D,0xE9BD37A7,
-  0xC28F5C29,0x684BDA13,0x4F72C235,0xBDEF7BDF,
-  0x3E0F83E1,0x8AF8AF8B,0x914C1BAD,0x96F96F97,
-  0xC18F9C19,0x2FA0BE83,0xA4FA4FA5,0x677D46CF,
-  0x1A1F58D1,0xFAFAFAFB,0x8C13521D,0x586FB587,
-  0xB823EE09,0xA08AD8F3,0xC10C9715,0xBEFBEFBF,
-  0xC0FC0FC1,0x07A44C6B,0xA33F128D,0xE327A977,
-  0xC7E3F1F9,0x962FC963,0x3F2B3885,0x613716AF,
-  0x781948B1,0x2B2E43DB,0xFCFCFCFD,0x6FD0EB67,
-  0xFA3F47E9,0xD2FD2FD3,0x3F4FD3F5,0xD4E25B9F,
-  0x5F02A3A1,0xBF5A814B,0x7C32B16D,0xD3431B57,
-  0xD8FD8FD9,
-};
-
-/*Computes (_a*_b-_c)/(2*_d+1) when the quotient is known to be exact.
-  _a, _b, _c, and _d may be arbitrary so long as the arbitrary precision result
-   fits in 32 bits, but currently the table for multiplicative inverses is only
-   valid for _d<=52.*/
-static inline opus_uint32 imusdiv32odd(opus_uint32 _a,opus_uint32 _b,
- opus_uint32 _c,int _d){
-  celt_assert(_d<=52);
-  return (_a*_b-_c)*INV_TABLE[_d]&MASK32;
-}
-
-/*Computes (_a*_b-_c)/_d when the quotient is known to be exact.
-  _d does not actually have to be even, but imusdiv32odd will be faster when
-   it's odd, so you should use that instead.
-  _a and _d are assumed to be small (e.g., _a*_d fits in 32 bits; currently the
-   table for multiplicative inverses is only valid for _d<=54).
-  _b and _c may be arbitrary so long as the arbitrary precision reuslt fits in
-   32 bits.*/
-static inline opus_uint32 imusdiv32even(opus_uint32 _a,opus_uint32 _b,
- opus_uint32 _c,int _d){
-  opus_uint32 inv;
-  int           mask;
-  int           shift;
-  int           one;
-  celt_assert(_d>0);
-  celt_assert(_d<=54);
-  shift=EC_ILOG(_d^(_d-1));
-  inv=INV_TABLE[(_d-1)>>shift];
-  shift--;
-  one=1<<shift;
-  mask=one-1;
-  return (_a*(_b>>shift)-(_c>>shift)+
-   ((_a*(_b&mask)+one-(_c&mask))>>shift)-1)*inv&MASK32;
-}
-
-#endif /* SMALL_FOOTPRINT */
-
 /*Although derived separately, the pulse vector coding scheme is equivalent to
    a Pyramid Vector Quantizer \cite{Fis86}.
   Some additional notes about an early version appear at
@@ -248,46 +190,346 @@ static inline opus_uint32 imusdiv32even(opus_uint32 _a,opus_uint32 _b,
     year=1986
   }*/
 
-#ifndef SMALL_FOOTPRINT
-/*Compute U(2,_k).
-  Note that this may be called with _k=32768 (maxK[2]+1).*/
-static inline unsigned ucwrs2(unsigned _k){
-  celt_assert(_k>0);
-  return _k+(_k-1);
-}
+#if !defined(SMALL_FOOTPRINT)
+
+/*U(N,K) = U(K,N) := N>0?K>0?U(N-1,K)+U(N,K-1)+U(N-1,K-1):0:K>0?1:0*/
+# define CELT_PVQ_U(_n,_k) (CELT_PVQ_U_ROW[IMIN(_n,_k)][IMAX(_n,_k)])
+/*V(N,K) := U(N,K)+U(N,K+1) = the number of PVQ codewords for a band of size N
+   with K pulses allocated to it.*/
+# define CELT_PVQ_V(_n,_k) (CELT_PVQ_U(_n,_k)+CELT_PVQ_U(_n,(_k)+1))
+
+/*For each V(N,K) supported, we will access element U(min(N,K+1),max(N,K+1)).
+  Thus, the number of entries in row I is the larger of the maximum number of
+   pulses we will ever allocate for a given N=I (K=128, or however many fit in
+   32 bits, whichever is smaller), plus one, and the maximum N for which
+   K=I-1 pulses fit in 32 bits.
+  The largest band size in an Opus Custom mode is 208.
+  Otherwise, we can limit things to the set of N which can be achieved by
+   splitting a band from a standard Opus mode: 176, 144, 96, 88, 72, 64, 48,
+   44, 36, 32, 24, 22, 18, 16, 8, 4, 2).*/
+#if defined(CUSTOM_MODES)
+static const opus_uint32 CELT_PVQ_U_DATA[1488]={
+#else
+static const opus_uint32 CELT_PVQ_U_DATA[1272]={
+#endif
+  /*N=0, K=0...176:*/
+  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0,
+#endif
+  /*N=1, K=1...176:*/
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1,
+#endif
+  /*N=2, K=2...176:*/
+  3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41,
+  43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79,
+  81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113,
+  115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139, 141, 143,
+  145, 147, 149, 151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173,
+  175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203,
+  205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233,
+  235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, 257, 259, 261, 263,
+  265, 267, 269, 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293,
+  295, 297, 299, 301, 303, 305, 307, 309, 311, 313, 315, 317, 319, 321, 323,
+  325, 327, 329, 331, 333, 335, 337, 339, 341, 343, 345, 347, 349, 351,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  353, 355, 357, 359, 361, 363, 365, 367, 369, 371, 373, 375, 377, 379, 381,
+  383, 385, 387, 389, 391, 393, 395, 397, 399, 401, 403, 405, 407, 409, 411,
+  413, 415,
+#endif
+  /*N=3, K=3...176:*/
+  13, 25, 41, 61, 85, 113, 145, 181, 221, 265, 313, 365, 421, 481, 545, 613,
+  685, 761, 841, 925, 1013, 1105, 1201, 1301, 1405, 1513, 1625, 1741, 1861,
+  1985, 2113, 2245, 2381, 2521, 2665, 2813, 2965, 3121, 3281, 3445, 3613, 3785,
+  3961, 4141, 4325, 4513, 4705, 4901, 5101, 5305, 5513, 5725, 5941, 6161, 6385,
+  6613, 6845, 7081, 7321, 7565, 7813, 8065, 8321, 8581, 8845, 9113, 9385, 9661,
+  9941, 10225, 10513, 10805, 11101, 11401, 11705, 12013, 12325, 12641, 12961,
+  13285, 13613, 13945, 14281, 14621, 14965, 15313, 15665, 16021, 16381, 16745,
+  17113, 17485, 17861, 18241, 18625, 19013, 19405, 19801, 20201, 20605, 21013,
+  21425, 21841, 22261, 22685, 23113, 23545, 23981, 24421, 24865, 25313, 25765,
+  26221, 26681, 27145, 27613, 28085, 28561, 29041, 29525, 30013, 30505, 31001,
+  31501, 32005, 32513, 33025, 33541, 34061, 34585, 35113, 35645, 36181, 36721,
+  37265, 37813, 38365, 38921, 39481, 40045, 40613, 41185, 41761, 42341, 42925,
+  43513, 44105, 44701, 45301, 45905, 46513, 47125, 47741, 48361, 48985, 49613,
+  50245, 50881, 51521, 52165, 52813, 53465, 54121, 54781, 55445, 56113, 56785,
+  57461, 58141, 58825, 59513, 60205, 60901, 61601,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  62305, 63013, 63725, 64441, 65161, 65885, 66613, 67345, 68081, 68821, 69565,
+  70313, 71065, 71821, 72581, 73345, 74113, 74885, 75661, 76441, 77225, 78013,
+  78805, 79601, 80401, 81205, 82013, 82825, 83641, 84461, 85285, 86113,
+#endif
+  /*N=4, K=4...176:*/
+  63, 129, 231, 377, 575, 833, 1159, 1561, 2047, 2625, 3303, 4089, 4991, 6017,
+  7175, 8473, 9919, 11521, 13287, 15225, 17343, 19649, 22151, 24857, 27775,
+  30913, 34279, 37881, 41727, 45825, 50183, 54809, 59711, 64897, 70375, 76153,
+  82239, 88641, 95367, 102425, 109823, 117569, 125671, 134137, 142975, 152193,
+  161799, 171801, 182207, 193025, 204263, 215929, 228031, 240577, 253575,
+  267033, 280959, 295361, 310247, 325625, 341503, 357889, 374791, 392217,
+  410175, 428673, 447719, 467321, 487487, 508225, 529543, 551449, 573951,
+  597057, 620775, 645113, 670079, 695681, 721927, 748825, 776383, 804609,
+  833511, 863097, 893375, 924353, 956039, 988441, 1021567, 1055425, 1090023,
+  1125369, 1161471, 1198337, 1235975, 1274393, 1313599, 1353601, 1394407,
+  1436025, 1478463, 1521729, 1565831, 1610777, 1656575, 1703233, 1750759,
+  1799161, 1848447, 1898625, 1949703, 2001689, 2054591, 2108417, 2163175,
+  2218873, 2275519, 2333121, 2391687, 2451225, 2511743, 2573249, 2635751,
+  2699257, 2763775, 2829313, 2895879, 2963481, 3032127, 3101825, 3172583,
+  3244409, 3317311, 3391297, 3466375, 3542553, 3619839, 3698241, 3777767,
+  3858425, 3940223, 4023169, 4107271, 4192537, 4278975, 4366593, 4455399,
+  4545401, 4636607, 4729025, 4822663, 4917529, 5013631, 5110977, 5209575,
+  5309433, 5410559, 5512961, 5616647, 5721625, 5827903, 5935489, 6044391,
+  6154617, 6266175, 6379073, 6493319, 6608921, 6725887, 6844225, 6963943,
+  7085049, 7207551,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  7331457, 7456775, 7583513, 7711679, 7841281, 7972327, 8104825, 8238783,
+  8374209, 8511111, 8649497, 8789375, 8930753, 9073639, 9218041, 9363967,
+  9511425, 9660423, 9810969, 9963071, 10116737, 10271975, 10428793, 10587199,
+  10747201, 10908807, 11072025, 11236863, 11403329, 11571431, 11741177,
+  11912575,
+#endif
+  /*N=5, K=5...176:*/
+  321, 681, 1289, 2241, 3649, 5641, 8361, 11969, 16641, 22569, 29961, 39041,
+  50049, 63241, 78889, 97281, 118721, 143529, 172041, 204609, 241601, 283401,
+  330409, 383041, 441729, 506921, 579081, 658689, 746241, 842249, 947241,
+  1061761, 1186369, 1321641, 1468169, 1626561, 1797441, 1981449, 2179241,
+  2391489, 2618881, 2862121, 3121929, 3399041, 3694209, 4008201, 4341801,
+  4695809, 5071041, 5468329, 5888521, 6332481, 6801089, 7295241, 7815849,
+  8363841, 8940161, 9545769, 10181641, 10848769, 11548161, 12280841, 13047849,
+  13850241, 14689089, 15565481, 16480521, 17435329, 18431041, 19468809,
+  20549801, 21675201, 22846209, 24064041, 25329929, 26645121, 28010881,
+  29428489, 30899241, 32424449, 34005441, 35643561, 37340169, 39096641,
+  40914369, 42794761, 44739241, 46749249, 48826241, 50971689, 53187081,
+  55473921, 57833729, 60268041, 62778409, 65366401, 68033601, 70781609,
+  73612041, 76526529, 79526721, 82614281, 85790889, 89058241, 92418049,
+  95872041, 99421961, 103069569, 106816641, 110664969, 114616361, 118672641,
+  122835649, 127107241, 131489289, 135983681, 140592321, 145317129, 150160041,
+  155123009, 160208001, 165417001, 170752009, 176215041, 181808129, 187533321,
+  193392681, 199388289, 205522241, 211796649, 218213641, 224775361, 231483969,
+  238341641, 245350569, 252512961, 259831041, 267307049, 274943241, 282741889,
+  290705281, 298835721, 307135529, 315607041, 324252609, 333074601, 342075401,
+  351257409, 360623041, 370174729, 379914921, 389846081, 399970689, 410291241,
+  420810249, 431530241, 442453761, 453583369, 464921641, 476471169, 488234561,
+  500214441, 512413449, 524834241, 537479489, 550351881, 563454121, 576788929,
+  590359041, 604167209, 618216201, 632508801,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  647047809, 661836041, 676876329, 692171521, 707724481, 723538089, 739615241,
+  755958849, 772571841, 789457161, 806617769, 824056641, 841776769, 859781161,
+  878072841, 896654849, 915530241, 934702089, 954173481, 973947521, 994027329,
+  1014416041, 1035116809, 1056132801, 1077467201, 1099123209, 1121104041,
+  1143412929, 1166053121, 1189027881, 1212340489, 1235994241,
+#endif
+  /*N=6, K=6...96:*/
+  1683, 3653, 7183, 13073, 22363, 36365, 56695, 85305, 124515, 177045, 246047,
+  335137, 448427, 590557, 766727, 982729, 1244979, 1560549, 1937199, 2383409,
+  2908411, 3522221, 4235671, 5060441, 6009091, 7095093, 8332863, 9737793,
+  11326283, 13115773, 15124775, 17372905, 19880915, 22670725, 25765455,
+  29189457, 32968347, 37129037, 41699767, 46710137, 52191139, 58175189,
+  64696159, 71789409, 79491819, 87841821, 96879431, 106646281, 117185651,
+  128542501, 140763503, 153897073, 167993403, 183104493, 199284183, 216588185,
+  235074115, 254801525, 275831935, 298228865, 322057867, 347386557, 374284647,
+  402823977, 433078547, 465124549, 499040399, 534906769, 572806619, 612825229,
+  655050231, 699571641, 746481891, 795875861, 847850911, 902506913, 959946283,
+  1020274013, 1083597703, 1150027593, 1219676595, 1292660325, 1369097135,
+  1449108145, 1532817275, 1620351277, 1711839767, 1807415257, 1907213187,
+  2011371957, 2120032959,
+#if defined(CUSTOM_MODES)
+  /*...109:*/
+  2233340609U, 2351442379U, 2474488829U, 2602633639U, 2736033641U, 2874848851U,
+  3019242501U, 3169381071U, 3325434321U, 3487575323U, 3655980493U, 3830829623U,
+  4012305913U,
+#endif
+  /*N=7, K=7...54*/
+  8989, 19825, 40081, 75517, 134245, 227305, 369305, 579125, 880685, 1303777,
+  1884961, 2668525, 3707509, 5064793, 6814249, 9041957, 11847485, 15345233,
+  19665841, 24957661, 31388293, 39146185, 48442297, 59511829, 72616013,
+  88043969, 106114625, 127178701, 151620757, 179861305, 212358985, 249612805,
+  292164445, 340600625, 395555537, 457713341, 527810725, 606639529, 695049433,
+  793950709, 904317037, 1027188385, 1163673953, 1314955181, 1482288821,
+  1667010073, 1870535785, 2094367717,
+#if defined(CUSTOM_MODES)
+  /*...60:*/
+  2340095869U, 2609401873U, 2904062449U, 3225952925U, 3577050821U, 3959439497U,
+#endif
+  /*N=8, K=8...37*/
+  48639, 108545, 224143, 433905, 795455, 1392065, 2340495, 3800305, 5984767,
+  9173505, 13726991, 20103025, 28875327, 40754369, 56610575, 77500017,
+  104692735, 139703809, 184327311, 240673265, 311207743, 398796225, 506750351,
+  638878193, 799538175, 993696769, 1226990095, 1505789553, 1837271615,
+  2229491905U,
+#if defined(CUSTOM_MODES)
+  /*...40:*/
+  2691463695U, 3233240945U, 3866006015U,
+#endif
+  /*N=9, K=9...28:*/
+  265729, 598417, 1256465, 2485825, 4673345, 8405905, 14546705, 24331777,
+  39490049, 62390545, 96220561, 145198913, 214828609, 312193553, 446304145,
+  628496897, 872893441, 1196924561, 1621925137, 2173806145U,
+#if defined(CUSTOM_MODES)
+  /*...29:*/
+  2883810113U,
+#endif
+  /*N=10, K=10...24:*/
+  1462563, 3317445, 7059735, 14218905, 27298155, 50250765, 89129247, 152951073,
+  254831667, 413442773, 654862247, 1014889769, 1541911931, 2300409629U,
+  3375210671U,
+  /*N=11, K=11...19:*/
+  8097453, 18474633, 39753273, 81270333, 158819253, 298199265, 540279585,
+  948062325, 1616336765,
+#if defined(CUSTOM_MODES)
+  /*...20:*/
+  2684641785U,
+#endif
+  /*N=12, K=12...18:*/
+  45046719, 103274625, 224298231, 464387817, 921406335, 1759885185,
+  3248227095U,
+  /*N=13, K=13...16:*/
+  251595969, 579168825, 1267854873, 2653649025U,
+  /*N=14, K=14:*/
+  1409933619
+};
 
-/*Compute V(2,_k).*/
-static inline opus_uint32 ncwrs2(int _k){
-  celt_assert(_k>0);
-  return 4*(opus_uint32)_k;
+#if defined(CUSTOM_MODES)
+const opus_uint32 *const CELT_PVQ_U_ROW[15]={
+  CELT_PVQ_U_DATA+   0,CELT_PVQ_U_DATA+ 208,CELT_PVQ_U_DATA+ 415,
+  CELT_PVQ_U_DATA+ 621,CELT_PVQ_U_DATA+ 826,CELT_PVQ_U_DATA+1030,
+  CELT_PVQ_U_DATA+1233,CELT_PVQ_U_DATA+1336,CELT_PVQ_U_DATA+1389,
+  CELT_PVQ_U_DATA+1421,CELT_PVQ_U_DATA+1441,CELT_PVQ_U_DATA+1455,
+  CELT_PVQ_U_DATA+1464,CELT_PVQ_U_DATA+1470,CELT_PVQ_U_DATA+1473
+};
+#else
+const opus_uint32 *const CELT_PVQ_U_ROW[15]={
+  CELT_PVQ_U_DATA+   0,CELT_PVQ_U_DATA+ 176,CELT_PVQ_U_DATA+ 351,
+  CELT_PVQ_U_DATA+ 525,CELT_PVQ_U_DATA+ 698,CELT_PVQ_U_DATA+ 870,
+  CELT_PVQ_U_DATA+1041,CELT_PVQ_U_DATA+1131,CELT_PVQ_U_DATA+1178,
+  CELT_PVQ_U_DATA+1207,CELT_PVQ_U_DATA+1226,CELT_PVQ_U_DATA+1240,
+  CELT_PVQ_U_DATA+1248,CELT_PVQ_U_DATA+1254,CELT_PVQ_U_DATA+1257
+};
+#endif
+
+#if defined(CUSTOM_MODES)
+void get_required_bits(opus_int16 *_bits,int _n,int _maxk,int _frac){
+  int k;
+  /*_maxk==0 => there's nothing to do.*/
+  celt_assert(_maxk>0);
+  _bits[0]=0;
+  for(k=1;k<=_maxk;k++)_bits[k]=log2_frac(CELT_PVQ_V(_n,k),_frac);
 }
+#endif
 
-/*Compute U(3,_k).
-  Note that this may be called with _k=32768 (maxK[3]+1).*/
-static inline opus_uint32 ucwrs3(unsigned _k){
-  celt_assert(_k>0);
-  return (2*(opus_uint32)_k-2)*_k+1;
+static opus_uint32 icwrs(int _n,const int *_y){
+  opus_uint32 i;
+  int         j;
+  int         k;
+  celt_assert(_n>=2);
+  j=_n-1;
+  i=_y[j]<0;
+  k=abs(_y[j]);
+  do{
+    j--;
+    i+=CELT_PVQ_U(_n-j,k);
+    k+=abs(_y[j]);
+    if(_y[j]<0)i+=CELT_PVQ_U(_n-j,k+1);
+  }
+  while(j>0);
+  return i;
 }
 
-/*Compute V(3,_k).*/
-static inline opus_uint32 ncwrs3(int _k){
+void encode_pulses(const int *_y,int _n,int _k,ec_enc *_enc){
   celt_assert(_k>0);
-  return 2*(2*(unsigned)_k*(opus_uint32)_k+1);
+  ec_enc_uint(_enc,icwrs(_n,_y),CELT_PVQ_V(_n,_k));
 }
 
-/*Compute U(4,_k).*/
-static inline opus_uint32 ucwrs4(int _k){
+static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
+  opus_uint32 p;
+  int         s;
+  int         k0;
   celt_assert(_k>0);
-  return imusdiv32odd(2*_k,(2*_k-3)*(opus_uint32)_k+4,3,1);
+  celt_assert(_n>1);
+  while(_n>2){
+    opus_uint32 q;
+    /*Lots of pulses case:*/
+    if(_k>=_n){
+      const opus_uint32 *row;
+      row=CELT_PVQ_U_ROW[_n];
+      /*Are the pulses in this dimension negative?*/
+      p=row[_k+1];
+      s=-(_i>=p);
+      _i-=p&s;
+      /*Count how many pulses were placed in this dimension.*/
+      k0=_k;
+      q=row[_n];
+      if(q>_i){
+        celt_assert(p>q);
+        _k=_n;
+        do p=CELT_PVQ_U_ROW[--_k][_n];
+        while(p>_i);
+      }
+      else for(p=row[_k];p>_i;p=row[_k])_k--;
+      _i-=p;
+      *_y++=(k0-_k+s)^s;
+    }
+    /*Lots of dimensions case:*/
+    else{
+      /*Are there any pulses in this dimension at all?*/
+      p=CELT_PVQ_U_ROW[_k][_n];
+      q=CELT_PVQ_U_ROW[_k+1][_n];
+      if(p<=_i&&_i<q){
+        _i-=p;
+        *_y++=0;
+      }
+      else{
+        /*Are the pulses in this dimension negative?*/
+        s=-(_i>=q);
+        _i-=q&s;
+        /*Count how many pulses were placed in this dimension.*/
+        k0=_k;
+        do p=CELT_PVQ_U_ROW[--_k][_n];
+        while(p>_i);
+        _i-=p;
+        *_y++=(k0-_k+s)^s;
+      }
+    }
+    _n--;
+  }
+  /*_n==2*/
+  p=2*_k+1;
+  s=-(_i>=p);
+  _i-=p&s;
+  k0=_k;
+  _k=(_i+1)>>1;
+  if(_k)_i-=2*_k-1;
+  *_y++=(k0-_k+s)^s;
+  /*_n==1*/
+  s=-(int)_i;
+  *_y=(_k+s)^s;
 }
 
-/*Compute V(4,_k).*/
-static inline opus_uint32 ncwrs4(int _k){
-  celt_assert(_k>0);
-  return ((_k*(opus_uint32)_k+2)*_k)/3<<3;
+void decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
+  cwrsi(_n,_k,ec_dec_uint(_dec,CELT_PVQ_V(_n,_k)),_y);
 }
 
-#endif /* SMALL_FOOTPRINT */
+#else /* SMALL_FOOTPRINT */
 
 /*Computes the next row/column of any recurrence that obeys the relation
    u[i][j]=u[i-1][j]+u[i][j-1]+u[i-1][j-1].
@@ -332,125 +574,18 @@ static opus_uint32 ncwrs_urow(unsigned _n,unsigned _k,opus_uint32 *_u){
   celt_assert(len>=3);
   _u[0]=0;
   _u[1]=um2=1;
-#ifndef SMALL_FOOTPRINT
-  /*_k>52 doesn't work in the false branch due to the limits of INV_TABLE,
-    but _k isn't tested here because k<=52 for n=7*/
-  if(_n<=6)
-#endif
-  {
-    /*If _n==0, _u[0] should be 1 and the rest should be 0.*/
-    /*If _n==1, _u[i] should be 1 for i>1.*/
-    celt_assert(_n>=2);
-    /*If _k==0, the following do-while loop will overflow the buffer.*/
-    celt_assert(_k>0);
-    k=2;
-    do _u[k]=(k<<1)-1;
-    while(++k<len);
-    for(k=2;k<_n;k++)unext(_u+1,_k+1,1);
-  }
-#ifndef SMALL_FOOTPRINT
-  else{
-    opus_uint32 um1;
-    opus_uint32 n2m1;
-    _u[2]=n2m1=um1=(_n<<1)-1;
-    for(k=3;k<len;k++){
-      /*U(N,K) = ((2*N-1)*U(N,K-1)-U(N,K-2))/(K-1) + U(N,K-2)*/
-      _u[k]=um2=imusdiv32even(n2m1,um1,um2,k-1)+um2;
-      if(++k>=len)break;
-      _u[k]=um1=imusdiv32odd(n2m1,um2,um1,(k-1)>>1)+um1;
-    }
-  }
-#endif /* SMALL_FOOTPRINT */
+  /*If _n==0, _u[0] should be 1 and the rest should be 0.*/
+  /*If _n==1, _u[i] should be 1 for i>1.*/
+  celt_assert(_n>=2);
+  /*If _k==0, the following do-while loop will overflow the buffer.*/
+  celt_assert(_k>0);
+  k=2;
+  do _u[k]=(k<<1)-1;
+  while(++k<len);
+  for(k=2;k<_n;k++)unext(_u+1,_k+1,1);
   return _u[_k]+_u[_k+1];
 }
 
-#ifndef SMALL_FOOTPRINT
-
-/*Returns the _i'th combination of _k elements (at most 32767) chosen from a
-   set of size 1 with associated sign bits.
-  _y: Returns the vector of pulses.*/
-static inline void cwrsi1(int _k,opus_uint32 _i,int *_y){
-  int s;
-  s=-(int)_i;
-  _y[0]=(_k+s)^s;
-}
-
-/*Returns the _i'th combination of _k elements (at most 32767) chosen from a
-   set of size 2 with associated sign bits.
-  _y: Returns the vector of pulses.*/
-static inline void cwrsi2(int _k,opus_uint32 _i,int *_y){
-  opus_uint32 p;
-  int           s;
-  int           yj;
-  p=ucwrs2(_k+1U);
-  s=-(_i>=p);
-  _i-=p&s;
-  yj=_k;
-  _k=(_i+1)>>1;
-  p=_k?ucwrs2(_k):0;
-  _i-=p;
-  yj-=_k;
-  _y[0]=(yj+s)^s;
-  cwrsi1(_k,_i,_y+1);
-}
-
-/*Returns the _i'th combination of _k elements (at most 32767) chosen from a
-   set of size 3 with associated sign bits.
-  _y: Returns the vector of pulses.*/
-static void cwrsi3(int _k,opus_uint32 _i,int *_y){
-  opus_uint32 p;
-  int           s;
-  int           yj;
-  p=ucwrs3(_k+1U);
-  s=-(_i>=p);
-  _i-=p&s;
-  yj=_k;
-  /*Finds the maximum _k such that ucwrs3(_k)<=_i (tested for all
-     _i<2147418113=U(3,32768)).*/
-  _k=_i>0?(isqrt32(2*_i-1)+1)>>1:0;
-  p=_k?ucwrs3(_k):0;
-  _i-=p;
-  yj-=_k;
-  _y[0]=(yj+s)^s;
-  cwrsi2(_k,_i,_y+1);
-}
-
-/*Returns the _i'th combination of _k elements (at most 1172) chosen from a set
-   of size 4 with associated sign bits.
-  _y: Returns the vector of pulses.*/
-static void cwrsi4(int _k,opus_uint32 _i,int *_y){
-  opus_uint32 p;
-  int           s;
-  int           yj;
-  int           kl;
-  int           kr;
-  p=ucwrs4(_k+1);
-  s=-(_i>=p);
-  _i-=p&s;
-  yj=_k;
-  /*We could solve a cubic for k here, but the form of the direct solution does
-     not lend itself well to exact integer arithmetic.
-    Instead we do a binary search on U(4,K).*/
-  kl=0;
-  kr=_k;
-  for(;;){
-    _k=(kl+kr)>>1;
-    p=_k?ucwrs4(_k):0;
-    if(p<_i){
-      if(_k>=kr)break;
-      kl=_k+1;
-    }
-    else if(p>_i)kr=_k-1;
-    else break;
-  }
-  _i-=p;
-  yj-=_k;
-  _y[0]=(yj+s)^s;
-  cwrsi3(_k,_i,_y+1);
-}
-
-#endif /* SMALL_FOOTPRINT */
-
 /*Returns the _i'th combination of _k elements chosen from a set of size _n
    with associated sign bits.
   _y: Returns the vector of pulses.
@@ -487,55 +622,6 @@ static inline opus_uint32 icwrs1(const int *_y,int *_k){
   return _y[0]<0;
 }
 
-#ifndef SMALL_FOOTPRINT
-
-/*Returns the index of the given combination of K elements chosen from a set
-   of size 2 with associated sign bits.
-  _y: The vector of pulses, whose sum of absolute values is K.
-  _k: Returns K.*/
-static inline opus_uint32 icwrs2(const int *_y,int *_k){
-  opus_uint32 i;
-  int           k;
-  i=icwrs1(_y+1,&k);
-  i+=k?ucwrs2(k):0;
-  k+=abs(_y[0]);
-  if(_y[0]<0)i+=ucwrs2(k+1U);
-  *_k=k;
-  return i;
-}
-
-/*Returns the index of the given combination of K elements chosen from a set
-   of size 3 with associated sign bits.
-  _y: The vector of pulses, whose sum of absolute values is K.
-  _k: Returns K.*/
-static inline opus_uint32 icwrs3(const int *_y,int *_k){
-  opus_uint32 i;
-  int           k;
-  i=icwrs2(_y+1,&k);
-  i+=k?ucwrs3(k):0;
-  k+=abs(_y[0]);
-  if(_y[0]<0)i+=ucwrs3(k+1U);
-  *_k=k;
-  return i;
-}
-
-/*Returns the index of the given combination of K elements chosen from a set
-   of size 4 with associated sign bits.
-  _y: The vector of pulses, whose sum of absolute values is K.
-  _k: Returns K.*/
-static inline opus_uint32 icwrs4(const int *_y,int *_k){
-  opus_uint32 i;
-  int           k;
-  i=icwrs3(_y+1,&k);
-  i+=k?ucwrs4(k):0;
-  k+=abs(_y[0]);
-  if(_y[0]<0)i+=ucwrs4(k+1);
-  *_k=k;
-  return i;
-}
-
-#endif /* SMALL_FOOTPRINT */
-
 /*Returns the index of the given combination of K elements chosen from a set
    of size _n with associated sign bits.
   _y:  The vector of pulses, whose sum of absolute values must be _k.
@@ -543,8 +629,8 @@ static inline opus_uint32 icwrs4(const int *_y,int *_k){
 static inline opus_uint32 icwrs(int _n,int _k,opus_uint32 *_nc,const int *_y,
  opus_uint32 *_u){
   opus_uint32 i;
-  int           j;
-  int           k;
+  int         j;
+  int         k;
   /*We can't unroll the first two iterations of the loop unless _n>=2.*/
   celt_assert(_n>=2);
   _u[0]=0;
@@ -589,57 +675,23 @@ void get_required_bits(opus_int16 *_bits,int _n,int _maxk,int _frac){
 
 void encode_pulses(const int *_y,int _n,int _k,ec_enc *_enc){
   opus_uint32 i;
+  VARDECL(opus_uint32,u);
+  opus_uint32 nc;
+  SAVE_STACK;
   celt_assert(_k>0);
-#ifndef SMALL_FOOTPRINT
-  switch(_n){
-    case 2:{
-      i=icwrs2(_y,&_k);
-      ec_enc_uint(_enc,i,ncwrs2(_k));
-    }break;
-    case 3:{
-      i=icwrs3(_y,&_k);
-      ec_enc_uint(_enc,i,ncwrs3(_k));
-    }break;
-    case 4:{
-      i=icwrs4(_y,&_k);
-      ec_enc_uint(_enc,i,ncwrs4(_k));
-    }break;
-     default:
-    {
-#endif
-      VARDECL(opus_uint32,u);
-      opus_uint32 nc;
-      SAVE_STACK;
-      ALLOC(u,_k+2U,opus_uint32);
-      i=icwrs(_n,_k,&nc,_y,u);
-      ec_enc_uint(_enc,i,nc);
-      RESTORE_STACK;
-#ifndef SMALL_FOOTPRINT
-    }
-    break;
-  }
-#endif
+  ALLOC(u,_k+2U,opus_uint32);
+  i=icwrs(_n,_k,&nc,_y,u);
+  ec_enc_uint(_enc,i,nc);
+  RESTORE_STACK;
 }
 
-void decode_pulses(int *_y,int _n,int _k,ec_dec *_dec)
-{
+void decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
+  VARDECL(opus_uint32,u);
+  SAVE_STACK;
   celt_assert(_k>0);
-#ifndef SMALL_FOOTPRINT
-   switch(_n){
-    case 2:cwrsi2(_k,ec_dec_uint(_dec,ncwrs2(_k)),_y);break;
-    case 3:cwrsi3(_k,ec_dec_uint(_dec,ncwrs3(_k)),_y);break;
-    case 4:cwrsi4(_k,ec_dec_uint(_dec,ncwrs4(_k)),_y);break;
-    default:
-    {
-#endif
-      VARDECL(opus_uint32,u);
-      SAVE_STACK;
-      ALLOC(u,_k+2U,opus_uint32);
-      cwrsi(_n,_k,ec_dec_uint(_dec,ncwrs_urow(_n,_k,u)),_y,u);
-      RESTORE_STACK;
-#ifndef SMALL_FOOTPRINT
-    }
-    break;
-  }
-#endif
+  ALLOC(u,_k+2U,opus_uint32);
+  cwrsi(_n,_k,ec_dec_uint(_dec,ncwrs_urow(_n,_k,u)),_y,u);
+  RESTORE_STACK;
 }
+
+#endif /* SMALL_FOOTPRINT */
diff --git a/celt/dump_modes/Makefile b/celt/dump_modes/Makefile
new file mode 100644
index 0000000..371a7d4
--- /dev/null
+++ b/celt/dump_modes/Makefile
@@ -0,0 +1,10 @@
+CFLAGS=-O2 -Wall -Wextra -DHAVE_CONFIG_H
+INCLUDES=-I../ -I../.. -I../../include
+
+all: dump_modes
+
+dump_modes:
+	$(CC) $(CFLAGS) $(INCLUDES) -DCUSTOM_MODES dump_modes.c ../modes.c ../cwrs.c ../rate.c ../entenc.c ../entdec.c ../mathops.c ../mdct.c ../kiss_fft.c -o dump_modes -lm 
+
+clean:
+	rm -f dump_modes
diff --git a/celt/dump_modes/dump_modes.c b/celt/dump_modes/dump_modes.c
new file mode 100644
index 0000000..8c07e19
--- /dev/null
+++ b/celt/dump_modes/dump_modes.c
@@ -0,0 +1,329 @@
+/* Copyright (c) 2008 CSIRO
+   Copyright (c) 2008-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "modes.h"
+#include "celt.h"
+#include "rate.h"
+
+#define INT16 "%d"
+#define INT32 "%d"
+#define FLOAT "%#0.8gf"
+
+#ifdef FIXED_POINT
+#define WORD16 INT16
+#define WORD32 INT32
+#else
+#define WORD16 FLOAT
+#define WORD32 FLOAT
+#endif
+
+void dump_modes(FILE *file, CELTMode **modes, int nb_modes)
+{
+   int i, j, k;
+   fprintf(file, "/* The contents of this file was automatically generated by dump_modes.c\n");
+   fprintf(file, "   with arguments:");
+   for (i=0;i<nb_modes;i++)
+   {
+      CELTMode *mode = modes[i];
+      fprintf(file, " %d %d",mode->Fs,mode->shortMdctSize*mode->nbShortMdcts);
+   }
+   fprintf(file, "\n   It contains static definitions for some pre-defined modes. */\n");
+   fprintf(file, "#include \"modes.h\"\n");
+   fprintf(file, "#include \"rate.h\"\n");
+
+   fprintf(file, "\n");
+
+   for (i=0;i<nb_modes;i++)
+   {
+      CELTMode *mode = modes[i];
+      int mdctSize;
+      int standard, framerate;
+
+      mdctSize = mode->shortMdctSize*mode->nbShortMdcts;
+      standard = (mode->Fs == 400*(opus_int32)mode->shortMdctSize);
+      framerate = mode->Fs/mode->shortMdctSize;
+
+      if (!standard)
+      {
+         fprintf(file, "#ifndef DEF_EBANDS%d_%d\n", mode->Fs, mdctSize);
+         fprintf(file, "#define DEF_EBANDS%d_%d\n", mode->Fs, mdctSize);
+         fprintf (file, "static const opus_int16 eBands%d_%d[%d] = {\n", mode->Fs, mdctSize, mode->nbEBands+2);
+         for (j=0;j<mode->nbEBands+2;j++)
+            fprintf (file, "%d, ", mode->eBands[j]);
+         fprintf (file, "};\n");
+         fprintf(file, "#endif\n");
+         fprintf(file, "\n");
+      }
+
+      fprintf(file, "#ifndef DEF_WINDOW%d\n", mode->overlap);
+      fprintf(file, "#define DEF_WINDOW%d\n", mode->overlap);
+      fprintf (file, "static const opus_val16 window%d[%d] = {\n", mode->overlap, mode->overlap);
+      for (j=0;j<mode->overlap;j++)
+         fprintf (file, WORD16 ",%c", mode->window[j],(j+6)%5==0?'\n':' ');
+      fprintf (file, "};\n");
+      fprintf(file, "#endif\n");
+      fprintf(file, "\n");
+
+      if (!standard)
+      {
+         fprintf(file, "#ifndef DEF_ALLOC_VECTORS%d_%d\n", mode->Fs, mdctSize);
+         fprintf(file, "#define DEF_ALLOC_VECTORS%d_%d\n", mode->Fs, mdctSize);
+         fprintf (file, "static const unsigned char allocVectors%d_%d[%d] = {\n", mode->Fs, mdctSize, mode->nbEBands*mode->nbAllocVectors);
+         for (j=0;j<mode->nbAllocVectors;j++)
+         {
+            for (k=0;k<mode->nbEBands;k++)
+               fprintf (file, "%2d, ", mode->allocVectors[j*mode->nbEBands+k]);
+            fprintf (file, "\n");
+         }
+         fprintf (file, "};\n");
+         fprintf(file, "#endif\n");
+         fprintf(file, "\n");
+      }
+
+      fprintf(file, "#ifndef DEF_LOGN%d\n", framerate);
+      fprintf(file, "#define DEF_LOGN%d\n", framerate);
+      fprintf (file, "static const opus_int16 logN%d[%d] = {\n", framerate, mode->nbEBands);
+      for (j=0;j<mode->nbEBands;j++)
+         fprintf (file, "%d, ", mode->logN[j]);
+      fprintf (file, "};\n");
+      fprintf(file, "#endif\n");
+      fprintf(file, "\n");
+
+      /* Pulse cache */
+      fprintf(file, "#ifndef DEF_PULSE_CACHE%d\n", mode->Fs/mdctSize);
+      fprintf(file, "#define DEF_PULSE_CACHE%d\n", mode->Fs/mdctSize);
+      fprintf (file, "static const opus_int16 cache_index%d[%d] = {\n", mode->Fs/mdctSize, (mode->maxLM+2)*mode->nbEBands);
+      for (j=0;j<mode->nbEBands*(mode->maxLM+2);j++)
+         fprintf (file, "%d,%c", mode->cache.index[j],(j+16)%15==0?'\n':' ');
+      fprintf (file, "};\n");
+      fprintf (file, "static const unsigned char cache_bits%d[%d] = {\n", mode->Fs/mdctSize, mode->cache.size);
+      for (j=0;j<mode->cache.size;j++)
+         fprintf (file, "%d,%c", mode->cache.bits[j],(j+16)%15==0?'\n':' ');
+      fprintf (file, "};\n");
+      fprintf (file, "static const unsigned char cache_caps%d[%d] = {\n", mode->Fs/mdctSize, (mode->maxLM+1)*2*mode->nbEBands);
+      for (j=0;j<(mode->maxLM+1)*2*mode->nbEBands;j++)
+         fprintf (file, "%d,%c", mode->cache.caps[j],(j+16)%15==0?'\n':' ');
+      fprintf (file, "};\n");
+
+      fprintf(file, "#endif\n");
+      fprintf(file, "\n");
+
+      /* FFT twiddles */
+      fprintf(file, "#ifndef FFT_TWIDDLES%d_%d\n", mode->Fs, mdctSize);
+      fprintf(file, "#define FFT_TWIDDLES%d_%d\n", mode->Fs, mdctSize);
+      fprintf (file, "static const kiss_twiddle_cpx fft_twiddles%d_%d[%d] = {\n",
+            mode->Fs, mdctSize, mode->mdct.kfft[0]->nfft);
+      for (j=0;j<mode->mdct.kfft[0]->nfft;j++)
+         fprintf (file, "{" WORD16 ", " WORD16 "},%c", mode->mdct.kfft[0]->twiddles[j].r, mode->mdct.kfft[0]->twiddles[j].i,(j+3)%2==0?'\n':' ');
+      fprintf (file, "};\n");
+
+      /* FFT Bitrev tables */
+      for (k=0;k<=mode->mdct.maxshift;k++)
+      {
+         fprintf(file, "#ifndef FFT_BITREV%d\n", mode->mdct.kfft[k]->nfft);
+         fprintf(file, "#define FFT_BITREV%d\n", mode->mdct.kfft[k]->nfft);
+         fprintf (file, "static const opus_int16 fft_bitrev%d[%d] = {\n",
+               mode->mdct.kfft[k]->nfft, mode->mdct.kfft[k]->nfft);
+         for (j=0;j<mode->mdct.kfft[k]->nfft;j++)
+            fprintf (file, "%d,%c", mode->mdct.kfft[k]->bitrev[j],(j+16)%15==0?'\n':' ');
+         fprintf (file, "};\n");
+
+         fprintf(file, "#endif\n");
+         fprintf(file, "\n");
+      }
+
+      /* FFT States */
+      for (k=0;k<=mode->mdct.maxshift;k++)
+      {
+         fprintf(file, "#ifndef FFT_STATE%d_%d_%d\n", mode->Fs, mdctSize, k);
+         fprintf(file, "#define FFT_STATE%d_%d_%d\n", mode->Fs, mdctSize, k);
+         fprintf (file, "static const kiss_fft_state fft_state%d_%d_%d = {\n",
+               mode->Fs, mdctSize, k);
+         fprintf (file, "%d,\t/* nfft */\n", mode->mdct.kfft[k]->nfft);
+#ifndef FIXED_POINT
+         fprintf (file, "%0.9ff,\t/* scale */\n", mode->mdct.kfft[k]->scale);
+#endif
+         fprintf (file, "%d,\t/* shift */\n", mode->mdct.kfft[k]->shift);
+         fprintf (file, "{");
+         for (j=0;j<2*MAXFACTORS;j++)
+            fprintf (file, "%d, ", mode->mdct.kfft[k]->factors[j]);
+         fprintf (file, "},\t/* factors */\n");
+         fprintf (file, "fft_bitrev%d,\t/* bitrev */\n", mode->mdct.kfft[k]->nfft);
+         fprintf (file, "fft_twiddles%d_%d,\t/* bitrev */\n", mode->Fs, mdctSize);
+         fprintf (file, "};\n");
+
+         fprintf(file, "#endif\n");
+         fprintf(file, "\n");
+      }
+
+      fprintf(file, "#endif\n");
+      fprintf(file, "\n");
+
+      /* MDCT twiddles */
+      fprintf(file, "#ifndef MDCT_TWIDDLES%d\n", mdctSize);
+      fprintf(file, "#define MDCT_TWIDDLES%d\n", mdctSize);
+      fprintf (file, "static const opus_val16 mdct_twiddles%d[%d] = {\n",
+            mdctSize, mode->mdct.n/4+1);
+      for (j=0;j<=mode->mdct.n/4;j++)
+         fprintf (file, WORD16 ",%c", mode->mdct.trig[j],(j+6)%5==0?'\n':' ');
+      fprintf (file, "};\n");
+
+      fprintf(file, "#endif\n");
+      fprintf(file, "\n");
+
+
+      /* Print the actual mode data */
+      fprintf(file, "static const CELTMode mode%d_%d_%d = {\n", mode->Fs, mdctSize, mode->overlap);
+      fprintf(file, INT32 ",\t/* Fs */\n", mode->Fs);
+      fprintf(file, "%d,\t/* overlap */\n", mode->overlap);
+      fprintf(file, "%d,\t/* nbEBands */\n", mode->nbEBands);
+      fprintf(file, "%d,\t/* effEBands */\n", mode->effEBands);
+      fprintf(file, "{");
+      for (j=0;j<4;j++)
+         fprintf(file, WORD16 ", ", mode->preemph[j]);
+      fprintf(file, "},\t/* preemph */\n");
+      if (standard)
+         fprintf(file, "eband5ms,\t/* eBands */\n");
+      else
+         fprintf(file, "eBands%d_%d,\t/* eBands */\n", mode->Fs, mdctSize);
+
+      fprintf(file, "%d,\t/* maxLM */\n", mode->maxLM);
+      fprintf(file, "%d,\t/* nbShortMdcts */\n", mode->nbShortMdcts);
+      fprintf(file, "%d,\t/* shortMdctSize */\n", mode->shortMdctSize);
+
+      fprintf(file, "%d,\t/* nbAllocVectors */\n", mode->nbAllocVectors);
+      if (standard)
+         fprintf(file, "band_allocation,\t/* allocVectors */\n");
+      else
+         fprintf(file, "allocVectors%d_%d,\t/* allocVectors */\n", mode->Fs, mdctSize);
+
+      fprintf(file, "logN%d,\t/* logN */\n", framerate);
+      fprintf(file, "window%d,\t/* window */\n", mode->overlap);
+      fprintf(file, "{%d, %d, {", mode->mdct.n, mode->mdct.maxshift);
+      for (k=0;k<=mode->mdct.maxshift;k++)
+         fprintf(file, "&fft_state%d_%d_%d, ", mode->Fs, mdctSize, k);
+      fprintf (file, "}, mdct_twiddles%d},\t/* mdct */\n", mdctSize);
+
+      fprintf(file, "{%d, cache_index%d, cache_bits%d, cache_caps%d},\t/* cache */\n",
+            mode->cache.size, mode->Fs/mdctSize, mode->Fs/mdctSize, mode->Fs/mdctSize);
+      fprintf(file, "};\n");
+   }
+   fprintf(file, "\n");
+   fprintf(file, "/* List of all the available modes */\n");
+   fprintf(file, "#define TOTAL_MODES %d\n", nb_modes);
+   fprintf(file, "static const CELTMode * const static_mode_list[TOTAL_MODES] = {\n");
+   for (i=0;i<nb_modes;i++)
+   {
+      CELTMode *mode = modes[i];
+      int mdctSize;
+      mdctSize = mode->shortMdctSize*mode->nbShortMdcts;
+      fprintf(file, "&mode%d_%d_%d,\n", mode->Fs, mdctSize, mode->overlap);
+   }
+   fprintf(file, "};\n");
+}
+
+void dump_header(FILE *file, CELTMode **modes, int nb_modes)
+{
+   int i;
+   int channels = 0;
+   int frame_size = 0;
+   int overlap = 0;
+   fprintf (file, "/* This header file is generated automatically*/\n");
+   for (i=0;i<nb_modes;i++)
+   {
+      CELTMode *mode = modes[i];
+      if (frame_size==0)
+         frame_size = mode->shortMdctSize*mode->nbShortMdcts;
+      else if (frame_size != mode->shortMdctSize*mode->nbShortMdcts)
+         frame_size = -1;
+      if (overlap==0)
+         overlap = mode->overlap;
+      else if (overlap != mode->overlap)
+         overlap = -1;
+   }
+   if (channels>0)
+   {
+      fprintf (file, "#define CHANNELS(mode) %d\n", channels);
+      if (channels==1)
+         fprintf (file, "#define DISABLE_STEREO\n");
+   }
+   if (frame_size>0)
+   {
+      fprintf (file, "#define FRAMESIZE(mode) %d\n", frame_size);
+   }
+   if (overlap>0)
+   {
+      fprintf (file, "#define OVERLAP(mode) %d\n", overlap);
+   }
+}
+
+#ifdef FIXED_POINT
+#define BASENAME "static_modes_fixed"
+#else
+#define BASENAME "static_modes_float"
+#endif
+
+int main(int argc, char **argv)
+{
+   int i, nb;
+   FILE *file;
+   CELTMode **m;
+   if (argc%2 != 1 || argc<3)
+   {
+      fprintf (stderr, "Usage: %s rate frame_size [rate frame_size] [rate frame_size]...\n",argv[0]);
+      return 1;
+   }
+   nb = (argc-1)/2;
+   m = malloc(nb*sizeof(CELTMode*));
+   for (i=0;i<nb;i++)
+   {
+      int Fs, frame;
+      Fs      = atoi(argv[2*i+1]);
+      frame   = atoi(argv[2*i+2]);
+      m[i] = opus_custom_mode_create(Fs, frame, NULL);
+      if (m[i]==NULL)
+      {
+         fprintf(stderr,"Error creating mode with Fs=%s, frame_size=%s\n",
+               argv[2*i+1],argv[2*i+2]);
+         return EXIT_FAILURE;
+      }
+   }
+   file = fopen(BASENAME ".h", "w");
+   dump_modes(file, m, nb);
+   fclose(file);
+   for (i=0;i<nb;i++)
+      opus_custom_mode_destroy(m[i]);
+   free(m);
+   return 0;
+}
diff --git a/celt/entdec.c b/celt/entdec.c
index 75e3e45..3c26468 100644
--- a/celt/entdec.c
+++ b/celt/entdec.c
@@ -85,7 +85,7 @@
    number=3,
    pages="256--294",
    month=Jul,
-   URL="http://www.stanford.edu/class/ee398/handouts/papers/Moffat98ArithmCoding.pdf"
+   URL="http://www.stanford.edu/class/ee398a/handouts/papers/Moffat98ArithmCoding.pdf"
   }*/
 
 static int ec_read_byte(ec_dec *_this){
diff --git a/celt/fixed_c5x.h b/celt/fixed_c5x.h
new file mode 100644
index 0000000..156ba45
--- /dev/null
+++ b/celt/fixed_c5x.h
@@ -0,0 +1,79 @@
+/* Copyright (C) 2003 Jean-Marc Valin */
+/**
+   @file fixed_c5x.h
+   @brief Fixed-point operations for the TI C5x DSP family
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_C5X_H
+#define FIXED_C5X_H
+
+#include "dsplib.h"
+
+#undef IMUL32
+static inline long IMUL32(long i, long j)
+{
+   long ac0, ac1;
+   ac0 = _lmpy(i>>16,j);
+   ac1 = ac0 + _lmpy(i,j>>16);
+   return _lmpyu(i,j) + (ac1<<16);
+}
+
+#undef MAX16
+#define MAX16(a,b) _max(a,b)
+
+#undef MIN16
+#define MIN16(a,b) _min(a,b)
+
+#undef MAX32
+#define MAX32(a,b) _lmax(a,b)
+
+#undef MIN32
+#define MIN32(a,b) _lmin(a,b)
+
+#undef VSHR32
+#define VSHR32(a, shift) _lshl(a,-(shift))
+
+#undef MULT16_16_Q15
+#define MULT16_16_Q15(a,b) (_smpy(a,b))
+
+#undef MULT16_16SU
+#define MULT16_16SU(a,b) _lmpysu(a,b)
+
+#undef MULT_16_16
+#define MULT_16_16(a,b) _lmpy(a,b)
+
+/* FIXME: This is technically incorrect and is bound to cause problems. Is there any cleaner solution? */
+#undef MULT16_32_Q15
+#define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),(b)),15))
+
+#define celt_ilog2(x) (30 - _lnorm(x))
+#define OVERRIDE_CELT_ILOG2
+
+#define celt_maxabs16(x, len) MAX32(EXTEND32(maxval((DATA *)x, len)),-EXTEND32(minval((DATA *)x, len)))
+#define OVERRIDE_CELT_MAXABS16
+
+#endif /* FIXED_C5X_H */
diff --git a/celt/fixed_c6x.h b/celt/fixed_c6x.h
new file mode 100644
index 0000000..bb6ad92
--- /dev/null
+++ b/celt/fixed_c6x.h
@@ -0,0 +1,70 @@
+/* Copyright (C) 2008 CSIRO */
+/**
+   @file fixed_c6x.h
+   @brief Fixed-point operations for the TI C6x DSP family
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_C6X_H
+#define FIXED_C6X_H
+
+#undef MULT16_16SU
+#define MULT16_16SU(a,b) _mpysu(a,b)
+
+#undef MULT_16_16
+#define MULT_16_16(a,b) _mpy(a,b)
+
+#define celt_ilog2(x) (30 - _norm(x))
+#define OVERRIDE_CELT_ILOG2
+
+#undef MULT16_32_Q15
+#define MULT16_32_Q15(a,b) (_mpylill(a, b) >> 15)
+
+#if 0
+#include "dsplib.h"
+
+#undef MAX16
+#define MAX16(a,b) _max(a,b)
+
+#undef MIN16
+#define MIN16(a,b) _min(a,b)
+
+#undef MAX32
+#define MAX32(a,b) _lmax(a,b)
+
+#undef MIN32
+#define MIN32(a,b) _lmin(a,b)
+
+#undef VSHR32
+#define VSHR32(a, shift) _lshl(a,-(shift))
+
+#undef MULT16_16_Q15
+#define MULT16_16_Q15(a,b) (_smpy(a,b))
+
+#define celt_maxabs16(x, len) MAX32(EXTEND32(maxval((DATA *)x, len)),-EXTEND32(minval((DATA *)x, len)))
+#define OVERRIDE_CELT_MAXABS16
+
+#endif /* FIXED_C6X_H */
diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h
index 71e28d6..657e67c 100644
--- a/celt/fixed_generic.h
+++ b/celt/fixed_generic.h
@@ -84,6 +84,8 @@
 #define PSHR(a,shift) (SHR((a)+((EXTEND32(1)<<((shift))>>1)),shift))
 #define SATURATE(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x)))
 
+#define SATURATE16(x) (EXTRACT16((x)>32767 ? 32767 : (x)<-32768 ? -32768 : (x)))
+
 /** Shift by a and round-to-neareast 32-bit value. Result is a 16-bit value */
 #define ROUND16(x,a) (EXTRACT16(PSHR32((x),(a))))
 /** Divide by two */
@@ -108,10 +110,13 @@
 
 /** 16x16 multiply-add where the result fits in 32 bits */
 #define MAC16_16(c,a,b) (ADD32((c),MULT16_16((a),(b))))
-/** 16x32 multiply-add, followed by a 15-bit shift right. Results fits in 32 bits */
+/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
+    b must fit in 31 bits.
+    Result fits in 32 bits. */
 #define MAC16_32_Q15(c,a,b) ADD32(c,ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))
 
 #define MULT16_16_Q11_32(a,b) (SHR(MULT16_16((a),(b)),11))
+#define MULT16_16_Q11(a,b) (SHR(MULT16_16((a),(b)),11))
 #define MULT16_16_Q13(a,b) (SHR(MULT16_16((a),(b)),13))
 #define MULT16_16_Q14(a,b) (SHR(MULT16_16((a),(b)),14))
 #define MULT16_16_Q15(a,b) (SHR(MULT16_16((a),(b)),15))
diff --git a/celt/kiss_fft.c b/celt/kiss_fft.c
index dcd6968..ad706c7 100644
--- a/celt/kiss_fft.c
+++ b/celt/kiss_fft.c
@@ -40,7 +40,6 @@
 #include "os_support.h"
 #include "mathops.h"
 #include "stack_alloc.h"
-#include "os_support.h"
 
 /* The guts header contains all the multiplication and addition macros that are defined for
    complex numbers.  It also delares the kf_ internal functions.
@@ -142,8 +141,6 @@ static void kf_bfly4(
          C_ADDTO(*Fout, scratch[1]);
          C_ADD( scratch[3] , scratch[0] , scratch[2] );
          C_SUB( scratch[4] , scratch[0] , scratch[2] );
-         Fout[m2].r = PSHR32(Fout[m2].r, 2);
-         Fout[m2].i = PSHR32(Fout[m2].i, 2);
          C_SUB( Fout[m2], *Fout, scratch[3] );
          tw1 += fstride;
          tw2 += fstride*2;
diff --git a/celt/mathops.c b/celt/mathops.c
index ce472c9..21fd942 100644
--- a/celt/mathops.c
+++ b/celt/mathops.c
@@ -123,6 +123,8 @@ opus_val32 celt_sqrt(opus_val32 x)
    static const opus_val16 C[5] = {23175, 11561, -3011, 1699, -664};
    if (x==0)
       return 0;
+   else if (x>=1073741824)
+      return 32767;
    k = (celt_ilog2(x)>>1)-7;
    x = VSHR32(x, 2*k);
    n = x-32768;
diff --git a/celt/mathops.h b/celt/mathops.h
index 4e97795..7e7d906 100644
--- a/celt/mathops.h
+++ b/celt/mathops.h
@@ -43,6 +43,41 @@
 
 unsigned isqrt32(opus_uint32 _val);
 
+#ifndef OVERRIDE_CELT_MAXABS16
+static inline opus_val32 celt_maxabs16(const opus_val16 *x, int len)
+{
+   int i;
+   opus_val16 maxval = 0;
+   opus_val16 minval = 0;
+   for (i=0;i<len;i++)
+   {
+      maxval = MAX16(maxval, x[i]);
+      minval = MIN16(minval, x[i]);
+   }
+   return MAX32(EXTEND32(maxval),-EXTEND32(minval));
+}
+#endif
+
+#ifndef OVERRIDE_CELT_MAXABS32
+#ifdef FIXED_POINT
+static inline opus_val32 celt_maxabs32(const opus_val32 *x, int len)
+{
+   int i;
+   opus_val32 maxval = 0;
+   opus_val32 minval = 0;
+   for (i=0;i<len;i++)
+   {
+      maxval = MAX32(maxval, x[i]);
+      minval = MIN32(minval, x[i]);
+   }
+   return MAX32(maxval, -minval);
+}
+#else
+#define celt_maxabs32(x,len) celt_maxabs16(x,len)
+#endif
+#endif
+
+
 #ifndef FIXED_POINT
 
 #define PI 3.141592653f
@@ -117,27 +152,6 @@ static inline opus_int16 celt_ilog2(opus_int32 x)
 }
 #endif
 
-#ifndef OVERRIDE_CELT_MAXABS16
-static inline opus_val16 celt_maxabs16(opus_val16 *x, int len)
-{
-   int i;
-   opus_val16 maxval = 0;
-   for (i=0;i<len;i++)
-      maxval = MAX16(maxval, ABS16(x[i]));
-   return maxval;
-}
-#endif
-
-#ifndef OVERRIDE_CELT_MAXABS32
-static inline opus_val32 celt_maxabs32(opus_val32 *x, int len)
-{
-   int i;
-   opus_val32 maxval = 0;
-   for (i=0;i<len;i++)
-      maxval = MAX32(maxval, ABS32(x[i]));
-   return maxval;
-}
-#endif
 
 /** Integer log in base2. Defined for zero, but not for negative numbers */
 static inline opus_int16 celt_zlog2(opus_val32 x)
@@ -151,6 +165,7 @@ opus_val32 celt_sqrt(opus_val32 x);
 
 opus_val16 celt_cos_norm(opus_val32 x);
 
+/** Base-2 logarithm approximation (log2(x)). (Q14 input, Q10 output) */
 static inline opus_val16 celt_log2(opus_val32 x)
 {
    int i;
@@ -176,6 +191,13 @@ static inline opus_val16 celt_log2(opus_val32 x)
 #define D1 22804
 #define D2 14819
 #define D3 10204
+
+static inline opus_val32 celt_exp2_frac(opus_val16 x)
+{
+   opus_val16 frac;
+   frac = SHL16(x, 4);
+   return ADD16(D0, MULT16_16_Q15(frac, ADD16(D1, MULT16_16_Q15(frac, ADD16(D2 , MULT16_16_Q15(D3,frac))))));
+}
 /** Base-2 exponential approximation (2^x). (Q10 input, Q16 output) */
 static inline opus_val32 celt_exp2(opus_val16 x)
 {
@@ -186,8 +208,7 @@ static inline opus_val32 celt_exp2(opus_val16 x)
       return 0x7f000000;
    else if (integer < -15)
       return 0;
-   frac = SHL16(x-SHL16(integer,10),4);
-   frac = ADD16(D0, MULT16_16_Q15(frac, ADD16(D1, MULT16_16_Q15(frac, ADD16(D2 , MULT16_16_Q15(D3,frac))))));
+   frac = celt_exp2_frac(x-SHL16(integer,10));
    return VSHR32(EXTEND32(frac), -integer-2);
 }
 
diff --git a/celt/mdct.c b/celt/mdct.c
index 16a36c6..90a214a 100644
--- a/celt/mdct.c
+++ b/celt/mdct.c
@@ -109,12 +109,14 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
    int N, N2, N4;
    kiss_twiddle_scalar sine;
    VARDECL(kiss_fft_scalar, f);
+   VARDECL(kiss_fft_scalar, f2);
    SAVE_STACK;
    N = l->n;
    N >>= shift;
    N2 = N>>1;
    N4 = N>>2;
    ALLOC(f, N2, kiss_fft_scalar);
+   ALLOC(f2, N2, kiss_fft_scalar);
    /* sin(x) ~= x here */
 #ifdef FIXED_POINT
    sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
@@ -131,7 +133,7 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
       kiss_fft_scalar * OPUS_RESTRICT yp = f;
       const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
       const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
-      for(i=0;i<(overlap>>2);i++)
+      for(i=0;i<((overlap+3)>>2);i++)
       {
          /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
          *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
@@ -143,7 +145,7 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
       }
       wp1 = window;
       wp2 = window+overlap-1;
-      for(;i<N4-(overlap>>2);i++)
+      for(;i<N4-((overlap+3)>>2);i++)
       {
          /* Real part arranged as a-bR, Imag part arranged as -c-dR */
          *yp++ = *xp2;
@@ -180,12 +182,12 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
    }
 
    /* N/4 complex FFT, down-scales by 4/N */
-   opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)in);
+   opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2);
 
    /* Post-rotate */
    {
       /* Temp pointers to make it really clear to the compiler what we're doing */
-      const kiss_fft_scalar * OPUS_RESTRICT fp = in;
+      const kiss_fft_scalar * OPUS_RESTRICT fp = f2;
       kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
       kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
       const kiss_twiddle_scalar *t = &l->trig[0];
@@ -212,14 +214,12 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
    int i;
    int N, N2, N4;
    kiss_twiddle_scalar sine;
-   VARDECL(kiss_fft_scalar, f);
    VARDECL(kiss_fft_scalar, f2);
    SAVE_STACK;
    N = l->n;
    N >>= shift;
    N2 = N>>1;
    N4 = N>>2;
-   ALLOC(f, N2, kiss_fft_scalar);
    ALLOC(f2, N2, kiss_fft_scalar);
    /* sin(x) ~= x here */
 #ifdef FIXED_POINT
@@ -249,81 +249,60 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
    }
 
    /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
-   opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)f);
+   opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
 
-   /* Post-rotate */
+   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
+      it in-place. */
    {
-      kiss_fft_scalar * OPUS_RESTRICT fp = f;
+      kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
       const kiss_twiddle_scalar *t = &l->trig[0];
-
-      for(i=0;i<N4;i++)
+      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
+         middle pair will be computed twice. */
+      for(i=0;i<(N4+1)>>1;i++)
       {
          kiss_fft_scalar re, im, yr, yi;
-         re = fp[0];
-         im = fp[1];
+         kiss_twiddle_scalar t0, t1;
+         re = yp0[0];
+         im = yp0[1];
+         t0 = t[i<<shift];
+         t1 = t[(N4-i)<<shift];
          /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL(re,t[i<<shift]) - S_MUL(im,t[(N4-i)<<shift]);
-         yi = S_MUL(im,t[i<<shift]) + S_MUL(re,t[(N4-i)<<shift]);
+         yr = S_MUL(re,t0) - S_MUL(im,t1);
+         yi = S_MUL(im,t0) + S_MUL(re,t1);
+         re = yp1[0];
+         im = yp1[1];
          /* works because the cos is nearly one */
-         *fp++ = yr - S_MUL(yi,sine);
-         *fp++ = yi + S_MUL(yr,sine);
-      }
-   }
-   /* De-shuffle the components for the middle of the window only */
-   {
-      const kiss_fft_scalar * OPUS_RESTRICT fp1 = f;
-      const kiss_fft_scalar * OPUS_RESTRICT fp2 = f+N2-1;
-      kiss_fft_scalar * OPUS_RESTRICT yp = f2;
-      for(i = 0; i < N4; i++)
-      {
-         *yp++ =-*fp1;
-         *yp++ = *fp2;
-         fp1 += 2;
-         fp2 -= 2;
+         yp0[0] = -(yr - S_MUL(yi,sine));
+         yp1[1] = yi + S_MUL(yr,sine);
+
+         t0 = t[(N4-i-1)<<shift];
+         t1 = t[(i+1)<<shift];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = S_MUL(re,t0) - S_MUL(im,t1);
+         yi = S_MUL(im,t0) + S_MUL(re,t1);
+         /* works because the cos is nearly one */
+         yp1[0] = -(yr - S_MUL(yi,sine));
+         yp0[1] = yi + S_MUL(yr,sine);
+         yp0 += 2;
+         yp1 -= 2;
       }
    }
-   out -= (N2-overlap)>>1;
+
    /* Mirror on both sides for TDAC */
    {
-      kiss_fft_scalar * OPUS_RESTRICT fp1 = f2+N4-1;
-      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+N2-1;
-      kiss_fft_scalar * OPUS_RESTRICT yp1 = out+N4-overlap/2;
-      const opus_val16 * OPUS_RESTRICT wp1 = window;
-      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
-      for(i = 0; i< N4-overlap/2; i++)
-      {
-         *xp1 = *fp1;
-         xp1--;
-         fp1--;
-      }
-      for(; i < N4; i++)
-      {
-         kiss_fft_scalar x1;
-         x1 = *fp1--;
-         *yp1++ +=-MULT16_32_Q15(*wp1, x1);
-         *xp1-- += MULT16_32_Q15(*wp2, x1);
-         wp1++;
-         wp2--;
-      }
-   }
-   {
-      kiss_fft_scalar * OPUS_RESTRICT fp2 = f2+N4;
-      kiss_fft_scalar * OPUS_RESTRICT xp2 = out+N2;
-      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+N-1-(N4-overlap/2);
+      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
       const opus_val16 * OPUS_RESTRICT wp1 = window;
       const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
-      for(i = 0; i< N4-overlap/2; i++)
-      {
-         *xp2 = *fp2;
-         xp2++;
-         fp2++;
-      }
-      for(; i < N4; i++)
+
+      for(i = 0; i < overlap/2; i++)
       {
-         kiss_fft_scalar x2;
-         x2 = *fp2++;
-         *yp2--  = MULT16_32_Q15(*wp1, x2);
-         *xp2++  = MULT16_32_Q15(*wp2, x2);
+         kiss_fft_scalar x1, x2;
+         x1 = *xp1;
+         x2 = *yp1;
+         *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
+         *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
          wp1++;
          wp2--;
       }
diff --git a/celt/modes.c b/celt/modes.c
index ed204d7..42e68e1 100644
--- a/celt/modes.c
+++ b/celt/modes.c
@@ -345,6 +345,14 @@ CELTMode *opus_custom_mode_create(opus_int32 Fs, int frame_size, int *error)
    mode->eBands = compute_ebands(Fs, mode->shortMdctSize, res, &mode->nbEBands);
    if (mode->eBands==NULL)
       goto failure;
+#if !defined(SMALL_FOOTPRINT)
+   /* Make sure we don't allocate a band larger than our PVQ table.
+      208 should be enough, but let's be paranoid. */
+   if ((mode->eBands[mode->nbEBands] - mode->eBands[mode->nbEBands-1])<<LM >
+    208) {
+       goto failure;
+   }
+#endif
 
    mode->effEBands = mode->nbEBands;
    while (mode->eBands[mode->effEBands] > mode->shortMdctSize)
diff --git a/celt/pitch.c b/celt/pitch.c
index ca0f523..0352b30 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -102,13 +102,57 @@ static void find_best_pitch(opus_val32 *xcorr, opus_val16 *y, int len,
    }
 }
 
+static void celt_fir5(const opus_val16 *x,
+         const opus_val16 *num,
+         opus_val16 *y,
+         int N,
+         opus_val16 *mem)
+{
+   int i;
+   opus_val16 num0, num1, num2, num3, num4;
+   opus_val32 mem0, mem1, mem2, mem3, mem4;
+   num0=num[0];
+   num1=num[1];
+   num2=num[2];
+   num3=num[3];
+   num4=num[4];
+   mem0=mem[0];
+   mem1=mem[1];
+   mem2=mem[2];
+   mem3=mem[3];
+   mem4=mem[4];
+   for (i=0;i<N;i++)
+   {
+      opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
+      sum = MAC16_16(sum,num0,mem0);
+      sum = MAC16_16(sum,num1,mem1);
+      sum = MAC16_16(sum,num2,mem2);
+      sum = MAC16_16(sum,num3,mem3);
+      sum = MAC16_16(sum,num4,mem4);
+      mem4 = mem3;
+      mem3 = mem2;
+      mem2 = mem1;
+      mem1 = mem0;
+      mem0 = x[i];
+      y[i] = ROUND16(sum, SIG_SHIFT);
+   }
+   mem[0]=mem0;
+   mem[1]=mem1;
+   mem[2]=mem2;
+   mem[3]=mem3;
+   mem[4]=mem4;
+}
+
+
 void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,
       int len, int C)
 {
    int i;
    opus_val32 ac[5];
    opus_val16 tmp=Q15ONE;
-   opus_val16 lpc[4], mem[4]={0,0,0,0};
+   opus_val16 lpc[4], mem[5]={0,0,0,0,0};
+   opus_val16 lpc2[5];
+   opus_val16 c1 = QCONST16(.8f,15);
 #ifdef FIXED_POINT
    int shift;
    opus_val32 maxabs = celt_maxabs32(x[0], len);
@@ -161,14 +205,89 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x
       tmp = MULT16_16_Q15(QCONST16(.9f,15), tmp);
       lpc[i] = MULT16_16_Q15(lpc[i], tmp);
    }
-   celt_fir(x_lp, lpc, x_lp, len>>1, 4, mem);
+   /* Add a zero */
+   lpc2[0] = lpc[0] + QCONST16(.8f,SIG_SHIFT);
+   lpc2[1] = lpc[1] + MULT16_16_Q15(c1,lpc[0]);
+   lpc2[2] = lpc[2] + MULT16_16_Q15(c1,lpc[1]);
+   lpc2[3] = lpc[3] + MULT16_16_Q15(c1,lpc[2]);
+   lpc2[4] = MULT16_16_Q15(c1,lpc[3]);
+   celt_fir5(x_lp, lpc2, x_lp, len>>1, mem);
+}
 
-   mem[0]=0;
-   lpc[0]=QCONST16(.8f,12);
-   celt_fir(x_lp, lpc, x_lp, len>>1, 1, mem);
+#if 0 /* This is a simple version of the pitch correlation that should work
+         well on DSPs like Blackfin and TI C5x/C6x */
 
+#ifdef FIXED_POINT
+opus_val32
+#else
+void
+#endif
+celt_pitch_xcorr(opus_val16 *x, opus_val16 *y, opus_val32 *xcorr, int len, int max_pitch)
+{
+   int i, j;
+#ifdef FIXED_POINT
+   opus_val32 maxcorr=1;
+#endif
+   for (i=0;i<max_pitch;i++)
+   {
+      opus_val32 sum = 0;
+      for (j=0;j<len;j++)
+         sum = MAC16_16(sum, x[j],y[i+j]);
+      xcorr[i] = sum;
+#ifdef FIXED_POINT
+      maxcorr = MAX32(maxcorr, sum);
+#endif
+   }
+#ifdef FIXED_POINT
+   return maxcorr;
+#endif
 }
 
+#else /* Unrolled version of the pitch correlation -- runs faster on x86 and ARM */
+
+#ifdef FIXED_POINT
+opus_val32
+#else
+void
+#endif
+celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
+{
+   int i,j;
+#ifdef FIXED_POINT
+   opus_val32 maxcorr=1;
+#endif
+   for (i=0;i<max_pitch-3;i+=4)
+   {
+      opus_val32 sum[4]={0,0,0,0};
+      xcorr_kernel(_x, _y+i, sum, len);
+      xcorr[i]=sum[0];
+      xcorr[i+1]=sum[1];
+      xcorr[i+2]=sum[2];
+      xcorr[i+3]=sum[3];
+#ifdef FIXED_POINT
+      sum[0] = MAX32(sum[0], sum[1]);
+      sum[2] = MAX32(sum[2], sum[3]);
+      sum[0] = MAX32(sum[0], sum[2]);
+      maxcorr = MAX32(maxcorr, sum[0]);
+#endif
+   }
+   /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
+   for (;i<max_pitch;i++)
+   {
+      opus_val32 sum = 0;
+      for (j=0;j<len;j++)
+         sum = MAC16_16(sum, _x[j],_y[i+j]);
+      xcorr[i] = sum;
+#ifdef FIXED_POINT
+      maxcorr = MAX32(maxcorr, sum);
+#endif
+   }
+#ifdef FIXED_POINT
+   return maxcorr;
+#endif
+}
+
+#endif
 void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y,
                   int len, int max_pitch, int *pitch)
 {
@@ -179,8 +298,8 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
    VARDECL(opus_val16, y_lp4);
    VARDECL(opus_val32, xcorr);
 #ifdef FIXED_POINT
-   opus_val32 maxcorr=1;
-   opus_val16 xmax, ymax;
+   opus_val32 maxcorr;
+   opus_val32 xmax, ymax;
    int shift=0;
 #endif
    int offset;
@@ -204,7 +323,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
 #ifdef FIXED_POINT
    xmax = celt_maxabs16(x_lp4, len>>2);
    ymax = celt_maxabs16(y_lp4, lag>>2);
-   shift = celt_ilog2(MAX16(1, MAX16(xmax, ymax)))-11;
+   shift = celt_ilog2(MAX32(1, MAX32(xmax, ymax)))-11;
    if (shift>0)
    {
       for (j=0;j<len>>2;j++)
@@ -220,16 +339,11 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
 
    /* Coarse search with 4x decimation */
 
-   for (i=0;i<max_pitch>>2;i++)
-   {
-      opus_val32 sum = 0;
-      for (j=0;j<len>>2;j++)
-         sum = MAC16_16(sum, x_lp4[j],y_lp4[i+j]);
-      xcorr[i] = MAX32(-1, sum);
 #ifdef FIXED_POINT
-      maxcorr = MAX32(maxcorr, sum);
+   maxcorr =
 #endif
-   }
+   celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2);
+
    find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch
 #ifdef FIXED_POINT
                    , 0, maxcorr
@@ -287,11 +401,13 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
    int k, i, T, T0;
    opus_val16 g, g0;
    opus_val16 pg;
-   opus_val32 xy,xx,yy;
+   opus_val32 xy,xx,yy,xy2;
    opus_val32 xcorr[3];
    opus_val32 best_xy, best_yy;
    int offset;
    int minperiod0;
+   VARDECL(opus_val32, yy_lookup);
+   SAVE_STACK;
 
    minperiod0 = minperiod;
    maxperiod /= 2;
@@ -304,13 +420,16 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
       *T0_=maxperiod-1;
 
    T = T0 = *T0_;
-   xx=xy=yy=0;
-   for (i=0;i<N;i++)
+   ALLOC(yy_lookup, maxperiod+1, opus_val32);
+   dual_inner_prod(x, x, x-T0, N, &xx, &xy);
+   yy_lookup[0] = xx;
+   yy=xx;
+   for (i=1;i<=maxperiod;i++)
    {
-      xy = MAC16_16(xy, x[i], x[i-T0]);
-      xx = MAC16_16(xx, x[i], x[i]);
-      yy = MAC16_16(yy, x[i-T0],x[i-T0]);
+      yy = yy+MULT16_16(x[-i],x[-i])-MULT16_16(x[N-i],x[N-i]);
+      yy_lookup[i] = MAX32(0, yy);
    }
+   yy = yy_lookup[T0];
    best_xy = xy;
    best_yy = yy;
 #ifdef FIXED_POINT
@@ -331,6 +450,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
       int T1, T1b;
       opus_val16 g1;
       opus_val16 cont=0;
+      opus_val16 thresh;
       T1 = (2*T0+k)/(2*k);
       if (T1 < minperiod)
          break;
@@ -345,15 +465,9 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
       {
          T1b = (2*second_check[k]*T0+k)/(2*k);
       }
-      xy=yy=0;
-      for (i=0;i<N;i++)
-      {
-         xy = MAC16_16(xy, x[i], x[i-T1]);
-         yy = MAC16_16(yy, x[i-T1], x[i-T1]);
-
-         xy = MAC16_16(xy, x[i], x[i-T1b]);
-         yy = MAC16_16(yy, x[i-T1b], x[i-T1b]);
-      }
+      dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
+      xy += xy2;
+      yy = yy_lookup[T1] + yy_lookup[T1b];
 #ifdef FIXED_POINT
       {
          opus_val32 x2y2;
@@ -372,7 +486,14 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
          cont = HALF32(prev_gain);
       else
          cont = 0;
-      if (g1 > QCONST16(.3f,15) + MULT16_16_Q15(QCONST16(.4f,15),g0)-cont)
+      thresh = MAX16(QCONST16(.3f,15), MULT16_16_Q15(QCONST16(.7f,15),g0)-cont);
+      /* Bias against very high pitch (very short period) to avoid false-positives
+         due to short-term correlation */
+      if (T1<3*minperiod)
+         thresh = MAX16(QCONST16(.4f,15), MULT16_16_Q15(QCONST16(.85f,15),g0)-cont);
+      else if (T1<2*minperiod)
+         thresh = MAX16(QCONST16(.5f,15), MULT16_16_Q15(QCONST16(.9f,15),g0)-cont);
+      if (g1 > thresh)
       {
          best_xy = xy;
          best_yy = yy;
@@ -406,5 +527,6 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
 
    if (*T0_<minperiod0)
       *T0_=minperiod0;
+   RESTORE_STACK;
    return pg;
 }
diff --git a/celt/pitch.h b/celt/pitch.h
index 2757071..caffd24 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -36,6 +36,10 @@
 
 #include "modes.h"
 
+#if defined(__SSE__) && !defined(FIXED_POINT)
+#include "x86/pitch_sse.h"
+#endif
+
 void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,
       int len, int C);
 
@@ -45,4 +49,97 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
 opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
       int N, int *T0, int prev_period, opus_val16 prev_gain);
 
+/* OPT: This is the kernel you really want to optimize. It gets used a lot
+   by the prefilter and by the PLC. */
+#ifndef OVERRIDE_XCORR_KERNEL
+static inline void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
+{
+   int j;
+   opus_val16 y_0, y_1, y_2, y_3;
+   y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */
+   y_0=*y++;
+   y_1=*y++;
+   y_2=*y++;
+   for (j=0;j<len-3;j+=4)
+   {
+      opus_val16 tmp;
+      tmp = *x++;
+      y_3=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_0);
+      sum[1] = MAC16_16(sum[1],tmp,y_1);
+      sum[2] = MAC16_16(sum[2],tmp,y_2);
+      sum[3] = MAC16_16(sum[3],tmp,y_3);
+      tmp=*x++;
+      y_0=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_1);
+      sum[1] = MAC16_16(sum[1],tmp,y_2);
+      sum[2] = MAC16_16(sum[2],tmp,y_3);
+      sum[3] = MAC16_16(sum[3],tmp,y_0);
+      tmp=*x++;
+      y_1=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_2);
+      sum[1] = MAC16_16(sum[1],tmp,y_3);
+      sum[2] = MAC16_16(sum[2],tmp,y_0);
+      sum[3] = MAC16_16(sum[3],tmp,y_1);
+      tmp=*x++;
+      y_2=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_3);
+      sum[1] = MAC16_16(sum[1],tmp,y_0);
+      sum[2] = MAC16_16(sum[2],tmp,y_1);
+      sum[3] = MAC16_16(sum[3],tmp,y_2);
+   }
+   if (j++<len)
+   {
+      opus_val16 tmp = *x++;
+      y_3=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_0);
+      sum[1] = MAC16_16(sum[1],tmp,y_1);
+      sum[2] = MAC16_16(sum[2],tmp,y_2);
+      sum[3] = MAC16_16(sum[3],tmp,y_3);
+   }
+   if (j++<len)
+   {
+      opus_val16 tmp=*x++;
+      y_0=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_1);
+      sum[1] = MAC16_16(sum[1],tmp,y_2);
+      sum[2] = MAC16_16(sum[2],tmp,y_3);
+      sum[3] = MAC16_16(sum[3],tmp,y_0);
+   }
+   if (j<len)
+   {
+      opus_val16 tmp=*x++;
+      y_1=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_2);
+      sum[1] = MAC16_16(sum[1],tmp,y_3);
+      sum[2] = MAC16_16(sum[2],tmp,y_0);
+      sum[3] = MAC16_16(sum[3],tmp,y_1);
+   }
+}
+#endif /* OVERRIDE_XCORR_KERNEL */
+
+#ifndef OVERRIDE_DUAL_INNER_PROD
+static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   opus_val32 xy01=0;
+   opus_val32 xy02=0;
+   for (i=0;i<N;i++)
+   {
+      xy01 = MAC16_16(xy01, x[i], y01[i]);
+      xy02 = MAC16_16(xy02, x[i], y02[i]);
+   }
+   *xy1 = xy01;
+   *xy2 = xy02;
+}
+#endif
+
+#ifdef FIXED_POINT
+opus_val32
+#else
+void
+#endif
+celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch);
+
 #endif
diff --git a/celt/quant_bands.c b/celt/quant_bands.c
index 66f1f5f..79685e1 100644
--- a/celt/quant_bands.c
+++ b/celt/quant_bands.c
@@ -40,8 +40,8 @@
 #include "rate.h"
 
 #ifdef FIXED_POINT
-/* Mean energy in each band quantized in Q6 */
-static const signed char eMeans[25] = {
+/* Mean energy in each band quantized in Q4 */
+const signed char eMeans[25] = {
       103,100, 92, 85, 81,
        77, 72, 70, 78, 75,
        73, 71, 78, 74, 69,
@@ -49,8 +49,8 @@ static const signed char eMeans[25] = {
        60, 60, 60, 60, 60
 };
 #else
-/* Mean energy in each band quantized in Q6 and converted back to float */
-static const opus_val16 eMeans[25] = {
+/* Mean energy in each band quantized in Q4 and converted back to float */
+const opus_val16 eMeans[25] = {
       6.437500f, 6.250000f, 5.750000f, 5.312500f, 5.062500f,
       4.812500f, 4.500000f, 4.375000f, 4.875000f, 4.687500f,
       4.562500f, 4.437500f, 4.875000f, 4.625000f, 4.312500f,
@@ -157,7 +157,7 @@ static int quant_coarse_energy_impl(const CELTMode *m, int start, int end,
       const opus_val16 *eBands, opus_val16 *oldEBands,
       opus_int32 budget, opus_int32 tell,
       const unsigned char *prob_model, opus_val16 *error, ec_enc *enc,
-      int C, int LM, int intra, opus_val16 max_decay)
+      int C, int LM, int intra, opus_val16 max_decay, int lfe)
 {
    int i, c;
    int badness = 0;
@@ -222,6 +222,8 @@ static int quant_coarse_energy_impl(const CELTMode *m, int start, int end,
             if (bits_left < 16)
                qi = IMAX(-1, qi);
          }
+         if (lfe && i>=2)
+            qi = IMIN(qi, 0);
          if (budget-tell >= 15)
          {
             int pi;
@@ -253,13 +255,13 @@ static int quant_coarse_energy_impl(const CELTMode *m, int start, int end,
          prev[c] = prev[c] + SHL32(q,7) - MULT16_16(beta,PSHR32(q,8));
       } while (++c < C);
    }
-   return badness;
+   return lfe ? 0 : badness;
 }
 
 void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd,
       const opus_val16 *eBands, opus_val16 *oldEBands, opus_uint32 budget,
       opus_val16 *error, ec_enc *enc, int C, int LM, int nbAvailableBytes,
-      int force_intra, opus_val32 *delayedIntra, int two_pass, int loss_rate)
+      int force_intra, opus_val32 *delayedIntra, int two_pass, int loss_rate, int lfe)
 {
    int intra;
    opus_val16 max_decay;
@@ -280,9 +282,6 @@ void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd,
    if (tell+3 > budget)
       two_pass = intra = 0;
 
-   /* Encode the global flags using a simple probability model
-      (first symbols in the stream) */
-
    max_decay = QCONST16(16.f,DB_SHIFT);
    if (end-start>10)
    {
@@ -292,6 +291,8 @@ void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd,
       max_decay = MIN32(max_decay, .125f*nbAvailableBytes);
 #endif
    }
+   if (lfe)
+      max_decay=3;
    enc_start_state = *enc;
 
    ALLOC(oldEBands_intra, C*m->nbEBands, opus_val16);
@@ -301,7 +302,7 @@ void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd,
    if (two_pass || intra)
    {
       badness1 = quant_coarse_energy_impl(m, start, end, eBands, oldEBands_intra, budget,
-            tell, e_prob_model[LM][1], error_intra, enc, C, LM, 1, max_decay);
+            tell, e_prob_model[LM][1], error_intra, enc, C, LM, 1, max_decay, lfe);
    }
 
    if (!intra)
@@ -328,7 +329,7 @@ void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd,
       *enc = enc_start_state;
 
       badness2 = quant_coarse_energy_impl(m, start, end, eBands, oldEBands, budget,
-            tell, e_prob_model[LM][intra], error, enc, C, LM, 0, max_decay);
+            tell, e_prob_model[LM][intra], error, enc, C, LM, 0, max_decay, lfe);
 
       if (two_pass && (badness1 < badness2 || (badness1 == badness2 && ((opus_int32)ec_tell_frac(enc))+intra_bias > tell_intra)))
       {
@@ -535,25 +536,6 @@ void unquant_energy_finalise(const CELTMode *m, int start, int end, opus_val16 *
    }
 }
 
-void log2Amp(const CELTMode *m, int start, int end,
-      celt_ener *eBands, const opus_val16 *oldEBands, int C)
-{
-   int c, i;
-   c=0;
-   do {
-      for (i=0;i<start;i++)
-         eBands[i+c*m->nbEBands] = 0;
-      for (;i<end;i++)
-      {
-         opus_val16 lg = ADD16(oldEBands[i+c*m->nbEBands],
-                         SHL16((opus_val16)eMeans[i],6));
-         eBands[i+c*m->nbEBands] = PSHR32(celt_exp2(lg),4);
-      }
-      for (;i<m->nbEBands;i++)
-         eBands[i+c*m->nbEBands] = 0;
-   } while (++c < C);
-}
-
 void amp2Log2(const CELTMode *m, int effEnd, int end,
       celt_ener *bandE, opus_val16 *bandLogE, int C)
 {
diff --git a/celt/quant_bands.h b/celt/quant_bands.h
index bec2855..0490bca 100644
--- a/celt/quant_bands.h
+++ b/celt/quant_bands.h
@@ -35,6 +35,12 @@
 #include "entdec.h"
 #include "mathops.h"
 
+#ifdef FIXED_POINT
+extern const signed char eMeans[25];
+#else
+extern const opus_val16 eMeans[25];
+#endif
+
 void amp2Log2(const CELTMode *m, int effEnd, int end,
       celt_ener *bandE, opus_val16 *bandLogE, int C);
 
@@ -45,7 +51,7 @@ void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd,
       const opus_val16 *eBands, opus_val16 *oldEBands, opus_uint32 budget,
       opus_val16 *error, ec_enc *enc, int C, int LM,
       int nbAvailableBytes, int force_intra, opus_val32 *delayedIntra,
-      int two_pass, int loss_rate);
+      int two_pass, int loss_rate, int lfe);
 
 void quant_fine_energy(const CELTMode *m, int start, int end, opus_val16 *oldEBands, opus_val16 *error, int *fine_quant, ec_enc *enc, int C);
 
diff --git a/celt/rate.c b/celt/rate.c
index 4e96787..e474cf5 100644
--- a/celt/rate.c
+++ b/celt/rate.c
@@ -248,7 +248,7 @@ void compute_pulse_cache(CELTMode *m, int LM)
 static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int skip_start,
       const int *bits1, const int *bits2, const int *thresh, const int *cap, opus_int32 total, opus_int32 *_balance,
       int skip_rsv, int *intensity, int intensity_rsv, int *dual_stereo, int dual_stereo_rsv, int *bits,
-      int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev)
+      int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev, int signalBandwidth)
 {
    opus_int32 psum;
    int lo, hi;
@@ -353,7 +353,7 @@ static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int
 #ifdef FUZZING
             if ((rand()&0x1) == 0)
 #else
-            if (codedBands<=start+2 || band_bits > ((j<prev?7:9)*band_width<<LM<<BITRES)>>4)
+            if (codedBands<=start+2 || (band_bits > ((j<prev?7:9)*band_width<<LM<<BITRES)>>4 && j<=signalBandwidth))
 #endif
             {
                ec_enc_bit_logp(ec, 1, 1);
@@ -524,7 +524,7 @@ static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int
 }
 
 int compute_allocation(const CELTMode *m, int start, int end, const int *offsets, const int *cap, int alloc_trim, int *intensity, int *dual_stereo,
-      opus_int32 total, opus_int32 *balance, int *pulses, int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev)
+      opus_int32 total, opus_int32 *balance, int *pulses, int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev, int signalBandwidth)
 {
    int lo, hi, len, j;
    int codedBands;
@@ -631,7 +631,7 @@ int compute_allocation(const CELTMode *m, int start, int end, const int *offsets
    }
    codedBands = interp_bits2pulses(m, start, end, skip_start, bits1, bits2, thresh, cap,
          total, balance, skip_rsv, intensity, intensity_rsv, dual_stereo, dual_stereo_rsv,
-         pulses, ebits, fine_priority, C, LM, ec, encode, prev);
+         pulses, ebits, fine_priority, C, LM, ec, encode, prev, signalBandwidth);
    RESTORE_STACK;
    return codedBands;
 }
diff --git a/celt/rate.h b/celt/rate.h
index e0d5022..263fde9 100644
--- a/celt/rate.h
+++ b/celt/rate.h
@@ -96,6 +96,6 @@ static inline int pulses2bits(const CELTMode *m, int band, int LM, int pulses)
  @return Total number of bits allocated
 */
 int compute_allocation(const CELTMode *m, int start, int end, const int *offsets, const int *cap, int alloc_trim, int *intensity, int *dual_stero,
-      opus_int32 total, opus_int32 *balance, int *pulses, int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev);
+      opus_int32 total, opus_int32 *balance, int *pulses, int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev, int signalBandwidth);
 
 #endif
diff --git a/celt/stack_alloc.h b/celt/stack_alloc.h
index a6f06d2..1c093a8 100644
--- a/celt/stack_alloc.h
+++ b/celt/stack_alloc.h
@@ -146,4 +146,26 @@ extern char *global_stack_top;
 
 #endif /* VAR_ARRAYS */
 
+
+#ifdef ENABLE_VALGRIND
+
+#include <valgrind/memcheck.h>
+#define OPUS_CHECK_ARRAY(ptr, len) VALGRIND_CHECK_MEM_IS_DEFINED(ptr, len*sizeof(*ptr))
+#define OPUS_CHECK_VALUE(value) VALGRIND_CHECK_VALUE_IS_DEFINED(value)
+#define OPUS_CHECK_ARRAY_COND(ptr, len) VALGRIND_CHECK_MEM_IS_DEFINED(ptr, len*sizeof(*ptr))
+#define OPUS_CHECK_VALUE_COND(value) VALGRIND_CHECK_VALUE_IS_DEFINED(value)
+#define OPUS_PRINT_INT(value) do {fprintf(stderr, #value " = %d at %s:%d\n", value, __FILE__, __LINE__);}while(0)
+#define OPUS_FPRINTF fprintf
+
+#else
+
+static inline int _opus_false(void) {return 0;}
+#define OPUS_CHECK_ARRAY(ptr, len) _opus_false()
+#define OPUS_CHECK_VALUE(value) _opus_false()
+#define OPUS_PRINT_INT(value) do{}while(0)
+#define OPUS_FPRINTF (void)
+
+#endif
+
+
 #endif /* STACK_ALLOC_H */
diff --git a/celt/tests/test_unit_cwrs32.c b/celt/tests/test_unit_cwrs32.c
index 4695f2d..ac2a8d1 100644
--- a/celt/tests/test_unit_cwrs32.c
+++ b/celt/tests/test_unit_cwrs32.c
@@ -53,22 +53,20 @@
 
 #ifdef TEST_CUSTOM_MODES
 
-#define NDIMS (46)
+#define NDIMS (44)
 static const int pn[NDIMS]={
    2,   3,   4,   5,   6,   7,   8,   9,  10,
   11,  12,  13,  14,  15,  16,  18,  20,  22,
   24,  26,  28,  30,  32,  36,  40,  44,  48,
   52,  56,  60,  64,  72,  80,  88,  96, 104,
- 112, 120, 128, 144, 160, 176, 192, 208, 224,
- 240
+ 112, 120, 128, 144, 160, 176, 192, 208
 };
 static const int pkmax[NDIMS]={
  128, 128, 128, 128,  88,  52,  36,  26,  22,
   18,  16,  15,  13,  12,  12,  11,  10,   9,
    9,   8,   8,   7,   7,   7,   7,   6,   6,
    6,   6,   6,   5,   5,   5,   5,   5,   5,
-   4,   4,   4,   4,   4,   4,   4,   4,   4,
-   4
+   4,   4,   4,   4,   4,   4,   4,   4
 };
 
 #else /* TEST_CUSTOM_MODES */
@@ -97,27 +95,37 @@ int main(void){
     for(pseudo=1;pseudo<41;pseudo++)
     {
       int k;
+#if defined(SMALL_FOOTPRINT)
       opus_uint32 uu[KMAX+2U];
+#endif
       opus_uint32 inc;
       opus_uint32 nc;
       opus_uint32 i;
       k=get_pulses(pseudo);
       if (k>pkmax[t])break;
       printf("Testing CWRS with N=%i, K=%i...\n",n,k);
+#if defined(SMALL_FOOTPRINT)
       nc=ncwrs_urow(n,k,uu);
+#else
+      nc=CELT_PVQ_V(n,k);
+#endif
       inc=nc/20000;
       if(inc<1)inc=1;
       for(i=0;i<nc;i+=inc){
+#if defined(SMALL_FOOTPRINT)
         opus_uint32 u[KMAX+2U];
-        int           y[NMAX];
-        int           sy;
-        int           yy[5];
+#endif
+        int         y[NMAX];
+        int         sy;
         opus_uint32 v;
         opus_uint32 ii;
-        int           kk;
-        int           j;
+        int         j;
+#if defined(SMALL_FOOTPRINT)
         memcpy(u,uu,(k+2U)*sizeof(*u));
         cwrsi(n,k,i,y,u);
+#else
+        cwrsi(n,k,i,y);
+#endif
         sy=0;
         for(j=0;j<n;j++)sy+=ABS(y[j]);
         if(sy!=k){
@@ -128,7 +136,12 @@ int main(void){
         /*printf("%6u of %u:",i,nc);
         for(j=0;j<n;j++)printf(" %+3i",y[j]);
         printf(" ->");*/
+#if defined(SMALL_FOOTPRINT)
         ii=icwrs(n,k,&v,y,u);
+#else
+        ii=icwrs(n,y);
+        v=CELT_PVQ_V(n,k);
+#endif
         if(ii!=i){
           fprintf(stderr,"Combination-index mismatch (%lu!=%lu).\n",
            (long)ii,(long)i);
@@ -139,81 +152,6 @@ int main(void){
            (long)v,(long)nc);
           return 2;
         }
-#ifndef SMALL_FOOTPRINT
-        if(n==2){
-          cwrsi2(k,i,yy);
-          for(j=0;j<2;j++)if(yy[j]!=y[j]){
-            fprintf(stderr,"N=2 pulse vector mismatch ({%i,%i}!={%i,%i}).\n",
-             yy[0],yy[1],y[0],y[1]);
-            return 3;
-          }
-          ii=icwrs2(yy,&kk);
-          if(ii!=i){
-            fprintf(stderr,"N=2 combination-index mismatch (%lu!=%lu).\n",
-             (long)ii,(long)i);
-            return 4;
-          }
-          if(kk!=k){
-            fprintf(stderr,"N=2 pulse count mismatch (%i,%i).\n",kk,k);
-            return 5;
-          }
-          v=ncwrs2(k);
-          if(v!=nc){
-            fprintf(stderr,"N=2 combination count mismatch (%lu,%lu).\n",
-             (long)v,(long)nc);
-            return 6;
-          }
-        }
-        else if(n==3){
-          cwrsi3(k,i,yy);
-          for(j=0;j<3;j++)if(yy[j]!=y[j]){
-            fprintf(stderr,"N=3 pulse vector mismatch "
-             "({%i,%i,%i}!={%i,%i,%i}).\n",yy[0],yy[1],yy[2],y[0],y[1],y[2]);
-            return 7;
-          }
-          ii=icwrs3(yy,&kk);
-          if(ii!=i){
-            fprintf(stderr,"N=3 combination-index mismatch (%lu!=%lu).\n",
-             (long)ii,(long)i);
-            return 8;
-          }
-          if(kk!=k){
-            fprintf(stderr,"N=3 pulse count mismatch (%i!=%i).\n",kk,k);
-            return 9;
-          }
-          v=ncwrs3(k);
-          if(v!=nc){
-            fprintf(stderr,"N=3 combination count mismatch (%lu!=%lu).\n",
-             (long)v,(long)nc);
-            return 10;
-          }
-        }
-        else if(n==4){
-          cwrsi4(k,i,yy);
-          for(j=0;j<4;j++)if(yy[j]!=y[j]){
-            fprintf(stderr,"N=4 pulse vector mismatch "
-             "({%i,%i,%i,%i}!={%i,%i,%i,%i}.\n",
-             yy[0],yy[1],yy[2],yy[3],y[0],y[1],y[2],y[3]);
-            return 11;
-          }
-          ii=icwrs4(yy,&kk);
-          if(ii!=i){
-            fprintf(stderr,"N=4 combination-index mismatch (%lu!=%lu).\n",
-             (long)ii,(long)i);
-            return 12;
-          }
-          if(kk!=k){
-            fprintf(stderr,"N=4 pulse count mismatch (%i!=%i).\n",kk,k);
-            return 13;
-          }
-          v=ncwrs4(k);
-          if(v!=nc){
-            fprintf(stderr,"N=4 combination count mismatch (%lu!=%lu).\n",
-             (long)v,(long)nc);
-            return 14;
-          }
-        }
-#endif /* SMALL_FOOTPRINT */
         /*printf(" %6u\n",i);*/
       }
       /*printf("\n");*/
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index c11f0ad..4bb780e 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -41,6 +41,8 @@
 #include "entdec.c"
 #include "entcode.c"
 #include "bands.c"
+#include "quant_bands.c"
+#include "laplace.c"
 #include "vq.c"
 #include "cwrs.c"
 #include <stdio.h>
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index f8fb9ac..ac8957f 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -151,6 +151,9 @@ void test1d(int nfft,int isinverse)
        for (k=0;k<nfft;++k)
           out[k] = 0;
        clt_mdct_backward(&cfg,in,out, window, nfft/2, 0, 1);
+       /* apply TDAC because clt_mdct_backward() no longer does that */
+       for (k=0;k<nfft/4;++k)
+          out[nfft-k-1] = out[nfft/2+k];
        check_inv(in,out,nfft,isinverse);
     } else {
        clt_mdct_forward(&cfg,in,out,window, nfft/2, 0, 1);
@@ -180,15 +183,27 @@ int main(int argc,char ** argv)
         test1d(256,1);
         test1d(512,0);
         test1d(512,1);
+        test1d(1024,0);
+        test1d(1024,1);
+        test1d(2048,0);
+        test1d(2048,1);
 #ifndef RADIX_TWO_ONLY
+        test1d(36,0);
+        test1d(36,1);
         test1d(40,0);
         test1d(40,1);
+        test1d(60,0);
+        test1d(60,1);
         test1d(120,0);
         test1d(120,1);
         test1d(240,0);
         test1d(240,1);
         test1d(480,0);
         test1d(480,1);
+        test1d(960,0);
+        test1d(960,1);
+        test1d(1920,0);
+        test1d(1920,1);
 #endif
     }
     return ret;
diff --git a/celt/vq.h b/celt/vq.h
index 1ceeeeb..ffdc69c 100644
--- a/celt/vq.h
+++ b/celt/vq.h
@@ -40,11 +40,9 @@
 /** Algebraic pulse-vector quantiser. The signal x is replaced by the sum of
   * the pitch and a combination of pulses such that its norm is still equal
   * to 1. This is the function that will typically require the most CPU.
- * @param x Residual signal to quantise/encode (returns quantised version)
- * @param W Perceptual weight to use when optimising (currently unused)
+ * @param X Residual signal to quantise/encode (returns quantised version)
  * @param N Number of samples to encode
  * @param K Number of pulses to use
- * @param p Pitch vector (it is assumed that p+x is a unit vector)
  * @param enc Entropy encoder state
  * @ret A mask indicating which blocks in the band received pulses
 */
@@ -56,10 +54,9 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B,
       );
 
 /** Algebraic pulse decoder
- * @param x Decoded normalised spectrum (returned)
+ * @param X Decoded normalised spectrum (returned)
  * @param N Number of samples to decode
  * @param K Number of pulses to use
- * @param p Pitch vector (automatically added to x)
  * @param dec Entropy decoder state
  * @ret A mask indicating which blocks in the band received pulses
  */
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
new file mode 100644
index 0000000..63ae3d4
--- /dev/null
+++ b/celt/x86/pitch_sse.h
@@ -0,0 +1,156 @@
+/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
+/**
+   @file pitch_sse.h
+   @brief Pitch analysis
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef PITCH_SSE_H
+#define PITCH_SSE_H
+
+#include <xmmintrin.h>
+#include "arch.h"
+
+#define OVERRIDE_XCORR_KERNEL
+static inline void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
+{
+   int j;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_loadu_ps(sum);
+   xsum2 = _mm_setzero_ps();
+
+   for (j = 0; j < len-3; j += 4)
+   {
+      __m128 x0 = _mm_loadu_ps(x+j);
+      __m128 yj = _mm_loadu_ps(y+j);
+      __m128 y3 = _mm_loadu_ps(y+j+3);
+
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
+                                          _mm_shuffle_ps(yj,y3,0x49)));
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
+                                          _mm_shuffle_ps(yj,y3,0x9e)));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
+   }
+   if (j < len)
+   {
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+      if (++j < len)
+      {
+         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         if (++j < len)
+         {
+            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+         }
+      }
+   }
+   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
+}
+
+#define OVERRIDE_DUAL_INNER_PROD
+static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   __m128 xsum1, xsum2;
+   xsum1 = _mm_setzero_ps();
+   xsum2 = _mm_setzero_ps();
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 xi = _mm_loadu_ps(x+i);
+      __m128 y1i = _mm_loadu_ps(y01+i);
+      __m128 y2i = _mm_loadu_ps(y02+i);
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
+   }
+   /* Horizontal sum */
+   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
+   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
+   _mm_store_ss(xy1, xsum1);
+   xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
+   xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
+   _mm_store_ss(xy2, xsum2);
+   for (;i<N;i++)
+   {
+      *xy1 = MAC16_16(*xy1, x[i], y01[i]);
+      *xy2 = MAC16_16(*xy2, x[i], y02[i]);
+   }
+}
+
+#define OVERRIDE_COMB_FILTER_CONST
+static inline void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   int i;
+   __m128 x0v;
+   __m128 g10v, g11v, g12v;
+   g10v = _mm_load1_ps(&g10);
+   g11v = _mm_load1_ps(&g11);
+   g12v = _mm_load1_ps(&g12);
+   x0v = _mm_loadu_ps(&x[-T-2]);
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 yi, yi2, x1v, x2v, x3v, x4v;
+      const opus_val32 *xp = &x[i-T-2];
+      yi = _mm_loadu_ps(x+i);
+      x4v = _mm_loadu_ps(xp+4);
+#if 0
+      /* Slower version with all loads */
+      x1v = _mm_loadu_ps(xp+1);
+      x2v = _mm_loadu_ps(xp+2);
+      x3v = _mm_loadu_ps(xp+3);
+#else
+      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
+      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
+      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
+#endif
+
+      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
+      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
+      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#else
+      /* Use partial sums */
+      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
+                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+      yi = _mm_add_ps(yi, yi2);
+#endif
+      x0v=x4v;
+      _mm_storeu_ps(y+i, yi);
+   }
+#ifdef CUSTOM_MODES
+   for (;i<N;i++)
+   {
+      y[i] = x[i]
+               + MULT16_32_Q15(g10,x[i-T])
+               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
+   }
+#endif
+}
+
+#endif
author	tlegrand@chromium.org <tlegrand@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2013-10-23 09:13:50 +0000
committer	tlegrand@chromium.org <tlegrand@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2013-10-23 09:13:50 +0000
commit	e3ea049fcaee2247e45f0ce793d4313babb4ef69 (patch)
tree	4449fa158c45dea4175443f44c3dfc1264bd6dfb /celt
parent	6b6bee25314cfac02cc555cddedb9680c63a26d6 (diff)
download	src-e3ea049fcaee2247e45f0ce793d4313babb4ef69.tar.gz