From 3c3902f0ac13428394f14f78f0fab05ef3468d69 Mon Sep 17 00:00:00 2001 From: "tlegrand@google.com" Date: Mon, 9 Dec 2013 08:35:25 +0000 Subject: Updating Opus to release 1.1 opus-1.1.tar.gz downloaded from http://www.opus-codec.org/ R=sergeyu@chromium.org Review URL: https://codereview.chromium.org/107243004 git-svn-id: svn://svn.chromium.org/chrome/trunk/deps/third_party/opus@239448 0039d316-1c4b-4281-b951-d872f2087c98 --- celt/_kiss_fft_guts.h | 4 +- celt/arch.h | 7 +- celt/arm/arm_celt_map.c | 49 ++++ celt/arm/armcpu.c | 18 +- celt/arm/armcpu.h | 42 +++- celt/arm/armopts.s.in | 37 +++ celt/arm/celt_pitch_xcorr_arm.s | 545 ++++++++++++++++++++++++++++++++++++++++ celt/arm/fixed_armv4.h | 4 +- celt/arm/fixed_armv5e.h | 10 +- celt/arm/pitch_arm.h | 57 +++++ celt/bands.c | 28 ++- celt/celt.h | 15 +- celt/celt_decoder.c | 18 +- celt/celt_encoder.c | 102 +++++--- celt/celt_lpc.c | 5 +- celt/celt_lpc.h | 3 +- celt/cpu_support.h | 7 +- celt/cwrs.c | 12 +- celt/ecintrin.h | 2 +- celt/entcode.h | 9 +- celt/fixed_debug.h | 68 ++--- celt/fixed_generic.h | 2 +- celt/float_cast.h | 4 +- celt/mathops.c | 2 +- celt/mathops.h | 22 +- celt/os_support.h | 9 +- celt/pitch.c | 15 +- celt/pitch.h | 38 ++- celt/quant_bands.c | 6 +- celt/rate.c | 2 +- celt/rate.h | 6 +- celt/stack_alloc.h | 9 +- celt/x86/pitch_sse.h | 6 +- 33 files changed, 999 insertions(+), 164 deletions(-) create mode 100644 celt/arm/arm_celt_map.c create mode 100644 celt/arm/armopts.s.in create mode 100644 celt/arm/celt_pitch_xcorr_arm.s create mode 100644 celt/arm/pitch_arm.h (limited to 'celt') diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h index f0c6976..aefe490 100644 --- a/celt/_kiss_fft_guts.h +++ b/celt/_kiss_fft_guts.h @@ -94,11 +94,11 @@ do {(res).r = ADD32((res).r,(a).r); (res).i = SUB32((res).i,(a).i); \ }while(0) -#if defined(ARMv4_ASM) +#if defined(OPUS_ARM_INLINE_ASM) #include "arm/kiss_fft_armv4.h" #endif -#if defined(ARMv5E_ASM) +#if defined(OPUS_ARM_INLINE_EDSP) #include 
"arm/kiss_fft_armv5e.h" #endif diff --git a/celt/arch.h b/celt/arch.h index f9c9856..3bbcd36 100644 --- a/celt/arch.h +++ b/celt/arch.h @@ -35,6 +35,7 @@ #define ARCH_H #include "opus_types.h" +#include "opus_defines.h" # if !defined(__GNUC_PREREQ) # if defined(__GNUC__)&&defined(__GNUC_MINOR__) @@ -54,7 +55,7 @@ #ifdef __GNUC__ __attribute__((noreturn)) #endif -static inline void _celt_fatal(const char *str, const char *file, int line) +static OPUS_INLINE void _celt_fatal(const char *str, const char *file, int line) { fprintf (stderr, "Fatal (internal) error in %s, line %d: %s\n", file, line, str); abort(); @@ -113,9 +114,9 @@ typedef opus_val32 celt_ener; #include "fixed_generic.h" -#ifdef ARMv5E_ASM +#ifdef OPUS_ARM_INLINE_EDSP #include "arm/fixed_armv5e.h" -#elif defined (ARMv4_ASM) +#elif defined (OPUS_ARM_INLINE_ASM) #include "arm/fixed_armv4.h" #elif defined (BFIN_ASM) #include "fixed_bfin.h" diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c new file mode 100644 index 0000000..547a84d --- /dev/null +++ b/celt/arm/arm_celt_map.c @@ -0,0 +1,49 @@ +/* Copyright (c) 2010 Xiph.Org Foundation + * Copyright (c) 2013 Parrot */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "pitch.h" + +#if defined(OPUS_HAVE_RTCD) + +# if defined(FIXED_POINT) +opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, + const opus_val16 *, opus_val32 *, int , int) = { + celt_pitch_xcorr_c, /* ARMv4 */ + MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */ + MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */ + MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */ +}; +# else +# error "Floating-point implementation is not supported by ARM asm yet." \ + "Reconfigure with --disable-rtcd or send patches." +# endif + +#endif diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c index aabcc71..1768525 100644 --- a/celt/arm/armcpu.c +++ b/celt/arm/armcpu.c @@ -49,13 +49,13 @@ # define WIN32_EXTRA_LEAN # include <windows.h> -static inline opus_uint32 opus_cpu_capabilities(void){ +static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){ opus_uint32 flags; flags=0; - /* MSVC has no inline __asm support for ARM, but it does let you __emit + /* MSVC has no OPUS_INLINE __asm support for ARM, but it does let you __emit * instructions via their assembled hex code. * All of these instructions should be essentially nops.
*/ -# if defined(ARMv5E_ASM) +# if defined(OPUS_ARM_MAY_HAVE_EDSP) __try{ /*PLD [r13]*/ __emit(0xF5DDF000); @@ -64,7 +64,7 @@ static inline opus_uint32 opus_cpu_capabilities(void){ __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ /*Ignore exception.*/ } -# if defined(ARMv6E_ASM) +# if defined(OPUS_ARM_MAY_HAVE_MEDIA) __try{ /*SHADD8 r3,r3,r3*/ __emit(0xE6333F93); @@ -73,7 +73,7 @@ static inline opus_uint32 opus_cpu_capabilities(void){ __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ /*Ignore exception.*/ } -# if defined(ARM_HAVE_NEON) +# if defined(OPUS_ARM_MAY_HAVE_NEON) __try{ /*VORR q0,q0,q0*/ __emit(0xF2200150); @@ -107,19 +107,26 @@ opus_uint32 opus_cpu_capabilities(void) while(fgets(buf, 512, cpuinfo) != NULL) { +# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON) /* Search for edsp and neon flag */ if(memcmp(buf, "Features", 8) == 0) { char *p; +# if defined(OPUS_ARM_MAY_HAVE_EDSP) p = strstr(buf, " edsp"); if(p != NULL && (p[5] == ' ' || p[5] == '\n')) flags |= OPUS_CPU_ARM_EDSP; +# endif +# if defined(OPUS_ARM_MAY_HAVE_NEON) p = strstr(buf, " neon"); if(p != NULL && (p[5] == ' ' || p[5] == '\n')) flags |= OPUS_CPU_ARM_NEON; +# endif } +# endif +# if defined(OPUS_ARM_MAY_HAVE_MEDIA) /* Search for media capabilities (>= ARMv6) */ if(memcmp(buf, "CPU architecture:", 17) == 0) { @@ -129,6 +136,7 @@ opus_uint32 opus_cpu_capabilities(void) if(version >= 6) flags |= OPUS_CPU_ARM_MEDIA; } +# endif } fclose(cpuinfo); diff --git a/celt/arm/armcpu.h b/celt/arm/armcpu.h index 68d80fe..ac57446 100644 --- a/celt/arm/armcpu.h +++ b/celt/arm/armcpu.h @@ -25,11 +25,47 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -/* Original code from libtheora modified to suit to Opus */ +#if !defined(ARMCPU_H) +# define ARMCPU_H -#ifndef ARMCPU_H -#define ARMCPU_H +# if defined(OPUS_ARM_MAY_HAVE_EDSP) +# define MAY_HAVE_EDSP(name) name ## _edsp +# else +# define MAY_HAVE_EDSP(name) name ## _c +# endif +# if defined(OPUS_ARM_MAY_HAVE_MEDIA) +# define MAY_HAVE_MEDIA(name) name ## _media +# else +# define MAY_HAVE_MEDIA(name) MAY_HAVE_EDSP(name) +# endif + +# if defined(OPUS_ARM_MAY_HAVE_NEON) +# define MAY_HAVE_NEON(name) name ## _neon +# else +# define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name) +# endif + +# if defined(OPUS_ARM_PRESUME_EDSP) +# define PRESUME_EDSP(name) name ## _edsp +# else +# define PRESUME_EDSP(name) name ## _c +# endif + +# if defined(OPUS_ARM_PRESUME_MEDIA) +# define PRESUME_MEDIA(name) name ## _media +# else +# define PRESUME_MEDIA(name) PRESUME_EDSP(name) +# endif + +# if defined(OPUS_ARM_PRESUME_NEON) +# define PRESUME_NEON(name) name ## _neon +# else +# define PRESUME_NEON(name) PRESUME_MEDIA(name) +# endif + +# if defined(OPUS_HAVE_RTCD) int opus_select_arch(void); +# endif #endif diff --git a/celt/arm/armopts.s.in b/celt/arm/armopts.s.in new file mode 100644 index 0000000..3d8aaf2 --- /dev/null +++ b/celt/arm/armopts.s.in @@ -0,0 +1,37 @@ +/* Copyright (C) 2013 Mozilla Corporation */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +; Set the following to 1 if we have EDSP instructions +; (LDRD/STRD, etc., ARMv5E and later). +OPUS_ARM_MAY_HAVE_EDSP * @OPUS_ARM_MAY_HAVE_EDSP@ + +; Set the following to 1 if we have ARMv6 media instructions. +OPUS_ARM_MAY_HAVE_MEDIA * @OPUS_ARM_MAY_HAVE_MEDIA@ + +; Set the following to 1 if we have NEON (some ARMv7) +OPUS_ARM_MAY_HAVE_NEON * @OPUS_ARM_MAY_HAVE_NEON@ + +END diff --git a/celt/arm/celt_pitch_xcorr_arm.s b/celt/arm/celt_pitch_xcorr_arm.s new file mode 100644 index 0000000..09917b1 --- /dev/null +++ b/celt/arm/celt_pitch_xcorr_arm.s @@ -0,0 +1,545 @@ +; Copyright (c) 2007-2008 CSIRO +; Copyright (c) 2007-2009 Xiph.Org Foundation +; Copyright (c) 2013 Parrot +; Written by Aurélien Zanelli +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; +; - Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; +; - Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + AREA |.text|, CODE, READONLY + + GET celt/arm/armopts.s + +IF OPUS_ARM_MAY_HAVE_EDSP + EXPORT celt_pitch_xcorr_edsp +ENDIF + +IF OPUS_ARM_MAY_HAVE_NEON + EXPORT celt_pitch_xcorr_neon +ENDIF + +IF OPUS_ARM_MAY_HAVE_NEON + +; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 +xcorr_kernel_neon PROC + ; input: + ; r3 = int len + ; r4 = opus_val16 *x + ; r5 = opus_val16 *y + ; q0 = opus_val32 sum[4] + ; output: + ; q0 = opus_val32 sum[4] + ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 + ; internal usage: + ; r12 = int j + ; d3 = y_3|y_2|y_1|y_0 + ; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4 + ; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0 + ; q8 = scratch + ; + ; Load y[0...3] + ; This requires len>0 to always be valid (which we assert in the C code). + VLD1.16 {d5}, [r5]! + SUBS r12, r3, #8 + BLE xcorr_kernel_neon_process4 +; Process 8 samples at a time. +; This loop loads one y value more than we actually need. 
Therefore we have to +; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid +; reading past the end of the array. +xcorr_kernel_neon_process8 + ; This loop has 19 total instructions (10 cycles to issue, minimum), with + ; - 2 cycles of ARM instructions, + ; - 10 cycles of load/store/byte permute instructions, and + ; - 9 cycles of data processing instructions. + ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the + ; latter two categories, meaning the whole loop should run in 10 cycles per + ; iteration, barring cache misses. + ; + ; Load x[0...7] + VLD1.16 {d6, d7}, [r4]! + ; Unlike VMOV, VAND is a data processing instruction (and doesn't get + ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1. + VAND d3, d5, d5 + SUBS r12, r12, #8 + ; Load y[4...11] + VLD1.16 {d4, d5}, [r5]! + VMLAL.S16 q0, d3, d6[0] + VEXT.16 d16, d3, d4, #1 + VMLAL.S16 q0, d4, d7[0] + VEXT.16 d17, d4, d5, #1 + VMLAL.S16 q0, d16, d6[1] + VEXT.16 d16, d3, d4, #2 + VMLAL.S16 q0, d17, d7[1] + VEXT.16 d17, d4, d5, #2 + VMLAL.S16 q0, d16, d6[2] + VEXT.16 d16, d3, d4, #3 + VMLAL.S16 q0, d17, d7[2] + VEXT.16 d17, d4, d5, #3 + VMLAL.S16 q0, d16, d6[3] + VMLAL.S16 q0, d17, d7[3] + BGT xcorr_kernel_neon_process8 +; Process 4 samples here if we have > 4 left (still reading one extra y value). +xcorr_kernel_neon_process4 + ADDS r12, r12, #4 + BLE xcorr_kernel_neon_process2 + ; Load x[0...3] + VLD1.16 d6, [r4]! + ; Use VAND since it's a data processing instruction again. + VAND d4, d5, d5 + SUB r12, r12, #4 + ; Load y[4...7] + VLD1.16 d5, [r5]! + VMLAL.S16 q0, d4, d6[0] + VEXT.16 d16, d4, d5, #1 + VMLAL.S16 q0, d16, d6[1] + VEXT.16 d16, d4, d5, #2 + VMLAL.S16 q0, d16, d6[2] + VEXT.16 d16, d4, d5, #3 + VMLAL.S16 q0, d16, d6[3] +; Process 2 samples here if we have > 2 left (still reading one extra y value). +xcorr_kernel_neon_process2 + ADDS r12, r12, #2 + BLE xcorr_kernel_neon_process1 + ; Load x[0...1] + VLD2.16 {d6[],d7[]}, [r4]!
+ ; Use VAND since it's a data processing instruction again. + VAND d4, d5, d5 + SUB r12, r12, #2 + ; Load y[4...5] + VLD1.32 {d5[]}, [r5]! + VMLAL.S16 q0, d4, d6 + VEXT.16 d16, d4, d5, #1 + ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI + ; instead of VEXT, since it's a data-processing instruction. + VSRI.64 d5, d4, #32 + VMLAL.S16 q0, d16, d7 +; Process 1 sample using the extra y value we loaded above. +xcorr_kernel_neon_process1 + ; Load next *x + VLD1.16 {d6[]}, [r4]! + ADDS r12, r12, #1 + ; y[0...3] are left in d5 from prior iteration(s) (if any) + VMLAL.S16 q0, d5, d6 + MOVLE pc, lr +; Now process 1 last sample, not reading ahead. + ; Load last *y + VLD1.16 {d4[]}, [r5]! + VSRI.64 d4, d5, #16 + ; Load last *x + VLD1.16 {d6[]}, [r4]! + VMLAL.S16 q0, d4, d6 + MOV pc, lr + ENDP + +; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y, +; opus_val32 *xcorr, int len, int max_pitch) +celt_pitch_xcorr_neon PROC + ; input: + ; r0 = opus_val16 *_x + ; r1 = opus_val16 *_y + ; r2 = opus_val32 *xcorr + ; r3 = int len + ; output: + ; r0 = int maxcorr + ; internal usage: + ; r4 = opus_val16 *x (for xcorr_kernel_neon()) + ; r5 = opus_val16 *y (for xcorr_kernel_neon()) + ; r6 = int max_pitch + ; r12 = int j + ; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon()) + STMFD sp!, {r4-r6, lr} + LDR r6, [sp, #16] + VMOV.S32 q15, #1 + ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done + SUBS r6, r6, #4 + BLT celt_pitch_xcorr_neon_process4_done +celt_pitch_xcorr_neon_process4 + ; xcorr_kernel_neon parameters: + ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0} + MOV r4, r0 + MOV r5, r1 + VEOR q0, q0, q0 + ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3. + ; So we don't save/restore any other registers. + BL xcorr_kernel_neon + SUBS r6, r6, #4 + VST1.32 {q0}, [r2]! 
+ ; _y += 4 + ADD r1, r1, #8 + VMAX.S32 q15, q15, q0 + ; if (max_pitch >= 4) goto celt_pitch_xcorr_neon_process4 + BGE celt_pitch_xcorr_neon_process4 +; We have less than 4 sums left to compute. +celt_pitch_xcorr_neon_process4_done + ADDS r6, r6, #4 + ; Reduce maxcorr to a single value + VMAX.S32 d30, d30, d31 + VPMAX.S32 d30, d30, d30 + ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done + BLE celt_pitch_xcorr_neon_done +; Now compute each remaining sum one at a time. +celt_pitch_xcorr_neon_process_remaining + MOV r4, r0 + MOV r5, r1 + VMOV.I32 q0, #0 + SUBS r12, r3, #8 + BLT celt_pitch_xcorr_neon_process_remaining4 +; Sum terms 8 at a time. +celt_pitch_xcorr_neon_process_remaining_loop8 + ; Load x[0...7] + VLD1.16 {q1}, [r4]! + ; Load y[0...7] + VLD1.16 {q2}, [r5]! + SUBS r12, r12, #8 + VMLAL.S16 q0, d4, d2 + VMLAL.S16 q0, d5, d3 + BGE celt_pitch_xcorr_neon_process_remaining_loop8 +; Sum terms 4 at a time. +celt_pitch_xcorr_neon_process_remaining4 + ADDS r12, r12, #4 + BLT celt_pitch_xcorr_neon_process_remaining4_done + ; Load x[0...3] + VLD1.16 {d2}, [r4]! + ; Load y[0...3] + VLD1.16 {d3}, [r5]! + SUB r12, r12, #4 + VMLAL.S16 q0, d3, d2 +celt_pitch_xcorr_neon_process_remaining4_done + ; Reduce the sum to a single value. + VADD.S32 d0, d0, d1 + VPADDL.S32 d0, d0 + ADDS r12, r12, #4 + BLE celt_pitch_xcorr_neon_process_remaining_loop_done +; Sum terms 1 at a time. +celt_pitch_xcorr_neon_process_remaining_loop1 + VLD1.16 {d2[]}, [r4]! + VLD1.16 {d3[]}, [r5]! + SUBS r12, r12, #1 + VMLAL.S16 q0, d2, d3 + BGT celt_pitch_xcorr_neon_process_remaining_loop1 +celt_pitch_xcorr_neon_process_remaining_loop_done + VST1.32 {d0[0]}, [r2]!
+ VMAX.S32 d30, d30, d0 + SUBS r6, r6, #1 + ; _y++ + ADD r1, r1, #2 + ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining + BGT celt_pitch_xcorr_neon_process_remaining +celt_pitch_xcorr_neon_done + VMOV.32 r0, d30[0] + LDMFD sp!, {r4-r6, pc} + ENDP + +ENDIF + +IF OPUS_ARM_MAY_HAVE_EDSP + +; This will get used on ARMv7 devices without NEON, so it has been optimized +; to take advantage of dual-issuing where possible. +xcorr_kernel_edsp PROC + ; input: + ; r3 = int len + ; r4 = opus_val16 *_x (must be 32-bit aligned) + ; r5 = opus_val16 *_y (must be 32-bit aligned) + ; r6...r9 = opus_val32 sum[4] + ; output: + ; r6...r9 = opus_val32 sum[4] + ; preserved: r0-r5 + ; internal usage + ; r2 = int j + ; r12,r14 = opus_val16 x[4] + ; r10,r11 = opus_val16 y[4] + STMFD sp!, {r2,r4,r5,lr} + LDR r10, [r5], #4 ; Load y[0...1] + SUBS r2, r3, #4 ; j = len-4 + LDR r11, [r5], #4 ; Load y[2...3] + BLE xcorr_kernel_edsp_process4_done + LDR r12, [r4], #4 ; Load x[0...1] + ; Stall +xcorr_kernel_edsp_process4 + ; The multiplies must issue from pipeline 0, and can't dual-issue with each + ; other. Every other instruction here dual-issues with a multiply, and is + ; thus "free". There should be no stalls in the body of the loop. 
+ SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0) + LDR r14, [r4], #4 ; Load x[2...3] + SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1) + SUBS r2, r2, #4 ; j-=4 + SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2) + SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3) + SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1) + LDR r10, [r5], #4 ; Load y[4...5] + SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2) + SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3) + SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4) + LDRGT r12, [r4], #4 ; Load x[0...1] + SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2) + SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3) + SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4) + SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5) + SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3) + LDR r11, [r5], #4 ; Load y[6...7] + SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4) + SMLATT r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_3,y_5) + SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6) + BGT xcorr_kernel_edsp_process4 +xcorr_kernel_edsp_process4_done + ADDS r2, r2, #4 + BLE xcorr_kernel_edsp_done + LDRH r12, [r4], #2 ; r12 = *x++ + SUBS r2, r2, #1 ; j-- + ; Stall + SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0) + LDRGTH r14, [r4], #2 ; r14 = *x++ + SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1) + SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2) + SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3) + BLE xcorr_kernel_edsp_done + SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1) + SUBS r2, r2, #1 ; j-- + SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2) + LDRH r10, [r5], #2 ; r10 = y_4 = *y++ + SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3) + LDRGTH r12, [r4], #2 ; r12 = *x++ + SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4) + BLE 
xcorr_kernel_edsp_done + SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2) + CMP r2, #1 ; j-- + SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3) + LDRH r2, [r5], #2 ; r2 = y_5 = *y++ + SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4) + LDRGTH r14, [r4] ; r14 = *x + SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5) + BLE xcorr_kernel_edsp_done + SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3) + LDRH r11, [r5] ; r11 = y_6 = *y + SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4) + SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5) + SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6) +xcorr_kernel_edsp_done + LDMFD sp!, {r2,r4,r5,pc} + ENDP + +celt_pitch_xcorr_edsp PROC + ; input: + ; r0 = opus_val16 *_x (must be 32-bit aligned) + ; r1 = opus_val16 *_y (only needs to be 16-bit aligned) + ; r2 = opus_val32 *xcorr + ; r3 = int len + ; output: + ; r0 = maxcorr + ; internal usage + ; r4 = opus_val16 *x + ; r5 = opus_val16 *y + ; r6 = opus_val32 sum0 + ; r7 = opus_val32 sum1 + ; r8 = opus_val32 sum2 + ; r9 = opus_val32 sum3 + ; r1 = int max_pitch + ; r12 = int j + STMFD sp!, {r4-r11, lr} + MOV r5, r1 + LDR r1, [sp, #36] + MOV r4, r0 + TST r5, #3 + ; maxcorr = 1 + MOV r0, #1 + BEQ celt_pitch_xcorr_edsp_process1u_done +; Compute one sum at the start to make y 32-bit aligned. 
+ SUBS r12, r3, #4 + ; r14 = sum = 0 + MOV r14, #0 + LDRH r8, [r5], #2 + BLE celt_pitch_xcorr_edsp_process1u_loop4_done + LDR r6, [r4], #4 + MOV r8, r8, LSL #16 +celt_pitch_xcorr_edsp_process1u_loop4 + LDR r9, [r5], #4 + SMLABT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) + LDR r7, [r4], #4 + SMLATB r14, r6, r9, r14 ; sum = MAC16_16(sum, x_1, y_1) + LDR r8, [r5], #4 + SMLABT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) + SUBS r12, r12, #4 ; j-=4 + SMLATB r14, r7, r8, r14 ; sum = MAC16_16(sum, x_3, y_3) + LDRGT r6, [r4], #4 + BGT celt_pitch_xcorr_edsp_process1u_loop4 + MOV r8, r8, LSR #16 +celt_pitch_xcorr_edsp_process1u_loop4_done + ADDS r12, r12, #4 +celt_pitch_xcorr_edsp_process1u_loop1 + LDRGEH r6, [r4], #2 + ; Stall + SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) + SUBGES r12, r12, #1 + LDRGTH r8, [r5], #2 + BGT celt_pitch_xcorr_edsp_process1u_loop1 + ; Restore _x + SUB r4, r4, r3, LSL #1 + ; Restore and advance _y + SUB r5, r5, r3, LSL #1 + ; maxcorr = max(maxcorr, sum) + CMP r0, r14 + ADD r5, r5, #2 + MOVLT r0, r14 + SUBS r1, r1, #1 + ; xcorr[i] = sum + STR r14, [r2], #4 + BLE celt_pitch_xcorr_edsp_done +celt_pitch_xcorr_edsp_process1u_done + ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2 + SUBS r1, r1, #4 + BLT celt_pitch_xcorr_edsp_process2 +celt_pitch_xcorr_edsp_process4 + ; xcorr_kernel_edsp parameters: + ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} + MOV r6, #0 + MOV r7, #0 + MOV r8, #0 + MOV r9, #0 + BL xcorr_kernel_edsp ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len) + ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3) + CMP r0, r6 + ; _y+=4 + ADD r5, r5, #8 + MOVLT r0, r6 + CMP r0, r7 + MOVLT r0, r7 + CMP r0, r8 + MOVLT r0, r8 + CMP r0, r9 + MOVLT r0, r9 + STMIA r2!, {r6-r9} + SUBS r1, r1, #4 + BGE celt_pitch_xcorr_edsp_process4 +celt_pitch_xcorr_edsp_process2 + ADDS r1, r1, #2 + BLT celt_pitch_xcorr_edsp_process1a + SUBS r12, r3, #4 + ; {r10, r11} = {sum0, sum1} = {0, 0} + MOV r10, #0 + MOV r11, #0 + LDR r8, 
[r5], #4 + BLE celt_pitch_xcorr_edsp_process2_loop_done + LDR r6, [r4], #4 + LDR r9, [r5], #4 +celt_pitch_xcorr_edsp_process2_loop4 + SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) + LDR r7, [r4], #4 + SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) + SUBS r12, r12, #4 ; j-=4 + SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) + LDR r8, [r5], #4 + SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) + LDRGT r6, [r4], #4 + SMLABB r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_2, y_2) + SMLABT r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_2, y_3) + SMLATT r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_3, y_3) + LDRGT r9, [r5], #4 + SMLATB r11, r7, r8, r11 ; sum1 = MAC16_16(sum1, x_3, y_4) + BGT celt_pitch_xcorr_edsp_process2_loop4 +celt_pitch_xcorr_edsp_process2_loop_done + ADDS r12, r12, #2 + BLE celt_pitch_xcorr_edsp_process2_1 + LDR r6, [r4], #4 + ; Stall + SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) + LDR r9, [r5], #4 + SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) + SUB r12, r12, #2 + SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1) + MOV r8, r9 + SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2) +celt_pitch_xcorr_edsp_process2_1 + LDRH r6, [r4], #2 + ADDS r12, r12, #1 + ; Stall + SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0) + LDRGTH r7, [r4], #2 + SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1) + BLE celt_pitch_xcorr_edsp_process2_done + LDRH r9, [r5], #2 + SMLABT r10, r7, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_1) + SMLABB r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_0, y_2) +celt_pitch_xcorr_edsp_process2_done + ; Restore _x + SUB r4, r4, r3, LSL #1 + ; Restore and advance _y + SUB r5, r5, r3, LSL #1 + ; maxcorr = max(maxcorr, sum0) + CMP r0, r10 + ADD r5, r5, #2 + MOVLT r0, r10 + SUB r1, r1, #2 + ; maxcorr = max(maxcorr, sum1) + CMP r0, r11 + ; xcorr[i] = sum + STR r10, [r2], #4 + MOVLT r0, r11 + STR r11, [r2], #4 +celt_pitch_xcorr_edsp_process1a + ADDS r1, r1, #1 + BLT 
celt_pitch_xcorr_edsp_done + SUBS r12, r3, #4 + ; r14 = sum = 0 + MOV r14, #0 + BLT celt_pitch_xcorr_edsp_process1a_loop_done + LDR r6, [r4], #4 + LDR r8, [r5], #4 + LDR r7, [r4], #4 + LDR r9, [r5], #4 +celt_pitch_xcorr_edsp_process1a_loop4 + SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) + SUBS r12, r12, #4 ; j-=4 + SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) + LDRGE r6, [r4], #4 + SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) + LDRGE r8, [r5], #4 + SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3) + LDRGE r7, [r4], #4 + LDRGE r9, [r5], #4 + BGE celt_pitch_xcorr_edsp_process1a_loop4 +celt_pitch_xcorr_edsp_process1a_loop_done + ADDS r12, r12, #2 + LDRGE r6, [r4], #4 + LDRGE r8, [r5], #4 + ; Stall + SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) + SUBGE r12, r12, #2 + SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) + ADDS r12, r12, #1 + LDRGEH r6, [r4], #2 + LDRGEH r8, [r5], #2 + ; Stall + SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) + ; maxcorr = max(maxcorr, sum) + CMP r0, r14 + ; xcorr[i] = sum + STR r14, [r2], #4 + MOVLT r0, r14 +celt_pitch_xcorr_edsp_done + LDMFD sp!, {r4-r11, pc} + ENDP + +ENDIF + +END diff --git a/celt/arm/fixed_armv4.h b/celt/arm/fixed_armv4.h index bcacc34..b690bc8 100644 --- a/celt/arm/fixed_armv4.h +++ b/celt/arm/fixed_armv4.h @@ -29,7 +29,7 @@ /** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */ #undef MULT16_32_Q16 -static inline opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b) +static OPUS_INLINE opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b) { unsigned rd_lo; int rd_hi; @@ -46,7 +46,7 @@ static inline opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b) /** 16x32 multiplication, followed by a 15-bit shift right. 
Results fits in 32 bits */ #undef MULT16_32_Q15 -static inline opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b) +static OPUS_INLINE opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b) { unsigned rd_lo; int rd_hi; diff --git a/celt/arm/fixed_armv5e.h b/celt/arm/fixed_armv5e.h index 80632c4..1194a7d 100644 --- a/celt/arm/fixed_armv5e.h +++ b/celt/arm/fixed_armv5e.h @@ -34,7 +34,7 @@ /** 16x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */ #undef MULT16_32_Q16 -static inline opus_val32 MULT16_32_Q16_armv5e(opus_val16 a, opus_val32 b) +static OPUS_INLINE opus_val32 MULT16_32_Q16_armv5e(opus_val16 a, opus_val32 b) { int res; __asm__( @@ -50,7 +50,7 @@ static inline opus_val32 MULT16_32_Q16_armv5e(opus_val16 a, opus_val32 b) /** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */ #undef MULT16_32_Q15 -static inline opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b) +static OPUS_INLINE opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b) { int res; __asm__( @@ -68,7 +68,7 @@ static inline opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b) b must fit in 31 bits. Result fits in 32 bits. 
*/ #undef MAC16_32_Q15 -static inline opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a, +static OPUS_INLINE opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a, opus_val32 b) { int res; @@ -84,7 +84,7 @@ static inline opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a, /** 16x16 multiply-add where the result fits in 32 bits */ #undef MAC16_16 -static inline opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a, +static OPUS_INLINE opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a, opus_val16 b) { int res; @@ -100,7 +100,7 @@ static inline opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a, /** 16x16 multiplication where the result fits in 32 bits */ #undef MULT16_16 -static inline opus_val32 MULT16_16_armv5e(opus_val16 a, opus_val16 b) +static OPUS_INLINE opus_val32 MULT16_16_armv5e(opus_val16 a, opus_val16 b) { int res; __asm__( diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h new file mode 100644 index 0000000..a07f8ac --- /dev/null +++ b/celt/arm/pitch_arm.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2010 Xiph.Org Foundation + * Copyright (c) 2013 Parrot */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if !defined(PITCH_ARM_H) +# define PITCH_ARM_H + +# include "armcpu.h" + +# if defined(FIXED_POINT) + +# if defined(OPUS_ARM_MAY_HAVE_NEON) +opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch); +# endif + +# if defined(OPUS_ARM_MAY_HAVE_MEDIA) +# define celt_pitch_xcorr_media MAY_HAVE_EDSP(celt_pitch_xcorr) +# endif + +# if defined(OPUS_ARM_MAY_HAVE_EDSP) +opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch); +# endif + +# if !defined(OPUS_HAVE_RTCD) +# define OVERRIDE_PITCH_XCORR (1) +# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ + ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch)) +# endif + +# endif + +#endif diff --git a/celt/bands.c b/celt/bands.c index 93bd0bc..cce56e2 100644 --- a/celt/bands.c +++ b/celt/bands.c @@ -214,7 +214,9 @@ void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X, j=M*eBands[i]; band_end = M*eBands[i+1]; lg = ADD16(bandLogE[i+c*m->nbEBands], SHL16((opus_val16)eMeans[i],6)); -#ifdef FIXED_POINT +#ifndef FIXED_POINT + g = celt_exp2(lg); +#else /* Handle the integer part of the log energy */ shift = 16-(lg>>DB_SHIFT); if (shift>31) @@ -225,9 +227,23 @@ void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X, /* Handle the fractional part. 
*/ g = celt_exp2_frac(lg&((1<0); /*M*(eBands[end]-eBands[end-1]) <= 8 assures this*/ + celt_assert(nbBands>0); /* end has to be non-zero */ sum /= nbBands; /* Recursive averaging */ sum = (sum+*average)>>1; @@ -869,7 +885,6 @@ static unsigned quant_partition(struct band_ctx *ctx, celt_norm *X, int q; int curr_bits; int imid=0, iside=0; - int N_B=N; int B0=B; opus_val16 mid=0, side=0; unsigned cm=0; @@ -891,8 +906,6 @@ static unsigned quant_partition(struct band_ctx *ctx, celt_norm *X, spread = ctx->spread; ec = ctx->ec; - N_B /= B; - /* If we need 1.5 more bit than we can produce, split the band in two. */ cache = m->cache.bits + m->cache.index[(LM+1)*m->nbEBands+i]; if (LM != -1 && b > cache[cache[0]]+12 && N>2) @@ -1072,7 +1085,6 @@ static unsigned quant_band(struct band_ctx *ctx, celt_norm *X, longBlocks = B0==1; N_B /= B; - N_B0 = N_B; /* Special case for one sample */ if (N==1) diff --git a/celt/celt.h b/celt/celt.h index cdb76c8..5deea1f 100644 --- a/celt/celt.h +++ b/celt/celt.h @@ -66,6 +66,10 @@ typedef struct { /* Encoder/decoder Requests */ +/* Expose this option again when variable framesize actually works */ +#define OPUS_FRAMESIZE_VARIABLE 5010 /**< Optimize the frame size dynamically */ + + #define CELT_SET_PREDICTION_REQUEST 10002 /** Controls the use of interframe prediction. 
0=Independent frames @@ -118,7 +122,8 @@ int celt_encoder_get_size(int channels); int celt_encode_with_ec(OpusCustomEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc); -int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels); +int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels, + int arch); @@ -138,7 +143,7 @@ int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned cha #ifdef CUSTOM_MODES #define OPUS_CUSTOM_NOSTATIC #else -#define OPUS_CUSTOM_NOSTATIC static inline +#define OPUS_CUSTOM_NOSTATIC static OPUS_INLINE #endif static const unsigned char trim_icdf[11] = {126, 124, 119, 109, 87, 41, 19, 9, 4, 2, 0}; @@ -163,7 +168,7 @@ static const unsigned char fromOpusTable[16] = { 0x00, 0x08, 0x10, 0x18 }; -static inline int toOpus(unsigned char c) +static OPUS_INLINE int toOpus(unsigned char c) { int ret=0; if (c<0xA0) @@ -174,7 +179,7 @@ static inline int toOpus(unsigned char c) return ret|(c&0x7); } -static inline int fromOpus(unsigned char c) +static OPUS_INLINE int fromOpus(unsigned char c) { if (c<0x80) return -1; @@ -190,7 +195,7 @@ extern const signed char tf_select_table[4][8]; int resampling_factor(opus_int32 rate); -void preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT inp, +void celt_preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT inp, int N, int CC, int upsample, const opus_val16 *coef, celt_sig *mem, int clip); void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c index 4424b97..830398e 100644 --- a/celt/celt_decoder.c +++ b/celt/celt_decoder.c @@ -175,7 +175,7 @@ void opus_custom_decoder_destroy(CELTDecoder *st) } #endif /* CUSTOM_MODES */ -static inline opus_val16 SIG2WORD16(celt_sig x) +static OPUS_INLINE opus_val16 SIG2WORD16(celt_sig x) { #ifdef FIXED_POINT x = PSHR32(x, 
SIG_SHIFT); @@ -213,7 +213,7 @@ void deemphasis(celt_sig *in[], opus_val16 *pcm, int N, int C, int downsample, c opus_val16 coef3 = coef[3]; for (j=0;j>1, opus_val16 ); - pitch_downsample(decode_mem, lp_pitch_buf, DECODE_BUFFER_SIZE, C); + pitch_downsample(decode_mem, lp_pitch_buf, + DECODE_BUFFER_SIZE, C, st->arch); pitch_search(lp_pitch_buf+(PLC_PITCH_LAG_MAX>>1), lp_pitch_buf, DECODE_BUFFER_SIZE-PLC_PITCH_LAG_MAX, - PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index); + PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index, st->arch); pitch_index = PLC_PITCH_LAG_MAX-pitch_index; st->last_pitch_index = pitch_index; } else { @@ -481,7 +482,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, opus_val16 * OPUS_R opus_val32 ac[LPC_ORDER+1]; /* Compute LPC coefficients for the last MAX_PERIOD samples before the first loss so we can work in the excitation-filter domain. */ - _celt_autocorr(exc, ac, window, overlap, LPC_ORDER, MAX_PERIOD); + _celt_autocorr(exc, ac, window, overlap, + LPC_ORDER, MAX_PERIOD, st->arch); /* Add a noise floor of -40 dB. 
*/ #ifdef FIXED_POINT ac[0] += SHR32(ac[0],13); @@ -665,7 +667,6 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat VARDECL(int, fine_priority); VARDECL(int, tf_res); VARDECL(unsigned char, collapse_masks); - celt_sig *out_mem[2]; celt_sig *decode_mem[2]; celt_sig *out_syn[2]; opus_val16 *lpc; @@ -706,7 +707,6 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat c=0; do { decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap); - out_mem[c] = decode_mem[c]+DECODE_BUFFER_SIZE-MAX_PERIOD; } while (++c_decode_mem+(DECODE_BUFFER_SIZE+overlap)*CC); oldBandE = lpc+CC*LPC_ORDER; @@ -936,7 +936,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat } while (++cupsample = resampling_factor(sampling_rate); - return OPUS_OK; -} - -OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels) +static int opus_custom_encoder_init_arch(CELTEncoder *st, const CELTMode *mode, + int channels, int arch) { if (channels < 0 || channels > 2) return OPUS_BAD_ARG; @@ -190,7 +181,7 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMod st->end = st->mode->effEBands; st->signalling = 1; - st->arch = opus_select_arch(); + st->arch = arch; st->constrained_vbr = 1; st->clip = 1; @@ -206,6 +197,25 @@ OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_init(CELTEncoder *st, const CELTMod return OPUS_OK; } +#ifdef CUSTOM_MODES +int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels) +{ + return opus_custom_encoder_init_arch(st, mode, channels, opus_select_arch()); +} +#endif + +int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels, + int arch) +{ + int ret; + ret = opus_custom_encoder_init_arch(st, + opus_custom_mode_create(48000, 960, NULL), channels, arch); + if (ret != OPUS_OK) + return ret; + st->upsample = resampling_factor(sampling_rate); + return OPUS_OK; +} + #ifdef 
CUSTOM_MODES void opus_custom_encoder_destroy(CELTEncoder *st) { @@ -240,7 +250,6 @@ static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int ALLOC(tmp, len, opus_val16); len2=len/2; - tf_max = 0; for (c=0;cvalid) { - trim -= MAX16(-QCONST16(2.f, 8), MIN16(QCONST16(2.f, 8), QCONST16(2.f, 8)*(analysis->tonality_slope+.05f))); + trim -= MAX16(-QCONST16(2.f, 8), MIN16(QCONST16(2.f, 8), + (opus_val16)(QCONST16(2.f, 8)*(analysis->tonality_slope+.05f)))); } #endif @@ -1023,11 +1035,12 @@ static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem, VARDECL(opus_val16, pitch_buf); ALLOC(pitch_buf, (COMBFILTER_MAXPERIOD+N)>>1, opus_val16); - pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC); + pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC, st->arch); /* Don't search for the fir last 1.5 octave of the range because there's too many false-positives due to short-term correlation */ pitch_search(pitch_buf+(COMBFILTER_MAXPERIOD>>1), pitch_buf, N, - COMBFILTER_MAXPERIOD-3*COMBFILTER_MINPERIOD, &pitch_index); + COMBFILTER_MAXPERIOD-3*COMBFILTER_MINPERIOD, &pitch_index, + st->arch); pitch_index = COMBFILTER_MAXPERIOD-pitch_index; gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD, @@ -1156,6 +1169,7 @@ static int compute_vbr(const CELTMode *mode, AnalysisInfo *analysis, opus_int32 coded_stereo_dof = (eBands[coded_stereo_bands]<eBands; tf_estimate = 0; if (nbCompressedBytes<2 || pcm==NULL) - return OPUS_BAD_ARG; + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } frame_size *= st->upsample; for (LM=0;LM<=mode->maxLM;LM++) if (mode->shortMdctSize<mode->maxLM) + { + RESTORE_STACK; return OPUS_BAD_ARG; + } M=1<shortMdctSize; @@ -1341,7 +1362,10 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, { int c0 = toOpus(compressed[0]); if (c0<0) + { + RESTORE_STACK; return OPUS_BAD_ARG; + } compressed[0] = c0; } compressed++; @@ -1375,6 +1399,8 @@ int 
celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, (tmp+4*mode->Fs)/(8*mode->Fs)-!!st->signalling)); effectiveBytes = nbCompressedBytes; } + if (st->bitrate != OPUS_BITRATE_MAX) + equiv_rate = st->bitrate - (40*C+20)*((400>>LM) - 50); if (enc==NULL) { @@ -1448,7 +1474,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, enc->nbits_total+=tell-ec_tell(enc); } c=0; do { - preemphasis(pcm+c, in+c*(N+st->overlap)+st->overlap, N, CC, st->upsample, + celt_preemphasis(pcm+c, in+c*(N+st->overlap)+st->overlap, N, CC, st->upsample, mode->preemph, st->preemph_memE+c, st->clip); } while (++clfe || nbAvailableBytes>12*C) && st->start==0 && !silence && !st->disable_pf + enabled = ((st->lfe&&nbAvailableBytes>3) || nbAvailableBytes>12*C) && st->start==0 && !silence && !st->disable_pf && st->complexity >= 5 && !(st->consec_transient && LM!=3 && st->variable_duration==OPUS_FRAMESIZE_VARIABLE); prefilter_tapset = st->tapset_decision; @@ -1612,7 +1638,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, if (!st->lfe) { opus_val16 follow=-QCONST16(10.0f,DB_SHIFT); - float frame_avg=0; + opus_val32 frame_avg=0; opus_val16 offset = shortBlocks?HALF16(SHL16(LM, DB_SHIFT)):0; for(i=st->start;iend;i++) { @@ -1710,7 +1736,8 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, /* Disable new spreading+tapset estimator until we can show it works better than the old one. So far it seems like spreading_decision() works best. 
*/ - if (0&&st->analysis.valid) +#if 0 + if (st->analysis.valid) { static const opus_val16 spread_thresholds[3] = {-QCONST16(.6f, 15), -QCONST16(.2f, 15), -QCONST16(.07f, 15)}; static const opus_val16 spread_histeresis[3] = {QCONST16(.15f, 15), QCONST16(.07f, 15), QCONST16(.02f, 15)}; @@ -1718,7 +1745,9 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, static const opus_val16 tapset_histeresis[2] = {QCONST16(.1f, 15), QCONST16(.05f, 15)}; st->spread_decision = hysteresis_decision(-st->analysis.tonality, spread_thresholds, spread_histeresis, 3, st->spread_decision); st->tapset_decision = hysteresis_decision(st->analysis.tonality_slope, tapset_thresholds, tapset_histeresis, 2, st->tapset_decision); - } else { + } else +#endif + { st->spread_decision = spreading_decision(mode, X, &st->tonal_average, st->spread_decision, &st->hf_average, &st->tapset_decision, pf_on&&!shortBlocks, effEnd, C, M); @@ -1777,25 +1806,18 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, if (C==2) { - int effectiveRate; - static const opus_val16 intensity_thresholds[21]= /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 off*/ - { 16,21,23,25,27,29,31,33,35,38,42,46,50,54,58,63,68,75,84,102,130}; + { 1, 2, 3, 4, 5, 6, 7, 8,16,24,36,44,50,56,62,67,72,79,88,106,134}; static const opus_val16 intensity_histeresis[21]= - { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 5, 6, 8, 12}; + { 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4, 5, 6, 8, 8}; /* Always use MS for 2.5 ms frames until we can do a better analysis */ if (LM!=0) dual_stereo = stereo_analysis(mode, X, LM, N); - /* Account for coarse energy */ - effectiveRate = (8*effectiveBytes - 80)>>LM; - - /* effectiveRate in kb/s */ - effectiveRate = 2*effectiveRate/5; - - st->intensity = hysteresis_decision((opus_val16)effectiveRate, intensity_thresholds, intensity_histeresis, 21, st->intensity); + st->intensity = hysteresis_decision((opus_val16)(equiv_rate/1000), + 
intensity_thresholds, intensity_histeresis, 21, st->intensity); st->intensity = IMIN(st->end,IMAX(st->start, st->intensity)); } @@ -1829,7 +1851,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, if (st->constrained_vbr) base_target += (st->vbr_offset>>lm_diff); - target = compute_vbr(mode, &st->analysis, base_target, LM, st->bitrate, + target = compute_vbr(mode, &st->analysis, base_target, LM, equiv_rate, st->lastCodedBands, C, st->intensity, st->constrained_vbr, st->stereo_saving, tot_boost, tf_estimate, pitch_change, maxDepth, st->variable_duration, st->lfe, st->energy_mask!=NULL, surround_masking, @@ -1913,13 +1935,13 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, if (st->analysis.valid) { int min_bandwidth; - if (st->bitrate < (opus_int32)32000*C) + if (equiv_rate < (opus_int32)32000*C) min_bandwidth = 13; - else if (st->bitrate < (opus_int32)48000*C) + else if (equiv_rate < (opus_int32)48000*C) min_bandwidth = 16; - else if (st->bitrate < (opus_int32)60000*C) + else if (equiv_rate < (opus_int32)60000*C) min_bandwidth = 18; - else if (st->bitrate < (opus_int32)80000*C) + else if (equiv_rate < (opus_int32)80000*C) min_bandwidth = 19; else min_bandwidth = 20; diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c index 7ffe90a..fa29d62 100644 --- a/celt/celt_lpc.c +++ b/celt/celt_lpc.c @@ -226,7 +226,8 @@ int _celt_autocorr( const opus_val16 *window, int overlap, int lag, - int n + int n, + int arch ) { opus_val32 d; @@ -275,7 +276,7 @@ int _celt_autocorr( shift = 0; } #endif - celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1); + celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1, arch); for (k=0;k<=lag;k++) { for (i = k+fastN, d = 0; i < n; i++) diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h index 19279a0..dc2a0a3 100644 --- a/celt/celt_lpc.h +++ b/celt/celt_lpc.h @@ -48,6 +48,7 @@ void celt_iir(const opus_val32 *x, int ord, opus_val16 *mem); -int _celt_autocorr(const opus_val16 *x, opus_val32 *ac, const 
opus_val16 *window, int overlap, int lag, int n); +int _celt_autocorr(const opus_val16 *x, opus_val32 *ac, + const opus_val16 *window, int overlap, int lag, int n, int arch); #endif /* PLC_H */ diff --git a/celt/cpu_support.h b/celt/cpu_support.h index 41481fe..d68dbe6 100644 --- a/celt/cpu_support.h +++ b/celt/cpu_support.h @@ -28,7 +28,10 @@ #ifndef CPU_SUPPORT_H #define CPU_SUPPORT_H -#if defined(OPUS_HAVE_RTCD) && defined(ARMv4_ASM) +#include "opus_types.h" +#include "opus_defines.h" + +#if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM) #include "arm/armcpu.h" /* We currently support 4 ARM variants: @@ -42,7 +45,7 @@ #else #define OPUS_ARCHMASK 0 -static inline int opus_select_arch(void) +static OPUS_INLINE int opus_select_arch(void) { return 0; } diff --git a/celt/cwrs.c b/celt/cwrs.c index 28e6561..ad980cc 100644 --- a/celt/cwrs.c +++ b/celt/cwrs.c @@ -410,7 +410,7 @@ static const opus_uint32 CELT_PVQ_U_DATA[1272]={ }; #if defined(CUSTOM_MODES) -const opus_uint32 *const CELT_PVQ_U_ROW[15]={ +static const opus_uint32 *const CELT_PVQ_U_ROW[15]={ CELT_PVQ_U_DATA+ 0,CELT_PVQ_U_DATA+ 208,CELT_PVQ_U_DATA+ 415, CELT_PVQ_U_DATA+ 621,CELT_PVQ_U_DATA+ 826,CELT_PVQ_U_DATA+1030, CELT_PVQ_U_DATA+1233,CELT_PVQ_U_DATA+1336,CELT_PVQ_U_DATA+1389, @@ -418,7 +418,7 @@ const opus_uint32 *const CELT_PVQ_U_ROW[15]={ CELT_PVQ_U_DATA+1464,CELT_PVQ_U_DATA+1470,CELT_PVQ_U_DATA+1473 }; #else -const opus_uint32 *const CELT_PVQ_U_ROW[15]={ +static const opus_uint32 *const CELT_PVQ_U_ROW[15]={ CELT_PVQ_U_DATA+ 0,CELT_PVQ_U_DATA+ 176,CELT_PVQ_U_DATA+ 351, CELT_PVQ_U_DATA+ 525,CELT_PVQ_U_DATA+ 698,CELT_PVQ_U_DATA+ 870, CELT_PVQ_U_DATA+1041,CELT_PVQ_U_DATA+1131,CELT_PVQ_U_DATA+1178, @@ -534,7 +534,7 @@ void decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){ /*Computes the next row/column of any recurrence that obeys the relation u[i][j]=u[i-1][j]+u[i][j-1]+u[i-1][j-1]. 
_ui0 is the base case for the new row/column.*/ -static inline void unext(opus_uint32 *_ui,unsigned _len,opus_uint32 _ui0){ +static OPUS_INLINE void unext(opus_uint32 *_ui,unsigned _len,opus_uint32 _ui0){ opus_uint32 ui1; unsigned j; /*This do-while will overrun the array if we don't have storage for at least @@ -550,7 +550,7 @@ static inline void unext(opus_uint32 *_ui,unsigned _len,opus_uint32 _ui0){ /*Computes the previous row/column of any recurrence that obeys the relation u[i-1][j]=u[i][j]-u[i][j-1]-u[i-1][j-1]. _ui0 is the base case for the new row/column.*/ -static inline void uprev(opus_uint32 *_ui,unsigned _n,opus_uint32 _ui0){ +static OPUS_INLINE void uprev(opus_uint32 *_ui,unsigned _n,opus_uint32 _ui0){ opus_uint32 ui1; unsigned j; /*This do-while will overrun the array if we don't have storage for at least @@ -617,7 +617,7 @@ static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){ of size 1 with associated sign bits. _y: The vector of pulses, whose sum of absolute values is K. _k: Returns K.*/ -static inline opus_uint32 icwrs1(const int *_y,int *_k){ +static OPUS_INLINE opus_uint32 icwrs1(const int *_y,int *_k){ *_k=abs(_y[0]); return _y[0]<0; } @@ -626,7 +626,7 @@ static inline opus_uint32 icwrs1(const int *_y,int *_k){ of size _n with associated sign bits. _y: The vector of pulses, whose sum of absolute values must be _k. 
_nc: Returns V(_n,_k).*/ -static inline opus_uint32 icwrs(int _n,int _k,opus_uint32 *_nc,const int *_y, +static OPUS_INLINE opus_uint32 icwrs(int _n,int _k,opus_uint32 *_nc,const int *_y, opus_uint32 *_u){ opus_uint32 i; int j; diff --git a/celt/ecintrin.h b/celt/ecintrin.h index be57dd4..2263cff 100644 --- a/celt/ecintrin.h +++ b/celt/ecintrin.h @@ -33,7 +33,7 @@ #if !defined(_ecintrin_H) # define _ecintrin_H (1) -/*Some specific platforms may have optimized intrinsic or inline assembly +/*Some specific platforms may have optimized intrinsic or OPUS_INLINE assembly versions of these functions which can substantially improve performance. We define macros for them to allow easy incorporation of these non-ANSI features.*/ diff --git a/celt/entcode.h b/celt/entcode.h index aebecc0..dd13e49 100644 --- a/celt/entcode.h +++ b/celt/entcode.h @@ -26,6 +26,7 @@ */ #include "opus_types.h" +#include "opus_defines.h" #if !defined(_entcode_H) # define _entcode_H (1) @@ -83,15 +84,15 @@ struct ec_ctx{ int error; }; -static inline opus_uint32 ec_range_bytes(ec_ctx *_this){ +static OPUS_INLINE opus_uint32 ec_range_bytes(ec_ctx *_this){ return _this->offs; } -static inline unsigned char *ec_get_buffer(ec_ctx *_this){ +static OPUS_INLINE unsigned char *ec_get_buffer(ec_ctx *_this){ return _this->buf; } -static inline int ec_get_error(ec_ctx *_this){ +static OPUS_INLINE int ec_get_error(ec_ctx *_this){ return _this->error; } @@ -101,7 +102,7 @@ static inline int ec_get_error(ec_ctx *_this){ Return: The number of bits. 
This will always be slightly larger than the exact value (e.g., all rounding error is in the positive direction).*/ -static inline int ec_tell(ec_ctx *_this){ +static OPUS_INLINE int ec_tell(ec_ctx *_this){ return _this->nbits_total-EC_ILOG(_this->rng); } diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h index f11d890..80bc949 100644 --- a/celt/fixed_debug.h +++ b/celt/fixed_debug.h @@ -33,9 +33,9 @@ #define FIXED_DEBUG_H #include +#include "opus_defines.h" #ifdef CELT_C -#include "opus_defines.h" OPUS_EXPORT opus_int64 celt_mips=0; #else extern opus_int64 celt_mips; @@ -59,7 +59,7 @@ extern opus_int64 celt_mips; #define SHR(a,b) SHR32(a,b) #define PSHR(a,b) PSHR32(a,b) -static inline short NEG16(int x) +static OPUS_INLINE short NEG16(int x) { int res; if (!VERIFY_SHORT(x)) @@ -80,7 +80,7 @@ static inline short NEG16(int x) celt_mips++; return res; } -static inline int NEG32(opus_int64 x) +static OPUS_INLINE int NEG32(opus_int64 x) { opus_int64 res; if (!VERIFY_INT(x)) @@ -103,7 +103,7 @@ static inline int NEG32(opus_int64 x) } #define EXTRACT16(x) EXTRACT16_(x, __FILE__, __LINE__) -static inline short EXTRACT16_(int x, char *file, int line) +static OPUS_INLINE short EXTRACT16_(int x, char *file, int line) { int res; if (!VERIFY_SHORT(x)) @@ -119,7 +119,7 @@ static inline short EXTRACT16_(int x, char *file, int line) } #define EXTEND32(x) EXTEND32_(x, __FILE__, __LINE__) -static inline int EXTEND32_(int x, char *file, int line) +static OPUS_INLINE int EXTEND32_(int x, char *file, int line) { int res; if (!VERIFY_SHORT(x)) @@ -135,7 +135,7 @@ static inline int EXTEND32_(int x, char *file, int line) } #define SHR16(a, shift) SHR16_(a, shift, __FILE__, __LINE__) -static inline short SHR16_(int a, int shift, char *file, int line) +static OPUS_INLINE short SHR16_(int a, int shift, char *file, int line) { int res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift)) @@ -157,7 +157,7 @@ static inline short SHR16_(int a, int shift, char *file, int line) return res; } #define 
SHL16(a, shift) SHL16_(a, shift, __FILE__, __LINE__) -static inline short SHL16_(int a, int shift, char *file, int line) +static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line) { int res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift)) @@ -179,7 +179,7 @@ static inline short SHL16_(int a, int shift, char *file, int line) return res; } -static inline int SHR32(opus_int64 a, int shift) +static OPUS_INLINE int SHR32(opus_int64 a, int shift) { opus_int64 res; if (!VERIFY_INT(a) || !VERIFY_SHORT(shift)) @@ -201,7 +201,7 @@ static inline int SHR32(opus_int64 a, int shift) return res; } #define SHL32(a, shift) SHL32_(a, shift, __FILE__, __LINE__) -static inline int SHL32_(opus_int64 a, int shift, char *file, int line) +static OPUS_INLINE int SHL32_(opus_int64 a, int shift, char *file, int line) { opus_int64 res; if (!VERIFY_INT(a) || !VERIFY_SHORT(shift)) @@ -234,7 +234,7 @@ static inline int SHL32_(opus_int64 a, int shift, char *file, int line) //#define SHL(a,shift) ((a) << (shift)) #define ADD16(a, b) ADD16_(a, b, __FILE__, __LINE__) -static inline short ADD16_(int a, int b, char *file, int line) +static OPUS_INLINE short ADD16_(int a, int b, char *file, int line) { int res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) @@ -257,7 +257,7 @@ static inline short ADD16_(int a, int b, char *file, int line) } #define SUB16(a, b) SUB16_(a, b, __FILE__, __LINE__) -static inline short SUB16_(int a, int b, char *file, int line) +static OPUS_INLINE short SUB16_(int a, int b, char *file, int line) { int res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) @@ -280,7 +280,7 @@ static inline short SUB16_(int a, int b, char *file, int line) } #define ADD32(a, b) ADD32_(a, b, __FILE__, __LINE__) -static inline int ADD32_(opus_int64 a, opus_int64 b, char *file, int line) +static OPUS_INLINE int ADD32_(opus_int64 a, opus_int64 b, char *file, int line) { opus_int64 res; if (!VERIFY_INT(a) || !VERIFY_INT(b)) @@ -303,7 +303,7 @@ static inline int ADD32_(opus_int64 a, opus_int64 b, char 
*file, int line) } #define SUB32(a, b) SUB32_(a, b, __FILE__, __LINE__) -static inline int SUB32_(opus_int64 a, opus_int64 b, char *file, int line) +static OPUS_INLINE int SUB32_(opus_int64 a, opus_int64 b, char *file, int line) { opus_int64 res; if (!VERIFY_INT(a) || !VERIFY_INT(b)) @@ -327,7 +327,7 @@ static inline int SUB32_(opus_int64 a, opus_int64 b, char *file, int line) #undef UADD32 #define UADD32(a, b) UADD32_(a, b, __FILE__, __LINE__) -static inline unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file, int line) +static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file, int line) { opus_uint64 res; if (!VERIFY_UINT(a) || !VERIFY_UINT(b)) @@ -351,7 +351,7 @@ static inline unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file, int #undef USUB32 #define USUB32(a, b) USUB32_(a, b, __FILE__, __LINE__) -static inline unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file, int line) +static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file, int line) { opus_uint64 res; if (!VERIFY_UINT(a) || !VERIFY_UINT(b)) @@ -381,7 +381,7 @@ static inline unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file, int } /* result fits in 16 bits */ -static inline short MULT16_16_16(int a, int b) +static OPUS_INLINE short MULT16_16_16(int a, int b) { int res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) @@ -404,7 +404,7 @@ static inline short MULT16_16_16(int a, int b) } #define MULT16_16(a, b) MULT16_16_(a, b, __FILE__, __LINE__) -static inline int MULT16_16_(int a, int b, char *file, int line) +static OPUS_INLINE int MULT16_16_(int a, int b, char *file, int line) { opus_int64 res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) @@ -429,7 +429,7 @@ static inline int MULT16_16_(int a, int b, char *file, int line) #define MAC16_16(c,a,b) (celt_mips-=2,ADD32((c),MULT16_16((a),(b)))) #define MULT16_32_QX(a, b, Q) MULT16_32_QX_(a, b, Q, __FILE__, __LINE__) -static inline int MULT16_32_QX_(int a, opus_int64 b, int Q, 
char *file, int line) +static OPUS_INLINE int MULT16_32_QX_(int a, opus_int64 b, int Q, char *file, int line) { opus_int64 res; if (!VERIFY_SHORT(a) || !VERIFY_INT(b)) @@ -462,7 +462,7 @@ static inline int MULT16_32_QX_(int a, opus_int64 b, int Q, char *file, int line } #define MULT16_32_PX(a, b, Q) MULT16_32_PX_(a, b, Q, __FILE__, __LINE__) -static inline int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int line) +static OPUS_INLINE int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int line) { opus_int64 res; if (!VERIFY_SHORT(a) || !VERIFY_INT(b)) @@ -497,7 +497,7 @@ static inline int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int line #define MULT16_32_Q15(a,b) MULT16_32_QX(a,b,15) #define MAC16_32_Q15(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q15((a),(b)))) -static inline int SATURATE(int a, int b) +static OPUS_INLINE int SATURATE(int a, int b) { if (a>b) a=b; @@ -507,7 +507,17 @@ static inline int SATURATE(int a, int b) return a; } -static inline int MULT16_16_Q11_32(int a, int b) +static OPUS_INLINE opus_int16 SATURATE16(opus_int32 a) +{ + celt_mips+=3; + if (a>32767) + return 32767; + else if (a<-32768) + return -32768; + else return a; +} + +static OPUS_INLINE int MULT16_16_Q11_32(int a, int b) { opus_int64 res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) @@ -529,7 +539,7 @@ static inline int MULT16_16_Q11_32(int a, int b) celt_mips+=3; return res; } -static inline short MULT16_16_Q13(int a, int b) +static OPUS_INLINE short MULT16_16_Q13(int a, int b) { opus_int64 res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) @@ -551,7 +561,7 @@ static inline short MULT16_16_Q13(int a, int b) celt_mips+=3; return res; } -static inline short MULT16_16_Q14(int a, int b) +static OPUS_INLINE short MULT16_16_Q14(int a, int b) { opus_int64 res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) @@ -575,7 +585,7 @@ static inline short MULT16_16_Q14(int a, int b) } #define MULT16_16_Q15(a, b) MULT16_16_Q15_(a, b, __FILE__, __LINE__) -static inline short 
MULT16_16_Q15_(int a, int b, char *file, int line) +static OPUS_INLINE short MULT16_16_Q15_(int a, int b, char *file, int line) { opus_int64 res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) @@ -598,7 +608,7 @@ static inline short MULT16_16_Q15_(int a, int b, char *file, int line) return res; } -static inline short MULT16_16_P13(int a, int b) +static OPUS_INLINE short MULT16_16_P13(int a, int b) { opus_int64 res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) @@ -628,7 +638,7 @@ static inline short MULT16_16_P13(int a, int b) celt_mips+=4; return res; } -static inline short MULT16_16_P14(int a, int b) +static OPUS_INLINE short MULT16_16_P14(int a, int b) { opus_int64 res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) @@ -658,7 +668,7 @@ static inline short MULT16_16_P14(int a, int b) celt_mips+=4; return res; } -static inline short MULT16_16_P15(int a, int b) +static OPUS_INLINE short MULT16_16_P15(int a, int b) { opus_int64 res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(b)) @@ -691,7 +701,7 @@ static inline short MULT16_16_P15(int a, int b) #define DIV32_16(a, b) DIV32_16_(a, b, __FILE__, __LINE__) -static inline int DIV32_16_(opus_int64 a, opus_int64 b, char *file, int line) +static OPUS_INLINE int DIV32_16_(opus_int64 a, opus_int64 b, char *file, int line) { opus_int64 res; if (b==0) @@ -726,7 +736,7 @@ static inline int DIV32_16_(opus_int64 a, opus_int64 b, char *file, int line) } #define DIV32(a, b) DIV32_(a, b, __FILE__, __LINE__) -static inline int DIV32_(opus_int64 a, opus_int64 b, char *file, int line) +static OPUS_INLINE int DIV32_(opus_int64 a, opus_int64 b, char *file, int line) { opus_int64 res; if (b==0) diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h index 657e67c..ecf018a 100644 --- a/celt/fixed_generic.h +++ b/celt/fixed_generic.h @@ -40,7 +40,7 @@ #define MULT16_32_Q16(a,b) ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16)) /** 16x32 multiplication, followed by a 16-bit shift right (round-to-nearest). 
Results fits in 32 bits */ -#define MULT16_32_P16(a,b) ADD32(MULT16_16((a),SHR((b),16)), PSHR(MULT16_16((a),((b)&0x0000ffff)),16)) +#define MULT16_32_P16(a,b) ADD32(MULT16_16((a),SHR((b),16)), PSHR(MULT16_16SU((a),((b)&0x0000ffff)),16)) /** 16x32 multiplication, followed by a 15-bit shift right. Results fits in 32 bits */ #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15)) diff --git a/celt/float_cast.h b/celt/float_cast.h index 5ded291..ede6574 100644 --- a/celt/float_cast.h +++ b/celt/float_cast.h @@ -101,7 +101,7 @@ #include /* Win32 doesn't seem to have these functions. - ** Therefore implement inline versions of these functions here. + ** Therefore implement OPUS_INLINE versions of these functions here. */ __inline long int @@ -128,7 +128,7 @@ #endif #ifndef DISABLE_FLOAT_API -static inline opus_int16 FLOAT2INT16(float x) +static OPUS_INLINE opus_int16 FLOAT2INT16(float x) { x = x*CELT_SIG_SCALE; x = MAX32(x, -32768); diff --git a/celt/mathops.c b/celt/mathops.c index 21fd942..3f8c5dc 100644 --- a/celt/mathops.c +++ b/celt/mathops.c @@ -139,7 +139,7 @@ opus_val32 celt_sqrt(opus_val32 x) #define L3 8277 #define L4 -626 -static inline opus_val16 _celt_cos_pi_2(opus_val16 x) +static OPUS_INLINE opus_val16 _celt_cos_pi_2(opus_val16 x) { opus_val16 x2; diff --git a/celt/mathops.h b/celt/mathops.h index 7e7d906..a0525a9 100644 --- a/celt/mathops.h +++ b/celt/mathops.h @@ -44,7 +44,7 @@ unsigned isqrt32(opus_uint32 _val); #ifndef OVERRIDE_CELT_MAXABS16 -static inline opus_val32 celt_maxabs16(const opus_val16 *x, int len) +static OPUS_INLINE opus_val32 celt_maxabs16(const opus_val16 *x, int len) { int i; opus_val16 maxval = 0; @@ -60,7 +60,7 @@ static inline opus_val32 celt_maxabs16(const opus_val16 *x, int len) #ifndef OVERRIDE_CELT_MAXABS32 #ifdef FIXED_POINT -static inline opus_val32 celt_maxabs32(const opus_val32 *x, int len) +static OPUS_INLINE opus_val32 celt_maxabs32(const opus_val32 *x, int len) { int 
i; opus_val32 maxval = 0; @@ -95,7 +95,7 @@ static inline opus_val32 celt_maxabs32(const opus_val32 *x, int len) denorm, +/- inf and NaN are *not* handled */ /** Base-2 log approximation (log2(x)). */ -static inline float celt_log2(float x) +static OPUS_INLINE float celt_log2(float x) { int integer; float frac; @@ -113,7 +113,7 @@ static inline float celt_log2(float x) } /** Base-2 exponential approximation (2^x). */ -static inline float celt_exp2(float x) +static OPUS_INLINE float celt_exp2(float x) { int integer; float frac; @@ -145,7 +145,7 @@ static inline float celt_exp2(float x) #ifndef OVERRIDE_CELT_ILOG2 /** Integer log in base2. Undefined for zero and negative numbers */ -static inline opus_int16 celt_ilog2(opus_int32 x) +static OPUS_INLINE opus_int16 celt_ilog2(opus_int32 x) { celt_assert2(x>0, "celt_ilog2() only defined for strictly positive numbers"); return EC_ILOG(x)-1; @@ -154,7 +154,7 @@ static inline opus_int16 celt_ilog2(opus_int32 x) /** Integer log in base2. Defined for zero, but not for negative numbers */ -static inline opus_int16 celt_zlog2(opus_val32 x) +static OPUS_INLINE opus_int16 celt_zlog2(opus_val32 x) { return x <= 0 ? 0 : celt_ilog2(x); } @@ -166,7 +166,7 @@ opus_val32 celt_sqrt(opus_val32 x); opus_val16 celt_cos_norm(opus_val32 x); /** Base-2 logarithm approximation (log2(x)). (Q14 input, Q10 output) */ -static inline opus_val16 celt_log2(opus_val32 x) +static OPUS_INLINE opus_val16 celt_log2(opus_val32 x) { int i; opus_val16 n, frac; @@ -192,14 +192,14 @@ static inline opus_val16 celt_log2(opus_val32 x) #define D2 14819 #define D3 10204 -static inline opus_val32 celt_exp2_frac(opus_val16 x) +static OPUS_INLINE opus_val32 celt_exp2_frac(opus_val16 x) { opus_val16 frac; frac = SHL16(x, 4); return ADD16(D0, MULT16_16_Q15(frac, ADD16(D1, MULT16_16_Q15(frac, ADD16(D2 , MULT16_16_Q15(D3,frac)))))); } /** Base-2 exponential approximation (2^x). 
(Q10 input, Q16 output) */ -static inline opus_val32 celt_exp2(opus_val16 x) +static OPUS_INLINE opus_val32 celt_exp2(opus_val16 x) { int integer; opus_val16 frac; @@ -225,7 +225,7 @@ opus_val32 frac_div32(opus_val32 a, opus_val32 b); /* Atan approximation using a 4th order polynomial. Input is in Q15 format and normalized by pi/4. Output is in Q15 format */ -static inline opus_val16 celt_atan01(opus_val16 x) +static OPUS_INLINE opus_val16 celt_atan01(opus_val16 x) { return MULT16_16_P15(x, ADD32(M1, MULT16_16_P15(x, ADD32(M2, MULT16_16_P15(x, ADD32(M3, MULT16_16_P15(M4, x))))))); } @@ -236,7 +236,7 @@ static inline opus_val16 celt_atan01(opus_val16 x) #undef M4 /* atan2() approximation valid for positive input values */ -static inline opus_val16 celt_atan2p(opus_val16 y, opus_val16 x) +static OPUS_INLINE opus_val16 celt_atan2p(opus_val16 y, opus_val16 x) { if (y < x) { diff --git a/celt/os_support.h b/celt/os_support.h index 2484f0b..5e47e3c 100644 --- a/celt/os_support.h +++ b/celt/os_support.h @@ -35,13 +35,16 @@ # include "custom_support.h" #endif +#include "opus_types.h" +#include "opus_defines.h" + #include #include #include /** Opus wrapper for malloc(). To do your own dynamic allocation, all you need to do is replace this function and opus_free */ #ifndef OVERRIDE_OPUS_ALLOC -static inline void *opus_alloc (size_t size) +static OPUS_INLINE void *opus_alloc (size_t size) { return malloc(size); } @@ -49,7 +52,7 @@ static inline void *opus_alloc (size_t size) /** Same as celt_alloc(), except that the area is only needed inside a CELT call (might cause problem with wideband though) */ #ifndef OVERRIDE_OPUS_ALLOC_SCRATCH -static inline void *opus_alloc_scratch (size_t size) +static OPUS_INLINE void *opus_alloc_scratch (size_t size) { /* Scratch space doesn't need to be cleared */ return opus_alloc(size); @@ -58,7 +61,7 @@ static inline void *opus_alloc_scratch (size_t size) /** Opus wrapper for free(). 
To do your own dynamic allocation, all you need to do is replace this function and opus_alloc */ #ifndef OVERRIDE_OPUS_FREE -static inline void opus_free (void *ptr) +static OPUS_INLINE void opus_free (void *ptr) { free(ptr); } diff --git a/celt/pitch.c b/celt/pitch.c index 0352b30..d2b3054 100644 --- a/celt/pitch.c +++ b/celt/pitch.c @@ -145,7 +145,7 @@ static void celt_fir5(const opus_val16 *x, void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp, - int len, int C) + int len, int C, int arch) { int i; opus_val32 ac[5]; @@ -180,7 +180,7 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x } _celt_autocorr(x_lp, ac, NULL, 0, - 4, len>>1); + 4, len>>1, arch); /* Noise floor -40 dB */ #ifdef FIXED_POINT @@ -250,9 +250,14 @@ opus_val32 #else void #endif -celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch) +celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch) { int i,j; + /*The EDSP version requires that max_pitch is at least 1, and that _x is + 32-bit aligned. 
+ Since it's hard to put asserts in assembly, put them here.*/ + celt_assert(max_pitch>0); + celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); #ifdef FIXED_POINT opus_val32 maxcorr=1; #endif @@ -289,7 +294,7 @@ celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, #endif void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y, - int len, int max_pitch, int *pitch) + int len, int max_pitch, int *pitch, int arch) { int i, j; int lag; @@ -342,7 +347,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR #ifdef FIXED_POINT maxcorr = #endif - celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2); + celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2, arch); find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch #ifdef FIXED_POINT diff --git a/celt/pitch.h b/celt/pitch.h index caffd24..df317ec 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -35,16 +35,21 @@ #define PITCH_H #include "modes.h" +#include "cpu_support.h" #if defined(__SSE__) && !defined(FIXED_POINT) #include "x86/pitch_sse.h" #endif +#if defined(OPUS_ARM_ASM) && defined(FIXED_POINT) +# include "arm/pitch_arm.h" +#endif + void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp, - int len, int C); + int len, int C, int arch); void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y, - int len, int max_pitch, int *pitch); + int len, int max_pitch, int *pitch, int arch); opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, int N, int *T0, int prev_period, opus_val16 prev_gain); @@ -52,10 +57,11 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, /* OPT: This is the kernel you really want to optimize. It gets used a lot by the prefilter and by the PLC. 
*/ #ifndef OVERRIDE_XCORR_KERNEL -static inline void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len) +static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len) { int j; opus_val16 y_0, y_1, y_2, y_3; + celt_assert(len>=3); y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */ y_0=*y++; y_1=*y++; @@ -119,7 +125,7 @@ static inline void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus #endif /* OVERRIDE_XCORR_KERNEL */ #ifndef OVERRIDE_DUAL_INNER_PROD -static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, +static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) { int i; @@ -140,6 +146,28 @@ opus_val32 #else void #endif -celt_pitch_xcorr(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch); +celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch); + +#if !defined(OVERRIDE_PITCH_XCORR) +/*Is run-time CPU detection enabled on this platform?*/ +# if defined(OPUS_HAVE_RTCD) +extern +# if defined(FIXED_POINT) +opus_val32 +# else +void +# endif +(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, + const opus_val16 *, opus_val32 *, int, int); + +# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ + ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \ + xcorr, len, max_pitch)) +# else +# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ + ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch)) +# endif +#endif #endif diff --git a/celt/quant_bands.c b/celt/quant_bands.c index 79685e1..ac6952c 100644 --- a/celt/quant_bands.c +++ b/celt/quant_bands.c @@ -312,6 +312,7 @@ void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd, opus_int32 tell_intra; opus_uint32 
nstart_bytes; opus_uint32 nintra_bytes; + opus_uint32 save_bytes; int badness2; VARDECL(unsigned char, intra_bits); @@ -322,7 +323,10 @@ void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd, nstart_bytes = ec_range_bytes(&enc_start_state); nintra_bytes = ec_range_bytes(&enc_intra_state); intra_buf = ec_get_buffer(&enc_intra_state) + nstart_bytes; - ALLOC(intra_bits, nintra_bytes-nstart_bytes, unsigned char); + save_bytes = nintra_bytes-nstart_bytes; + if (save_bytes == 0) + save_bytes = ALLOC_NONE; + ALLOC(intra_bits, save_bytes, unsigned char); /* Copy bits from intra bit-stream */ OPUS_COPY(intra_bits, intra_buf, nintra_bytes - nstart_bytes); diff --git a/celt/rate.c b/celt/rate.c index e474cf5..e13d839 100644 --- a/celt/rate.c +++ b/celt/rate.c @@ -245,7 +245,7 @@ void compute_pulse_cache(CELTMode *m, int LM) #define ALLOC_STEPS 6 -static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int skip_start, +static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end, int skip_start, const int *bits1, const int *bits2, const int *thresh, const int *cap, opus_int32 total, opus_int32 *_balance, int skip_rsv, int *intensity, int intensity_rsv, int *dual_stereo, int dual_stereo_rsv, int *bits, int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev, int signalBandwidth) diff --git a/celt/rate.h b/celt/rate.h index 263fde9..f1e0661 100644 --- a/celt/rate.h +++ b/celt/rate.h @@ -45,12 +45,12 @@ void compute_pulse_cache(CELTMode *m, int LM); -static inline int get_pulses(int i) +static OPUS_INLINE int get_pulses(int i) { return i<8 ? 
i : (8 + (i&7)) << ((i>>3)-1); } -static inline int bits2pulses(const CELTMode *m, int band, int LM, int bits) +static OPUS_INLINE int bits2pulses(const CELTMode *m, int band, int LM, int bits) { int i; int lo, hi; @@ -77,7 +77,7 @@ static inline int bits2pulses(const CELTMode *m, int band, int LM, int bits) return hi; } -static inline int pulses2bits(const CELTMode *m, int band, int LM, int pulses) +static OPUS_INLINE int pulses2bits(const CELTMode *m, int band, int LM, int pulses) { const unsigned char *cache; diff --git a/celt/stack_alloc.h b/celt/stack_alloc.h index 1c093a8..316a6ce 100644 --- a/celt/stack_alloc.h +++ b/celt/stack_alloc.h @@ -32,6 +32,9 @@ #ifndef STACK_ALLOC_H #define STACK_ALLOC_H +#include "opus_types.h" +#include "opus_defines.h" + #if (!defined (VAR_ARRAYS) && !defined (USE_ALLOCA) && !defined (NONTHREADSAFE_PSEUDOSTACK)) #error "Opus requires one of VAR_ARRAYS, USE_ALLOCA, or NONTHREADSAFE_PSEUDOSTACK be defined to select the temporary allocation mode." #endif @@ -92,6 +95,8 @@ #define SAVE_STACK #define RESTORE_STACK #define ALLOC_STACK +/* C99 does not allow VLAs of size zero */ +#define ALLOC_NONE 1 #elif defined(USE_ALLOCA) @@ -106,6 +111,7 @@ #define SAVE_STACK #define RESTORE_STACK #define ALLOC_STACK +#define ALLOC_NONE 0 #else @@ -143,6 +149,7 @@ extern char *global_stack_top; #define VARDECL(type, var) type *var #define ALLOC(var, size, type) var = PUSH(global_stack, size, type) #define SAVE_STACK char *_saved_stack = global_stack; +#define ALLOC_NONE 0 #endif /* VAR_ARRAYS */ @@ -159,7 +166,7 @@ extern char *global_stack_top; #else -static inline int _opus_false(void) {return 0;} +static OPUS_INLINE int _opus_false(void) {return 0;} #define OPUS_CHECK_ARRAY(ptr, len) _opus_false() #define OPUS_CHECK_VALUE(value) _opus_false() #define OPUS_PRINT_INT(value) do{}while(0) diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h index 63ae3d4..695122a 100644 --- a/celt/x86/pitch_sse.h +++ b/celt/x86/pitch_sse.h @@ -36,7 +36,7 @@ 
#include "arch.h" #define OVERRIDE_XCORR_KERNEL -static inline void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) +static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) { int j; __m128 xsum1, xsum2; @@ -72,7 +72,7 @@ static inline void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_v } #define OVERRIDE_DUAL_INNER_PROD -static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, +static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) { int i; @@ -102,7 +102,7 @@ static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, c } #define OVERRIDE_COMB_FILTER_CONST -static inline void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, +static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, opus_val16 g10, opus_val16 g11, opus_val16 g12) { int i; -- cgit v1.2.3