aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndroid Build Coastguard Worker <android-build-coastguard-worker@google.com>2023-07-07 05:12:29 +0000
committerAndroid Build Coastguard Worker <android-build-coastguard-worker@google.com>2023-07-07 05:12:29 +0000
commit2392ca434cdd13464fd35633462326c68711add7 (patch)
treeb341d3f924dbdedcb41b77de3cff93833290881e
parentd6ec3ccdf3c873ea461eb340f58d9e072e9bb3a4 (diff)
parentfb61b35abb4b7d3f9d6efde2879fa0da61c14f3b (diff)
downloadlibopus-android14-mainline-uwb-release.tar.gz
Change-Id: If70323a4896a46f69ee1931b3faaf1f8e899a92a
-rw-r--r--Android.bp2
-rw-r--r--CMakeLists.txt80
-rw-r--r--METADATA16
-rw-r--r--Makefile.am15
-rw-r--r--celt/arm/armcpu.c9
-rw-r--r--celt/arm/pitch_neon_intr.c51
-rw-r--r--celt/bands.c9
-rw-r--r--celt/celt_decoder.c57
-rw-r--r--celt/celt_encoder.c14
-rw-r--r--celt/celt_lpc.c72
-rw-r--r--celt/cpu_support.h5
-rw-r--r--celt/fixed_debug.h71
-rw-r--r--celt/fixed_generic.h10
-rw-r--r--celt/float_cast.h4
-rw-r--r--celt/kiss_fft.h12
-rw-r--r--celt/mathops.h2
-rw-r--r--celt/meson.build8
-rw-r--r--celt/modes.c3
-rw-r--r--celt/pitch.c23
-rw-r--r--celt/rate.c2
-rw-r--r--celt/tests/test_unit_dft.c3
-rw-r--r--celt/tests/test_unit_entropy.c4
-rw-r--r--celt/tests/test_unit_mdct.c3
-rw-r--r--celt/x86/celt_lpc_sse.h5
-rw-r--r--celt/x86/pitch_sse.h6
-rw-r--r--celt/x86/pitch_sse4_1.c51
-rw-r--r--celt/x86/x86cpu.c37
-rw-r--r--celt/x86/x86cpu.h46
-rw-r--r--celt_sources.mk8
-rw-r--r--cmake/OpusConfig.cmake29
-rw-r--r--cmake/OpusFunctions.cmake30
-rw-r--r--cmake/OpusSources.cmake5
-rw-r--r--cmake/RunTest.cmake61
-rw-r--r--cmake/cpu_info_by_asm.c31
-rw-r--r--cmake/cpu_info_by_c.c9
-rw-r--r--configure.ac7
-rw-r--r--fuzzer/Android.bp8
-rw-r--r--include/opus.h4
-rw-r--r--include/opus_custom.h3
-rw-r--r--include/opus_defines.h6
-rw-r--r--libopus_blocklist.txt8
-rw-r--r--meson.build4
-rwxr-xr-xmeson/get-version.py2
-rw-r--r--opus.m42
-rw-r--r--silk/LPC_fit.c3
-rw-r--r--silk/MacroDebug.h54
-rw-r--r--silk/NSQ.c36
-rw-r--r--silk/NSQ_del_dec.c28
-rw-r--r--silk/SigProc_FIX.h10
-rw-r--r--silk/VQ_WMat_EC.c4
-rw-r--r--silk/bwexpander_32.c3
-rw-r--r--silk/control_codec.c2
-rw-r--r--silk/enc_API.c15
-rw-r--r--silk/fixed/LTP_scale_ctrl_FIX.c11
-rw-r--r--silk/fixed/burg_modified_FIX.c8
-rw-r--r--silk/fixed/find_pred_coefs_FIX.c3
-rw-r--r--silk/fixed/vector_ops_FIX.c2
-rw-r--r--silk/fixed/x86/burg_modified_FIX_sse4_1.c69
-rw-r--r--silk/fixed/x86/prefilter_FIX_sse.c160
-rw-r--r--silk/fixed/x86/vector_ops_FIX_sse4_1.c43
-rw-r--r--silk/float/LTP_scale_ctrl_FLP.c10
-rw-r--r--silk/float/find_pred_coefs_FLP.c3
-rw-r--r--silk/float/wrappers_FLP.c10
-rw-r--r--silk/main.h60
-rw-r--r--silk/meson.build10
-rw-r--r--silk/stereo_LR_to_MS.c8
-rw-r--r--silk/stereo_MS_to_LR.c4
-rw-r--r--silk/tests/test_unit_LPC_inv_pred_gain.c2
-rw-r--r--silk/x86/NSQ_del_dec_sse4_1.c180
-rw-r--r--silk/x86/NSQ_sse4_1.c213
-rw-r--r--silk/x86/SigProc_FIX_sse.h49
-rw-r--r--silk/x86/VAD_sse4_1.c28
-rw-r--r--silk/x86/VQ_WMat_EC_sse4_1.c189
-rw-r--r--silk/x86/main_sse.h237
-rw-r--r--silk/x86/x86_silk_map.c91
-rw-r--r--silk_sources.mk10
-rw-r--r--src/opus_decoder.c15
-rw-r--r--src/opus_encoder.c43
-rw-r--r--src/opus_multistream_encoder.c9
-rwxr-xr-xtests/opus_build_test.sh29
-rw-r--r--tests/opus_encode_regressions.c35
-rwxr-xr-xtests/random_config.sh126
-rw-r--r--tests/test_opus_api.c2
-rw-r--r--tests/test_opus_common.h2
-rw-r--r--tests/test_opus_encode.c4
-rw-r--r--tests/test_opus_padding.c9
-rw-r--r--tests/test_opus_projection.c1
87 files changed, 1639 insertions, 1028 deletions
diff --git a/Android.bp b/Android.bp
index f2f09f8d..29a421c4 100644
--- a/Android.bp
+++ b/Android.bp
@@ -383,7 +383,7 @@ cc_library {
apex_available: [
"//apex_available:platform", // used by libstagefright_soft_opusdec
"com.android.media.swcodec",
- "com.android.bluetooth",
+ "com.android.btservices",
],
min_sdk_version: "29",
}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a28f441c..9d824cdc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -366,11 +366,23 @@ if(NOT OPUS_ENABLE_FLOAT_API)
endif()
if(NOT OPUS_DISABLE_INTRINSICS)
- if((OPUS_X86_MAY_HAVE_SSE AND NOT OPUS_X86_PRESUME_SSE) OR
+ if(((OPUS_X86_MAY_HAVE_SSE AND NOT OPUS_X86_PRESUME_SSE) OR
(OPUS_X86_MAY_HAVE_SSE2 AND NOT OPUS_X86_PRESUME_SSE2) OR
(OPUS_X86_MAY_HAVE_SSE4_1 AND NOT OPUS_X86_PRESUME_SSE4_1) OR
- (OPUS_X86_MAY_HAVE_AVX AND NOT OPUS_X86_PRESUME_AVX))
+ (OPUS_X86_MAY_HAVE_AVX AND NOT OPUS_X86_PRESUME_AVX)) AND
+ RUNTIME_CPU_CAPABILITY_DETECTION)
target_compile_definitions(opus PRIVATE OPUS_HAVE_RTCD)
+ if(NOT MSVC) # non-MSVC builds must pick one of the two probed CPU_INFO methods
+ if(CPU_INFO_BY_ASM_SUPPORTED)
+ target_compile_definitions(opus PRIVATE CPU_INFO_BY_ASM)
+ elseif(CPU_INFO_BY_C_SUPPORTED)
+ target_compile_definitions(opus PRIVATE CPU_INFO_BY_C)
+ else() # neither probe succeeded: stop configuring instead of continuing silently
+ message(FATAL_ERROR "Runtime cpu capability detection is enabled while CPU_INFO is not supported") # bare ERROR is not a message() mode and would only be printed as text
+ endif()
+ endif()
+ add_sources_group(opus celt ${celt_sources_x86_rtcd})
+ add_sources_group(opus silk ${silk_sources_x86_rtcd})
endif()
if(SSE1_SUPPORTED)
@@ -455,15 +467,13 @@ if(NOT OPUS_DISABLE_INTRINSICS)
endif()
endif()
- if(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm|aarch64)")
- add_sources_group(opus celt ${celt_sources_arm})
- endif()
-
if(COMPILER_SUPPORT_NEON)
if(OPUS_MAY_HAVE_NEON)
if(RUNTIME_CPU_CAPABILITY_DETECTION)
message(STATUS "OPUS_MAY_HAVE_NEON enabling runtime detection")
target_compile_definitions(opus PRIVATE OPUS_HAVE_RTCD)
+ add_sources_group(opus celt ${celt_sources_arm_rtcd})
+ add_sources_group(opus silk ${silk_sources_arm_rtcd})
else()
message(ERROR "Runtime cpu capability detection needed for MAY_HAVE_NEON")
endif()
@@ -565,6 +575,7 @@ if(OPUS_BUILD_PROGRAMS)
target_include_directories(opus_custom_demo
PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
target_link_libraries(opus_custom_demo PRIVATE opus)
+ target_compile_definitions(opus_custom_demo PRIVATE OPUS_BUILD)
endif()
add_executable(opus_demo ${opus_demo_sources})
@@ -572,14 +583,19 @@ if(OPUS_BUILD_PROGRAMS)
target_include_directories(opus_demo PRIVATE silk) # debug.h
target_include_directories(opus_demo PRIVATE celt) # arch.h
target_link_libraries(opus_demo PRIVATE opus ${OPUS_REQUIRED_LIBRARIES})
+ target_compile_definitions(opus_demo PRIVATE OPUS_BUILD)
# compare
add_executable(opus_compare ${opus_compare_sources})
target_include_directories(opus_compare PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
target_link_libraries(opus_compare PRIVATE opus ${OPUS_REQUIRED_LIBRARIES})
+ if(MSVC)
+ # move cosmetic warning to level 4 for opus_compare
+ target_compile_options(opus_compare PRIVATE /w44244)
+ endif()
endif()
-if(BUILD_TESTING)
+if(BUILD_TESTING AND NOT BUILD_SHARED_LIBS)
enable_testing()
# tests
@@ -587,32 +603,44 @@ if(BUILD_TESTING)
target_include_directories(test_opus_decode
PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
target_link_libraries(test_opus_decode PRIVATE opus)
+ target_compile_definitions(test_opus_decode PRIVATE OPUS_BUILD)
if(OPUS_FIXED_POINT)
target_compile_definitions(test_opus_decode PRIVATE DISABLE_FLOAT_API)
endif()
- add_test(NAME test_opus_decode COMMAND $<TARGET_FILE:test_opus_decode> WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+ add_test(NAME test_opus_decode COMMAND ${CMAKE_COMMAND}
+ -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_decode>
+ -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
+ -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake")
add_executable(test_opus_padding ${test_opus_padding_sources})
target_include_directories(test_opus_padding
PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
target_link_libraries(test_opus_padding PRIVATE opus)
- add_test(NAME test_opus_padding COMMAND $<TARGET_FILE:test_opus_padding> WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
-
- if(NOT BUILD_SHARED_LIBS)
- # disable tests that depends on private API when building shared lib
- add_executable(test_opus_api ${test_opus_api_sources})
- target_include_directories(test_opus_api
- PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt)
- target_link_libraries(test_opus_api PRIVATE opus)
- if(OPUS_FIXED_POINT)
- target_compile_definitions(test_opus_api PRIVATE DISABLE_FLOAT_API)
- endif()
- add_test(NAME test_opus_api COMMAND $<TARGET_FILE:test_opus_api> WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
-
- add_executable(test_opus_encode ${test_opus_encode_sources})
- target_include_directories(test_opus_encode
- PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt)
- target_link_libraries(test_opus_encode PRIVATE opus)
- add_test(NAME test_opus_encode COMMAND $<TARGET_FILE:test_opus_encode> WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+ add_test(NAME test_opus_padding COMMAND ${CMAKE_COMMAND}
+ -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_padding>
+ -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
+ -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake")
+
+ add_executable(test_opus_api ${test_opus_api_sources})
+ target_include_directories(test_opus_api
+ PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt)
+ target_link_libraries(test_opus_api PRIVATE opus)
+ target_compile_definitions(test_opus_api PRIVATE OPUS_BUILD)
+ if(OPUS_FIXED_POINT)
+ target_compile_definitions(test_opus_api PRIVATE DISABLE_FLOAT_API)
endif()
+ add_test(NAME test_opus_api COMMAND ${CMAKE_COMMAND}
+ -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_api>
+ -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
+ -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake")
+
+ add_executable(test_opus_encode ${test_opus_encode_sources})
+ target_include_directories(test_opus_encode
+ PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt)
+ target_link_libraries(test_opus_encode PRIVATE opus)
+ target_compile_definitions(test_opus_encode PRIVATE OPUS_BUILD)
+ add_test(NAME test_opus_encode COMMAND ${CMAKE_COMMAND}
+ -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_encode>
+ -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
+ -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake")
endif()
diff --git a/METADATA b/METADATA
index cb40ec68..616c521c 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,7 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update libopus
+# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md
+
name: "libopus"
description: "Android fork of the opus library."
third_party {
@@ -5,14 +9,14 @@ third_party {
type: GIT
value: "https://gitlab.xiph.org/xiph/opus.git"
}
- version: "d633f523e36e3b6d01cc6d57386458d770d618be"
+ version: "8cf872a186b96085b1bb3a547afd598354ebeb87"
license_type: NOTICE
- last_upgrade_date {
- year: 2021
- month: 2
- day: 5
- }
security {
tag: "NVD-CPE2.3:cpe:/a:opus-codec:opus:1.0.3"
}
+ last_upgrade_date {
+ year: 2023
+ month: 1
+ day: 18
+ }
}
diff --git a/Makefile.am b/Makefile.am
index 83beaa3f..492fc09d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -36,6 +36,11 @@ else
OPUS_SOURCES += $(OPUS_SOURCES_FLOAT)
endif
+if CPU_X86
+if HAVE_RTCD
+CELT_SOURCES += $(CELT_SOURCES_X86_RTCD)
+SILK_SOURCES += $(SILK_SOURCES_X86_RTCD)
+endif
if HAVE_SSE
CELT_SOURCES += $(CELT_SOURCES_SSE)
endif
@@ -45,10 +50,13 @@ endif
if HAVE_SSE4_1
CELT_SOURCES += $(CELT_SOURCES_SSE4_1)
endif
+endif
if CPU_ARM
-CELT_SOURCES += $(CELT_SOURCES_ARM)
-SILK_SOURCES += $(SILK_SOURCES_ARM)
+if HAVE_RTCD
+CELT_SOURCES += $(CELT_SOURCES_ARM_RTCD)
+SILK_SOURCES += $(SILK_SOURCES_ARM_RTCD)
+endif
if HAVE_ARM_NEON_INTR
CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR)
@@ -222,8 +230,11 @@ EXTRA_DIST = opus.pc.in \
cmake/OpusFunctions.cmake \
cmake/OpusPackageVersion.cmake \
cmake/OpusSources.cmake \
+ cmake/RunTest.cmake \
cmake/config.h.cmake.in \
cmake/vla.c \
+ cmake/cpu_info_by_asm.c \
+ cmake/cpu_info_by_c.c \
meson/get-version.py \
meson/read-sources-list.py \
meson.build \
diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c
index cce3ae3a..c7d16e6d 100644
--- a/celt/arm/armcpu.c
+++ b/celt/arm/armcpu.c
@@ -156,7 +156,7 @@ opus_uint32 opus_cpu_capabilities(void)
"your platform. Reconfigure with --disable-rtcd (or send patches)."
#endif
-int opus_select_arch(void)
+static int opus_select_arch_impl(void)
{
opus_uint32 flags = opus_cpu_capabilities();
int arch = 0;
@@ -184,4 +184,11 @@ int opus_select_arch(void)
return arch;
}
+int opus_select_arch(void) {
+ int arch = opus_select_arch_impl();
+#ifdef FUZZING
+ arch = rand()%(arch+1);
+#endif
+ return arch;
+}
#endif
diff --git a/celt/arm/pitch_neon_intr.c b/celt/arm/pitch_neon_intr.c
index 1ac38c43..35cc46e2 100644
--- a/celt/arm/pitch_neon_intr.c
+++ b/celt/arm/pitch_neon_intr.c
@@ -137,22 +137,27 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */
/* operations of celt_inner_prod_neon(), and both functions should have bit */
/* exact output. */
-static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
+static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N)
{
int i;
opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
+ *err = 0; /* kept after all declarations (C89); sums |partial results| for the tolerance below */
for (i = 0; i < N - 3; i += 4) {
xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
+ *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3);
}
xy0 += xy2;
xy1 += xy3;
xy = xy0 + xy1;
+ *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy);
for (; i < N; i++) {
xy = MAC16_16(xy, x[i], y[i]);
+ *err += ABS32(xy);
}
+ *err = *err*2e-7 + N*1e-37; /* tolerance returned via err: scaled magnitude sum plus a small per-sample floor */
return xy;
}
@@ -160,32 +165,10 @@ static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, c
/* operations of dual_inner_prod_neon(), and both functions should have bit */
/* exact output. */
static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
- int N, opus_val32 *xy1, opus_val32 *xy2)
+ int N, opus_val32 *xy1, opus_val32 *xy2, float *err)
{
- int i;
- opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
- for (i = 0; i < N - 3; i += 4) {
- xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
- xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
- xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
- xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
- xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
- xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
- xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
- xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
- }
- xy01_0 += xy01_2;
- xy02_0 += xy02_2;
- xy01_1 += xy01_3;
- xy02_1 += xy02_3;
- xy01 = xy01_0 + xy01_1;
- xy02 = xy02_0 + xy02_1;
- for (; i < N; i++) {
- xy01 = MAC16_16(xy01, x[i], y01[i]);
- xy02 = MAC16_16(xy02, x[i], y02[i]);
- }
- *xy1 = xy01;
- *xy2 = xy02;
+ *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N);
+ *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N);
}
#endif /* OPUS_CHECK_ASM */
@@ -225,7 +208,12 @@ opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
}
#ifdef OPUS_CHECK_ASM
- celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
+ {
+ float err, res;
+ res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N);
+ /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/
+ celt_assert(ABS32(res - xy) <= err);
+ }
#endif
return xy;
@@ -280,9 +268,12 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
#ifdef OPUS_CHECK_ASM
{
opus_val32 xy1_c, xy2_c;
- dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
- celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
- celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
+ float err[2];
+ dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err);
+ /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]);
+ if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/
+ celt_assert(ABS32(xy1_c - *xy1) <= err[0]);
+ celt_assert(ABS32(xy2_c - *xy2) <= err[1]);
}
#endif
}
diff --git a/celt/bands.c b/celt/bands.c
index 2702963c..5320ffab 100644
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -901,7 +901,7 @@ static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx,
sctx->itheta = itheta;
sctx->qalloc = qalloc;
}
-static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, int b,
+static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
celt_norm *lowband_out)
{
int c;
@@ -926,7 +926,6 @@ static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
sign = ec_dec_bits(ec, 1);
}
ctx->remaining_bits -= 1<<BITRES;
- b-=1<<BITRES;
}
if (ctx->resynth)
x[0] = sign ? -NORM_SCALING : NORM_SCALING;
@@ -1134,7 +1133,7 @@ static unsigned quant_band(struct band_ctx *ctx, celt_norm *X,
/* Special case for one sample */
if (N==1)
{
- return quant_band_n1(ctx, X, NULL, b, lowband_out);
+ return quant_band_n1(ctx, X, NULL, lowband_out);
}
if (tf_change>0)
@@ -1256,7 +1255,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm
/* Special case for one sample */
if (N==1)
{
- return quant_band_n1(ctx, X, Y, b, lowband_out);
+ return quant_band_n1(ctx, X, Y, lowband_out);
}
orig_fill = fill;
@@ -1381,6 +1380,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm
return cm;
}
+#ifndef DISABLE_UPDATE_DRAFT
static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm *norm2, int start, int M, int dual_stereo)
{
int n1, n2;
@@ -1393,6 +1393,7 @@ static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm
if (dual_stereo)
OPUS_COPY(&norm2[n1], &norm2[2*n1 - n2], n2-n1);
}
+#endif
void quant_all_bands(int encode, const CELTMode *m, int start, int end,
celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks,
diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index 74ca3b74..883dae15 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -90,7 +90,7 @@ struct OpusCustomDecoder {
opus_uint32 rng;
int error;
int last_pitch_index;
- int loss_count;
+ int loss_duration;
int skip_plc;
int postfilter_period;
int postfilter_period_old;
@@ -512,7 +512,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
int nbEBands;
int overlap;
int start;
- int loss_count;
+ int loss_duration;
int noise_based;
const opus_int16 *eBands;
SAVE_STACK;
@@ -532,9 +532,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
oldLogE2 = oldLogE + 2*nbEBands;
backgroundLogE = oldLogE2 + 2*nbEBands;
- loss_count = st->loss_count;
+ loss_duration = st->loss_duration;
start = st->start;
- noise_based = loss_count >= 5 || start != 0 || st->skip_plc;
+ noise_based = loss_duration >= 40 || start != 0 || st->skip_plc;
if (noise_based)
{
/* Noise-based PLC/CNG */
@@ -557,9 +557,13 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
#else
ALLOC(X, C*N, celt_norm); /**< Interleaved normalised MDCTs */
#endif
+ c=0; do {
+ OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
+ DECODE_BUFFER_SIZE-N+(overlap>>1));
+ } while (++c<C);
/* Energy decay */
- decay = loss_count==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
+ decay = loss_duration==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
c=0; do
{
for (i=start;i<end;i++)
@@ -585,11 +589,6 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
}
st->rng = seed;
- c=0; do {
- OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
- DECODE_BUFFER_SIZE-N+(overlap>>1));
- } while (++c<C);
-
celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch);
} else {
int exc_length;
@@ -602,7 +601,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
VARDECL(opus_val16, _exc);
VARDECL(opus_val16, fir_tmp);
- if (loss_count == 0)
+ if (loss_duration == 0)
{
st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch);
} else {
@@ -630,9 +629,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
buf = decode_mem[c];
for (i=0;i<MAX_PERIOD+LPC_ORDER;i++)
- exc[i-LPC_ORDER] = ROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT);
+ exc[i-LPC_ORDER] = SROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT);
- if (loss_count == 0)
+ if (loss_duration == 0)
{
opus_val32 ac[LPC_ORDER+1];
/* Compute LPC coefficients for the last MAX_PERIOD samples before
@@ -732,7 +731,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
exc[extrapolation_offset+j])), SIG_SHIFT);
/* Compute the energy of the previously decoded signal whose
excitation we're copying. */
- tmp = ROUND16(
+ tmp = SROUND16(
buf[DECODE_BUFFER_SIZE-MAX_PERIOD-N+extrapolation_offset+j],
SIG_SHIFT);
S1 += SHR32(MULT16_16(tmp, tmp), 10);
@@ -742,7 +741,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
/* Copy the last decoded samples (prior to the overlap region) to
synthesis filter memory so we can have a continuous signal. */
for (i=0;i<LPC_ORDER;i++)
- lpc_mem[i] = ROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT);
+ lpc_mem[i] = SROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT);
/* Apply the synthesis filter to convert the excitation back into
the signal domain. */
celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER,
@@ -761,7 +760,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
opus_val32 S2=0;
for (i=0;i<extrapolation_len;i++)
{
- opus_val16 tmp = ROUND16(buf[DECODE_BUFFER_SIZE-N+i], SIG_SHIFT);
+ opus_val16 tmp = SROUND16(buf[DECODE_BUFFER_SIZE-N+i], SIG_SHIFT);
S2 += SHR32(MULT16_16(tmp, tmp), 10);
}
/* This checks for an "explosion" in the synthesis. */
@@ -812,7 +811,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
} while (++c<C);
}
- st->loss_count = loss_count+1;
+ /* Saturate to something large to avoid wrap-around. */
+ st->loss_duration = IMIN(10000, loss_duration+(1<<LM));
RESTORE_STACK;
}
@@ -868,6 +868,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
int nbEBands;
int overlap;
const opus_int16 *eBands;
+ opus_val16 max_background_increase;
ALLOC_STACK;
VALIDATE_CELT_DECODER(st);
@@ -942,7 +943,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
/* Check if there are at least two packets received consecutively before
* turning on the pitch-based PLC */
- st->skip_plc = st->loss_count != 0;
+ st->skip_plc = st->loss_duration != 0;
if (dec == NULL)
{
@@ -1140,25 +1141,21 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
if (C==1)
OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands);
- /* In case start or end were to change */
if (!isTransient)
{
- opus_val16 max_background_increase;
OPUS_COPY(oldLogE2, oldLogE, 2*nbEBands);
OPUS_COPY(oldLogE, oldBandE, 2*nbEBands);
- /* In normal circumstances, we only allow the noise floor to increase by
- up to 2.4 dB/second, but when we're in DTX, we allow up to 6 dB
- increase for each update.*/
- if (st->loss_count < 10)
- max_background_increase = M*QCONST16(0.001f,DB_SHIFT);
- else
- max_background_increase = QCONST16(1.f,DB_SHIFT);
- for (i=0;i<2*nbEBands;i++)
- backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]);
} else {
for (i=0;i<2*nbEBands;i++)
oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]);
}
+ /* In normal circumstances, we only allow the noise floor to increase by
+ up to 2.4 dB/second, but when we're in DTX we give the weight of
+ all missing packets to the update packet. */
+ max_background_increase = IMIN(160, st->loss_duration+M)*QCONST16(0.001f,DB_SHIFT);
+ for (i=0;i<2*nbEBands;i++)
+ backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]);
+ /* In case start or end were to change */
c=0; do
{
for (i=0;i<start;i++)
@@ -1175,7 +1172,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
st->rng = dec->rng;
deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
- st->loss_count = 0;
+ st->loss_duration = 0;
RESTORE_STACK;
if (ec_tell(dec) > 8*len)
return OPUS_INTERNAL_ERROR;
diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index d6f8afc2..637d442c 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -1719,8 +1719,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch);
compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
amp2Log2(mode, effEnd, end, bandE, bandLogE2, C);
- for (i=0;i<C*nbEBands;i++)
- bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+ for (c=0;c<C;c++)
+ {
+ for (i=0;i<end;i++)
+ bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT));
+ }
}
compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch);
@@ -1856,8 +1859,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
amp2Log2(mode, effEnd, end, bandE, bandLogE, C);
/* Compensate for the scaling of short vs long mdcts */
- for (i=0;i<C*nbEBands;i++)
- bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+ for (c=0;c<C;c++)
+ {
+ for (i=0;i<end;i++)
+ bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT));
+ }
tf_estimate = QCONST16(.2f,14);
}
}
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index 8ecb693e..f91721bc 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -50,17 +50,21 @@ int p
#endif
OPUS_CLEAR(lpc, p);
+#ifdef FIXED_POINT
if (ac[0] != 0)
+#else
+ if (ac[0] > 1e-10f)
+#endif
{
for (i = 0; i < p; i++) {
/* Sum up this iteration's reflection coefficient */
opus_val32 rr = 0;
for (j = 0; j < i; j++)
rr += MULT32_32_Q31(lpc[j],ac[i - j]);
- rr += SHR32(ac[i + 1],3);
- r = -frac_div32(SHL32(rr,3), error);
+ rr += SHR32(ac[i + 1],6);
+ r = -frac_div32(SHL32(rr,6), error);
/* Update LPC coefficients and total error */
- lpc[i] = SHR32(r,3);
+ lpc[i] = SHR32(r,6);
for (j = 0; j < (i+1)>>1; j++)
{
opus_val32 tmp1, tmp2;
@@ -73,17 +77,61 @@ int p
error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error);
/* Bail out once we get 30 dB gain */
#ifdef FIXED_POINT
- if (error<SHR32(ac[0],10))
+ if (error<=SHR32(ac[0],10))
break;
#else
- if (error<.001f*ac[0])
+ if (error<=.001f*ac[0])
break;
#endif
}
}
#ifdef FIXED_POINT
- for (i=0;i<p;i++)
- _lpc[i] = ROUND16(lpc[i],16);
+ {
+ /* Convert the int32 lpcs to int16 and ensure there are no wrap-arounds.
+ This reuses the logic in silk_LPC_fit() and silk_bwexpander_32(). Any bug
+ fixes should also be applied there. */
+ int iter, idx = 0;
+ opus_val32 maxabs, absval, chirp_Q16, chirp_minus_one_Q16;
+
+ for (iter = 0; iter < 10; iter++) {
+ maxabs = 0;
+ for (i = 0; i < p; i++) {
+ absval = ABS32(lpc[i]);
+ if (absval > maxabs) {
+ maxabs = absval;
+ idx = i;
+ }
+ }
+ maxabs = PSHR32(maxabs, 13); /* Q25->Q12 */
+
+ if (maxabs > 32767) {
+ maxabs = MIN32(maxabs, 163838);
+ chirp_Q16 = QCONST32(0.999, 16) - DIV32(SHL32(maxabs - 32767, 14),
+ SHR32(MULT32_32_32(maxabs, idx + 1), 2));
+ chirp_minus_one_Q16 = chirp_Q16 - 65536;
+
+ /* Apply bandwidth expansion. */
+ for (i = 0; i < p - 1; i++) {
+ lpc[i] = MULT32_32_Q16(chirp_Q16, lpc[i]);
+ chirp_Q16 += PSHR32(MULT32_32_32(chirp_Q16, chirp_minus_one_Q16), 16);
+ }
+ lpc[p - 1] = MULT32_32_Q16(chirp_Q16, lpc[p - 1]);
+ } else {
+ break;
+ }
+ }
+
+ if (iter == 10) {
+ /* If the coeffs still do not fit into the 16 bit range after 10 iterations,
+ fall back to the A(z)=1 filter. */
+ OPUS_CLEAR(lpc, p);
+ _lpc[0] = 4096; /* Q12 */
+ } else {
+ for (i = 0; i < p; i++) {
+ _lpc[i] = EXTRACT16(PSHR32(lpc[i], 13)); /* Q25->Q12 */
+ }
+ }
+ }
#endif
}
@@ -111,17 +159,17 @@ void celt_fir_c(
sum[2] = SHL32(EXTEND32(x[i+2]), SIG_SHIFT);
sum[3] = SHL32(EXTEND32(x[i+3]), SIG_SHIFT);
xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
- y[i ] = ROUND16(sum[0], SIG_SHIFT);
- y[i+1] = ROUND16(sum[1], SIG_SHIFT);
- y[i+2] = ROUND16(sum[2], SIG_SHIFT);
- y[i+3] = ROUND16(sum[3], SIG_SHIFT);
+ y[i ] = SROUND16(sum[0], SIG_SHIFT);
+ y[i+1] = SROUND16(sum[1], SIG_SHIFT);
+ y[i+2] = SROUND16(sum[2], SIG_SHIFT);
+ y[i+3] = SROUND16(sum[3], SIG_SHIFT);
}
for (;i<N;i++)
{
opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
for (j=0;j<ord;j++)
sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
- y[i] = ROUND16(sum, SIG_SHIFT);
+ y[i] = SROUND16(sum, SIG_SHIFT);
}
RESTORE_STACK;
}
diff --git a/celt/cpu_support.h b/celt/cpu_support.h
index 68fc6067..7b5c56ca 100644
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h
@@ -43,10 +43,11 @@
*/
#define OPUS_ARCHMASK 3
-#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+#elif defined(OPUS_HAVE_RTCD) && \
+ ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
(defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
(defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
- (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
+ (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
#include "x86/x86cpu.h"
/* We currently support 5 x86 variants:
diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h
index f4352952..ef2e5d02 100644
--- a/celt/fixed_debug.h
+++ b/celt/fixed_debug.h
@@ -167,7 +167,7 @@ static OPUS_INLINE short SHR16_(int a, int shift, char *file, int line)
#define SHL16(a, shift) SHL16_(a, shift, __FILE__, __LINE__)
static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line)
{
- int res;
+ opus_int32 res;
if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift))
{
fprintf (stderr, "SHL16: inputs are not short: %d %d in %s: line %d\n", a, shift, file, line);
@@ -175,7 +175,7 @@ static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line)
celt_assert(0);
#endif
}
- res = a<<shift;
+ res = (opus_int32)((opus_uint32)a<<shift);
if (!VERIFY_SHORT(res))
{
fprintf (stderr, "SHL16: output is not short: %d in %s: line %d\n", res, file, line);
@@ -214,15 +214,15 @@ static OPUS_INLINE int SHL32_(opus_int64 a, int shift, char *file, int line)
opus_int64 res;
if (!VERIFY_INT(a) || !VERIFY_SHORT(shift))
{
- fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", a, shift, file, line);
+ fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", (long long)a, shift, file, line);
#ifdef FIXED_DEBUG_ASSERT
celt_assert(0);
#endif
}
- res = a<<shift;
+ res = (opus_int64)((opus_uint64)a<<shift);
if (!VERIFY_INT(res))
{
- fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", a, shift, res, file, line);
+ fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", (long long)a, shift, (long long)res, file, line);
#ifdef FIXED_DEBUG_ASSERT
celt_assert(0);
#endif
@@ -339,7 +339,7 @@ static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file
opus_uint64 res;
if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
{
- fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line);
+ fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
#ifdef FIXED_DEBUG_ASSERT
celt_assert(0);
#endif
@@ -347,7 +347,7 @@ static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file
res = a+b;
if (!VERIFY_UINT(res))
{
- fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", res, file, line);
+ fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", (unsigned long long)res, file, line);
#ifdef FIXED_DEBUG_ASSERT
celt_assert(0);
#endif
@@ -363,14 +363,14 @@ static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file
opus_uint64 res;
if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
{
- fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line);
+ fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
#ifdef FIXED_DEBUG_ASSERT
celt_assert(0);
#endif
}
if (a<b)
{
- fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", a, b, file, line);
+ fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
#ifdef FIXED_DEBUG_ASSERT
celt_assert(0);
#endif
@@ -378,7 +378,7 @@ static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file
res = a-b;
if (!VERIFY_UINT(res))
{
- fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", a, b, res, file, line);
+ fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, (unsigned long long)res, file, line);
#ifdef FIXED_DEBUG_ASSERT
celt_assert(0);
#endif
@@ -410,6 +410,51 @@ static OPUS_INLINE short MULT16_16_16(int a, int b)
return res;
}
+/* Debug 32x32 multiply whose result is expected to fit in 32 bits.
+   Both inputs and the product are checked with VERIFY_INT; a failed check
+   prints a diagnostic to stderr and, when FIXED_DEBUG_ASSERT is defined,
+   also triggers celt_assert(0). Execution continues after the diagnostic
+   unless the assert stops it. Every call adds 5 to the celt_mips counter. */
+static OPUS_INLINE int MULT32_32_32(opus_int64 a, opus_int64 b)
+{
+ opus_int64 res;
+ if (!VERIFY_INT(a) || !VERIFY_INT(b))
+ {
+ fprintf (stderr, "MULT32_32_32: inputs are not int: %lld %lld\n", (long long)a, (long long)b);
+#ifdef FIXED_DEBUG_ASSERT
+ celt_assert(0);
+#endif
+ }
+ res = a*b; /* product is formed in opus_int64 so it can be range-checked below */
+ if (!VERIFY_INT(res))
+ {
+ fprintf (stderr, "MULT32_32_32: output is not int: %lld\n", (long long)res);
+#ifdef FIXED_DEBUG_ASSERT
+ celt_assert(0);
+#endif
+ }
+ celt_mips+=5; /* complexity accounting; celt_mips is what PRINT_MIPS reports */
+ return res; /* opus_int64 value converted to the int return type */
+}
+/* Debug 32x32 multiply followed by a 16-bit right shift; the shifted result
+   is expected to fit in 32 bits. Inputs and result are checked with
+   VERIFY_INT; a failed check prints a diagnostic to stderr and, when
+   FIXED_DEBUG_ASSERT is defined, also triggers celt_assert(0).
+   Every call adds 5 to the celt_mips counter. */
+static OPUS_INLINE int MULT32_32_Q16(opus_int64 a, opus_int64 b)
+{
+ opus_int64 res;
+ if (!VERIFY_INT(a) || !VERIFY_INT(b))
+ {
+ fprintf (stderr, "MULT32_32_Q16: inputs are not int: %lld %lld\n", (long long)a, (long long)b);
+#ifdef FIXED_DEBUG_ASSERT
+ celt_assert(0);
+#endif
+ }
+ res = ((opus_int64)(a)*(opus_int64)(b)) >> 16; /* 64-bit product, then drop the low 16 bits */
+ if (!VERIFY_INT(res))
+ {
+ /* Note: the value printed after '=' is the shifted result, not the raw product. */
+ fprintf (stderr, "MULT32_32_Q16: output is not int: %lld*%lld=%lld\n", (long long)a, (long long)b, (long long)res);
+#ifdef FIXED_DEBUG_ASSERT
+ celt_assert(0);
+#endif
+ }
+ celt_mips+=5; /* complexity accounting; celt_mips is what PRINT_MIPS reports */
+ return res; /* opus_int64 value converted to the int return type */
+}
+
#define MULT16_16(a, b) MULT16_16_(a, b, __FILE__, __LINE__)
static OPUS_INLINE int MULT16_16_(int a, int b, char *file, int line)
{
@@ -446,7 +491,7 @@ static OPUS_INLINE int MULT16_32_QX_(int a, opus_int64 b, int Q, char *file, int
celt_assert(0);
#endif
}
- if (ABS32(b)>=((opus_val32)(1)<<(15+Q)))
+ if (ABS32(b)>=((opus_int64)(1)<<(16+Q)))
{
fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n", Q, (int)a, (int)b, file, line);
#ifdef FIXED_DEBUG_ASSERT
@@ -479,7 +524,7 @@ static OPUS_INLINE int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int
celt_assert(0);
#endif
}
- if (ABS32(b)>=((opus_int64)(1)<<(15+Q)))
+ if (ABS32(b)>=((opus_int64)(1)<<(16+Q)))
{
fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n\n", Q, (int)a, (int)b,file, line);
#ifdef FIXED_DEBUG_ASSERT
@@ -786,6 +831,6 @@ static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x)
#undef PRINT_MIPS
-#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", celt_mips);} while (0);
+#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", (unsigned long long)celt_mips);} while (0);
#endif
diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h
index 0ecbb899..8f29d46b 100644
--- a/celt/fixed_generic.h
+++ b/celt/fixed_generic.h
@@ -57,6 +57,13 @@
#define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
#endif
+/** 32x32 multiplication, followed by a 16-bit shift right. Result fits in 32 bits */
+#if OPUS_FAST_INT64
+#define MULT32_32_Q16(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),16))
+#else
+#define MULT32_32_Q16(a,b) (ADD32(ADD32(ADD32((opus_val32)(SHR32(((opus_uint32)((a)&0x0000ffff)*(opus_uint32)((b)&0x0000ffff)),16)), MULT16_16SU(SHR32(a,16),((b)&0x0000ffff))), MULT16_16SU(SHR32(b,16),((a)&0x0000ffff))), SHL32(MULT16_16(SHR32(a,16),SHR32(b,16)),16)))
+#endif
+
/** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
#if OPUS_FAST_INT64
#define MULT32_32_Q31(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),31))
@@ -131,6 +138,9 @@
/** 16x16 multiplication where the result fits in 16 bits */
#define MULT16_16_16(a,b) ((((opus_val16)(a))*((opus_val16)(b))))
+/** 32x32 multiplication where the result fits in 32 bits */
+#define MULT32_32_32(a,b) ((((opus_val32)(a))*((opus_val32)(b))))
+
/* (opus_val32)(opus_val16) gives TI compiler a hint that it's 16x16->32 multiply */
/** 16x16 multiplication where the result fits in 32 bits */
#define MULT16_16(a,b) (((opus_val32)(opus_val16)(a))*((opus_val32)(opus_val16)(b)))
diff --git a/celt/float_cast.h b/celt/float_cast.h
index 9d34976e..8915a5fd 100644
--- a/celt/float_cast.h
+++ b/celt/float_cast.h
@@ -99,7 +99,7 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
return intgr ;
}
-#elif defined(HAVE_LRINTF)
+#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
/* These defines enable functionality introduced with the 1999 ISO C
** standard. They must be defined before the inclusion of math.h to
@@ -117,7 +117,7 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
#include <math.h>
#define float2int(x) lrintf(x)
-#elif (defined(HAVE_LRINT))
+#elif defined(HAVE_LRINT) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#define _ISOC9X_SOURCE 1
#define _ISOC99_SOURCE 1
diff --git a/celt/kiss_fft.h b/celt/kiss_fft.h
index bffa2bfa..267f72f9 100644
--- a/celt/kiss_fft.h
+++ b/celt/kiss_fft.h
@@ -52,6 +52,10 @@ extern "C" {
# define kiss_fft_scalar opus_int32
# define kiss_twiddle_scalar opus_int16
+/* Some 32-bit CPUs would load/store a kiss_twiddle_cpx with a single memory
+ * access, and could benefit from additional alignment.
+ */
+# define KISS_TWIDDLE_CPX_ALIGNMENT (sizeof(opus_int32))
#else
# ifndef kiss_fft_scalar
@@ -62,6 +66,12 @@ extern "C" {
# endif
#endif
+#if defined(__GNUC__) && defined(KISS_TWIDDLE_CPX_ALIGNMENT)
+#define KISS_TWIDDLE_CPX_ALIGNED __attribute__((aligned(KISS_TWIDDLE_CPX_ALIGNMENT)))
+#else
+#define KISS_TWIDDLE_CPX_ALIGNED
+#endif
+
typedef struct {
kiss_fft_scalar r;
kiss_fft_scalar i;
@@ -70,7 +80,7 @@ typedef struct {
typedef struct {
kiss_twiddle_scalar r;
kiss_twiddle_scalar i;
-}kiss_twiddle_cpx;
+} KISS_TWIDDLE_CPX_ALIGNED kiss_twiddle_cpx;
#define MAXFACTORS 8
/* e.g. an fft of length 128 has 4 factors
diff --git a/celt/mathops.h b/celt/mathops.h
index fe29dac1..478ac918 100644
--- a/celt/mathops.h
+++ b/celt/mathops.h
@@ -153,7 +153,7 @@ static OPUS_INLINE float celt_exp2(float x)
float f;
opus_uint32 i;
} res;
- integer = floor(x);
+ integer = (int)floor(x);
if (integer < -50)
return 0;
frac = x-integer;
diff --git a/celt/meson.build b/celt/meson.build
index 370ea1fe..ad95d949 100644
--- a/celt/meson.build
+++ b/celt/meson.build
@@ -10,6 +10,10 @@ celt_neon_intr_sources = sources['CELT_SOURCES_ARM_NEON_INTR']
celt_static_libs = []
+if host_cpu_family in ['x86', 'x86_64'] and opus_conf.has('OPUS_HAVE_RTCD')
+ celt_sources += sources['CELT_SOURCES_X86_RTCD']
+endif
+
foreach intr_name : ['sse', 'sse2', 'sse4_1', 'neon_intr']
have_intr = get_variable('have_' + intr_name)
if not have_intr
@@ -30,7 +34,9 @@ if (intrinsics_support.length() + asm_optimization.length() + inline_optimizatio
endif
if host_cpu_family in ['arm', 'aarch64'] and have_arm_intrinsics_or_asm
- celt_sources += sources['CELT_SOURCES_ARM']
+ if opus_conf.has('OPUS_HAVE_RTCD')
+ celt_sources += sources['CELT_SOURCES_ARM_RTCD']
+ endif
if have_arm_ne10
celt_sources += sources['CELT_SOURCES_ARM_NE10']
endif
diff --git a/celt/modes.c b/celt/modes.c
index 390c5e8a..23f7cde6 100644
--- a/celt/modes.c
+++ b/celt/modes.c
@@ -173,7 +173,10 @@ static void compute_allocation_table(CELTMode *mode)
mode->nbAllocVectors = BITALLOC_SIZE;
allocVectors = opus_alloc(sizeof(unsigned char)*(BITALLOC_SIZE*mode->nbEBands));
if (allocVectors==NULL)
+ {
+ mode->allocVectors = NULL;
return;
+ }
/* Check for standard mode */
if (mode->Fs == 400*(opus_int32)mode->shortMdctSize)
diff --git a/celt/pitch.c b/celt/pitch.c
index 872582a4..7998db41 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -161,17 +161,26 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x
shift=0;
if (C==2)
shift++;
-#endif
for (i=1;i<len>>1;i++)
- x_lp[i] = SHR32(HALF32(HALF32(x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]), shift);
- x_lp[0] = SHR32(HALF32(HALF32(x[0][1])+x[0][0]), shift);
+ x_lp[i] = SHR32(x[0][(2*i-1)], shift+2) + SHR32(x[0][(2*i+1)], shift+2) + SHR32(x[0][2*i], shift+1);
+ x_lp[0] = SHR32(x[0][1], shift+2) + SHR32(x[0][0], shift+1);
if (C==2)
{
for (i=1;i<len>>1;i++)
- x_lp[i] += SHR32(HALF32(HALF32(x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]), shift);
- x_lp[0] += SHR32(HALF32(HALF32(x[1][1])+x[1][0]), shift);
+ x_lp[i] += SHR32(x[1][(2*i-1)], shift+2) + SHR32(x[1][(2*i+1)], shift+2) + SHR32(x[1][2*i], shift+1);
+ x_lp[0] += SHR32(x[1][1], shift+2) + SHR32(x[1][0], shift+1);
}
-
+#else
+ for (i=1;i<len>>1;i++)
+ x_lp[i] = .25f*x[0][(2*i-1)] + .25f*x[0][(2*i+1)] + .5f*x[0][2*i];
+ x_lp[0] = .25f*x[0][1] + .5f*x[0][0];
+ if (C==2)
+ {
+ for (i=1;i<len>>1;i++)
+ x_lp[i] += .25f*x[1][(2*i-1)] + .25f*x[1][(2*i+1)] + .5f*x[1][2*i];
+ x_lp[0] += .25f*x[1][1] + .5f*x[1][0];
+ }
+#endif
_celt_autocorr(x_lp, ac, NULL, 0,
4, len>>1, arch);
@@ -249,7 +258,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 maxcorr=1;
#endif
celt_assert(max_pitch>0);
- celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+ celt_sig_assert(((size_t)_x&3)==0);
for (i=0;i<max_pitch-3;i+=4)
{
opus_val32 sum[4]={0,0,0,0};
diff --git a/celt/rate.c b/celt/rate.c
index 465e1ba2..7f7ad3fa 100644
--- a/celt/rate.c
+++ b/celt/rate.c
@@ -356,6 +356,8 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end,
else
depth_threshold = 0;
#ifdef FUZZING
+ (void)signalBandwidth;
+ (void)depth_threshold;
if ((rand()&0x1) == 0)
#else
if (codedBands<=start+2 || (band_bits > (depth_threshold*band_width<<LM<<BITRES)>>4 && j<=signalBandwidth))
diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 70f8f493..ae9a7b56 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -144,8 +144,9 @@ void test1d(int nfft,int isinverse,int arch)
int main(int argc,char ** argv)
{
+ int arch;
ALLOC_STACK;
- int arch = opus_select_arch();
+ arch = opus_select_arch();
if (argc>1) {
int k;
diff --git a/celt/tests/test_unit_entropy.c b/celt/tests/test_unit_entropy.c
index 7f674529..b1619b74 100644
--- a/celt/tests/test_unit_entropy.c
+++ b/celt/tests/test_unit_entropy.c
@@ -104,7 +104,7 @@ int main(int _argc,char **_argv){
nbits=ec_tell_frac(&enc);
ec_enc_done(&enc);
fprintf(stderr,
- "Encoded %0.2lf bits of entropy to %0.2lf bits (%0.3lf%% wasted).\n",
+ "Encoded %0.2f bits of entropy to %0.2f bits (%0.3f%% wasted).\n",
entropy,ldexp(nbits,-3),100*(nbits-ldexp(entropy,3))/nbits);
fprintf(stderr,"Packed to %li bytes.\n",(long)ec_range_bytes(&enc));
ec_dec_init(&dec,ptr,DATA_SIZE);
@@ -129,7 +129,7 @@ int main(int _argc,char **_argv){
nbits2=ec_tell_frac(&dec);
if(nbits!=nbits2){
fprintf(stderr,
- "Reported number of bits used was %0.2lf, should be %0.2lf.\n",
+ "Reported number of bits used was %0.2f, should be %0.2f.\n",
ldexp(nbits2,-3),ldexp(nbits,-3));
ret=-1;
}
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index 4a563ccf..844c5b48 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -184,8 +184,9 @@ void test1d(int nfft,int isinverse,int arch)
int main(int argc,char ** argv)
{
+ int arch;
ALLOC_STACK;
- int arch = opus_select_arch();
+ arch = opus_select_arch();
if (argc>1) {
int k;
diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h
index 7d1ecf75..90e69ecf 100644
--- a/celt/x86/celt_lpc_sse.h
+++ b/celt/x86/celt_lpc_sse.h
@@ -33,7 +33,6 @@
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
-#define OVERRIDE_CELT_FIR
void celt_fir_sse4_1(
const opus_val16 *x,
@@ -44,10 +43,11 @@ void celt_fir_sse4_1(
int arch);
#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define OVERRIDE_CELT_FIR
#define celt_fir(x, num, y, N, ord, arch) \
((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch))
-#else
+#elif defined(OPUS_HAVE_RTCD)
extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
@@ -57,6 +57,7 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
int ord,
int arch);
+#define OVERRIDE_CELT_FIR
# define celt_fir(x, num, y, N, ord, arch) \
((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch))
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index f7a014b6..964aef50 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -63,7 +63,7 @@ void xcorr_kernel_sse(
#define xcorr_kernel(x, y, sum, len, arch) \
((void)arch, xcorr_kernel_sse(x, y, sum, len))
-#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+#elif defined(OPUS_HAVE_RTCD) && ((defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)))
extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
@@ -115,8 +115,8 @@ opus_val32 celt_inner_prod_sse(
((void)arch, celt_inner_prod_sse(x, y, N))
-#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
- (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+#elif defined(OPUS_HAVE_RTCD) && (((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
+ (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)))
extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c
index a092c68b..2bc57830 100644
--- a/celt/x86/pitch_sse4_1.c
+++ b/celt/x86/pitch_sse4_1.c
@@ -117,6 +117,14 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
__m128i sum0, sum1, sum2, sum3, vecSum;
__m128i initSum;
+#ifdef OPUS_CHECK_ASM
+ opus_val32 sum_c[4];
+ for (j=0;j<4;j++) {
+ sum_c[j] = sum[j];
+ }
+ xcorr_kernel_c(x, y, sum_c, len);
+#endif
+
celt_assert(len >= 3);
sum0 = _mm_setzero_si128();
@@ -177,19 +185,56 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
vecSum = _mm_add_epi32(vecSum, sum2);
}
- for (;j<len;j++)
+ vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]);
+ if (len - j == 3)
{
- vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
- vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+ vecX0 = _mm_shuffle_epi32(vecX, 0x55);
+ vecX1 = _mm_shuffle_epi32(vecX, 0xaa);
+ vecX2 = _mm_shuffle_epi32(vecX, 0xff);
vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+ vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+ vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
sum0 = _mm_mullo_epi32(vecX0, vecY0);
+ sum1 = _mm_mullo_epi32(vecX1, vecY1);
+ sum2 = _mm_mullo_epi32(vecX2, vecY2);
+
+ vecSum = _mm_add_epi32(vecSum, sum0);
+ vecSum = _mm_add_epi32(vecSum, sum1);
+ vecSum = _mm_add_epi32(vecSum, sum2);
+ }
+ else if (len - j == 2)
+ {
+ vecX0 = _mm_shuffle_epi32(vecX, 0xaa);
+ vecX1 = _mm_shuffle_epi32(vecX, 0xff);
+
+ vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+ vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+
+ sum0 = _mm_mullo_epi32(vecX0, vecY0);
+ sum1 = _mm_mullo_epi32(vecX1, vecY1);
+
+ vecSum = _mm_add_epi32(vecSum, sum0);
+ vecSum = _mm_add_epi32(vecSum, sum1);
+ }
+ else if (len - j == 1)
+ {
+ vecX0 = _mm_shuffle_epi32(vecX, 0xff);
+
+ vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+
+ sum0 = _mm_mullo_epi32(vecX0, vecY0);
+
vecSum = _mm_add_epi32(vecSum, sum0);
}
initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
initSum = _mm_add_epi32(initSum, vecSum);
_mm_storeu_si128((__m128i *)sum, initSum);
+
+#ifdef OPUS_CHECK_ASM
+ celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
+#endif
}
#endif
diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c
index 080eb25e..6a1914de 100644
--- a/celt/x86/x86cpu.c
+++ b/celt/x86/x86cpu.c
@@ -35,11 +35,11 @@
#include "pitch.h"
#include "x86cpu.h"
-#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+#if defined(OPUS_HAVE_RTCD) && \
+ ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
(defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
(defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
- (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
-
+ (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
#if defined(_MSC_VER)
@@ -68,7 +68,8 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
"=r" (CPUInfo[1]),
"=c" (CPUInfo[2]),
"=d" (CPUInfo[3]) :
- "0" (InfoType)
+ /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */
+ "0" (InfoType), "2" (0)
);
#else
__asm__ __volatile__ (
@@ -77,11 +78,22 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
"=b" (CPUInfo[1]),
"=c" (CPUInfo[2]),
"=d" (CPUInfo[3]) :
- "0" (InfoType)
+ /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */
+ "0" (InfoType), "2" (0)
);
#endif
#elif defined(CPU_INFO_BY_C)
- __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]));
+ /* We use __get_cpuid_count to clear ECX to avoid a valgrind false-positive
+ prior to v3.17.0.*/
+ if (!__get_cpuid_count(InfoType, 0, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]))) {
+ /* Our function cannot fail, but __get_cpuid{_count} can.
+ Returning all zeroes will effectively disable all SIMD, which is
+ what we want on CPUs that don't support CPUID. */
+ CPUInfo[3] = CPUInfo[2] = CPUInfo[1] = CPUInfo[0] = 0;
+ }
+#else
+# error "Configured to use x86 RTCD, but no CPU detection method available. " \
+ "Reconfigure with --disable-rtcd (or send patches)."
#endif
}
@@ -98,7 +110,7 @@ typedef struct CPU_Feature{
static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
{
- unsigned int info[4] = {0};
+ unsigned int info[4];
unsigned int nIds = 0;
cpuid(info, 0);
@@ -119,7 +131,7 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
}
}
-int opus_select_arch(void)
+static int opus_select_arch_impl(void)
{
CPU_Feature cpu_feature;
int arch;
@@ -154,4 +166,13 @@ int opus_select_arch(void)
return arch;
}
+int opus_select_arch(void) { /* Public entry point: returns the value computed
+   by opus_select_arch_impl(), except in FUZZING builds where that value is
+   randomly lowered first (see below). */
+ int arch = opus_select_arch_impl();
+#ifdef FUZZING
+ /* Randomly downgrade the architecture: rand()%(arch+1) yields a value between
+    0 and the detected arch inclusive (assuming arch is non-negative), so the
+    result never exceeds what opus_select_arch_impl() reported.
+    NOTE(review): presumably done so fuzzing also exercises the code paths
+    selected by lower arch values -- confirm intent. */
+ arch = rand()%(arch+1);
+#endif
+ return arch;
+}
+
#endif
diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h
index 1e2bf17b..04e80489 100644
--- a/celt/x86/x86cpu.h
+++ b/celt/x86/x86cpu.h
@@ -56,40 +56,18 @@
int opus_select_arch(void);
# endif
-/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32()
- or _mm_cvtepi16_epi32() when optimizations are disabled, even though the
- actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory
- reference, these require 16-byte alignment and load a full 16 bytes (instead
- of 4 or 8), possibly reading out of bounds.
-
- We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or
- _mm_loadl_epi64(), which should have the same semantics as an m32 or m64
- reference in the PMOVSXWD instruction itself, but gcc is not smart enough to
- optimize this out when optimizations ARE enabled.
-
- Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32
- (which is fair, since technically the compiler is always allowed to do the
- dereference before invoking the function implementing the intrinsic).
- However, it is smart enough to eliminate the extra MOVD instruction.
- For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out
- the extra MOVQ if it's specified explicitly */
-
-# if defined(__clang__) || !defined(__OPTIMIZE__)
-# define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
-# else
-# define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(*(__m128i *)(x)))
-#endif
-
-/* similar reasoning about the instruction sequence as in the 32-bit macro above,
- */
-# if defined(__clang__) || !defined(__OPTIMIZE__)
-# define OP_CVTEPI16_EPI32_M64(x) \
+/*MOVD should not impose any alignment restrictions, but the C standard does,
+ and UBSan will report errors if we actually make unaligned accesses.
+ Use this to work around those restrictions (which should hopefully all get
+ optimized to a single MOVD instruction).*/
+#define OP_LOADU_EPI32(x) \
+ (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\
+ *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U))
+
+#define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x))))
+
+#define OP_CVTEPI16_EPI32_M64(x) \
(_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
-# else
-# define OP_CVTEPI16_EPI32_M64(x) \
- (_mm_cvtepi16_epi32(*(__m128i *)(x)))
-# endif
#endif
diff --git a/celt_sources.mk b/celt_sources.mk
index c9dab06e..d6b6765b 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -18,9 +18,11 @@ celt/quant_bands.c \
celt/rate.c \
celt/vq.c
-CELT_SOURCES_SSE = \
+CELT_SOURCES_X86_RTCD = \
celt/x86/x86cpu.c \
-celt/x86/x86_celt_map.c \
+celt/x86/x86_celt_map.c
+
+CELT_SOURCES_SSE = \
celt/x86/pitch_sse.c
CELT_SOURCES_SSE2 = \
@@ -31,7 +33,7 @@ CELT_SOURCES_SSE4_1 = \
celt/x86/celt_lpc_sse4_1.c \
celt/x86/pitch_sse4_1.c
-CELT_SOURCES_ARM = \
+CELT_SOURCES_ARM_RTCD = \
celt/arm/armcpu.c \
celt/arm/arm_celt_map.c
diff --git a/cmake/OpusConfig.cmake b/cmake/OpusConfig.cmake
index 8d19a535..b82307a1 100644
--- a/cmake/OpusConfig.cmake
+++ b/cmake/OpusConfig.cmake
@@ -9,16 +9,18 @@ configure_file(cmake/config.h.cmake.in config.h @ONLY)
add_definitions(-DHAVE_CONFIG_H)
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
-set_property(GLOBAL PROPERTY C_STANDARD 99)
if(MSVC)
- add_definitions(-D_CRT_SECURE_NO_WARNINGS)
+ # For compilers that have no notion of a C standard level,
+ # such as Microsoft Visual C++ before VS 16.7,
+ # this property has no effect.
+ set(CMAKE_C_STANDARD 11)
+else()
+ set(CMAKE_C_STANDARD 99)
endif()
-include(CheckLibraryExists)
-check_library_exists(m floor "" HAVE_LIBM)
-if(HAVE_LIBM)
- list(APPEND OPUS_REQUIRED_LIBRARIES m)
+if(MSVC)
+ add_definitions(-D_CRT_SECURE_NO_WARNINGS)
endif()
include(CFeatureCheck)
@@ -35,9 +37,18 @@ else()
check_symbol_exists(alloca "stdlib.h;malloc.h" USE_ALLOCA_SUPPORTED)
endif()
-include(CheckFunctionExists)
-check_function_exists(lrintf HAVE_LRINTF)
-check_function_exists(lrint HAVE_LRINT)
+include(CMakePushCheckState)
+cmake_push_check_state(RESET)
+include(CheckLibraryExists)
+check_library_exists(m floor "" HAVE_LIBM)
+if(HAVE_LIBM)
+ list(APPEND OPUS_REQUIRED_LIBRARIES m)
+ set(CMAKE_REQUIRED_LIBRARIES m)
+endif()
+
+check_symbol_exists(lrintf "math.h" HAVE_LRINTF)
+check_symbol_exists(lrint "math.h" HAVE_LRINT)
+cmake_pop_check_state()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "(i[0-9]86|x86|X86|amd64|AMD64|x86_64)")
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
diff --git a/cmake/OpusFunctions.cmake b/cmake/OpusFunctions.cmake
index fcf3351f..3f22ad81 100644
--- a/cmake/OpusFunctions.cmake
+++ b/cmake/OpusFunctions.cmake
@@ -142,14 +142,28 @@ function(opus_detect_neon COMPILER_SUPPORT_NEON)
endfunction()
function(opus_supports_cpu_detection RUNTIME_CPU_CAPABILITY_DETECTION)
- if(MSVC)
- check_include_file(intrin.h HAVE_INTRIN_H)
- else()
- check_include_file(cpuid.h HAVE_CPUID_H)
- endif()
- if(HAVE_INTRIN_H OR HAVE_CPUID_H)
- set(RUNTIME_CPU_CAPABILITY_DETECTION 1 PARENT_SCOPE)
- elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm|aarch64)")
+ set(RUNTIME_CPU_CAPABILITY_DETECTION 0 PARENT_SCOPE)
+ if(OPUS_CPU_X86 OR OPUS_CPU_X64)
+ if(MSVC)
+ check_include_file(intrin.h HAVE_INTRIN_H)
+ if(HAVE_INTRIN_H)
+ # if intrin.h is available we assume __cpuid is there
+ set(RUNTIME_CPU_CAPABILITY_DETECTION 1 PARENT_SCOPE)
+ endif()
+ else()
+ include(CFeatureCheck)
+ c_feature_check(CPU_INFO_BY_ASM)
+ set(CPU_INFO_BY_ASM_SUPPORTED ${CPU_INFO_BY_ASM_SUPPORTED} PARENT_SCOPE)
+ check_include_file(cpuid.h HAVE_CPUID_H)
+ if(HAVE_CPUID_H)
+ c_feature_check(CPU_INFO_BY_C)
+ set(CPU_INFO_BY_C_SUPPORTED ${CPU_INFO_BY_C_SUPPORTED} PARENT_SCOPE)
+ endif()
+ if(CPU_INFO_BY_ASM_SUPPORTED OR CPU_INFO_BY_C_SUPPORTED)
+ set(RUNTIME_CPU_CAPABILITY_DETECTION 1 PARENT_SCOPE)
+ endif()
+ endif()
+ elseif(OPUS_CPU_ARM)
# ARM cpu detection is implemented for Windows and anything
# using a Linux kernel (such as Android).
if (CMAKE_SYSTEM_NAME MATCHES "(Windows|Linux|Android)")
diff --git a/cmake/OpusSources.cmake b/cmake/OpusSources.cmake
index 01e75d1a..b47f8c69 100644
--- a/cmake/OpusSources.cmake
+++ b/cmake/OpusSources.cmake
@@ -9,9 +9,11 @@ get_opus_sources(SILK_HEAD silk_headers.mk silk_headers)
get_opus_sources(SILK_SOURCES silk_sources.mk silk_sources)
get_opus_sources(SILK_SOURCES_FLOAT silk_sources.mk silk_sources_float)
get_opus_sources(SILK_SOURCES_FIXED silk_sources.mk silk_sources_fixed)
+get_opus_sources(SILK_SOURCES_X86_RTCD silk_sources.mk silk_sources_x86_rtcd)
get_opus_sources(SILK_SOURCES_SSE4_1 silk_sources.mk silk_sources_sse4_1)
get_opus_sources(SILK_SOURCES_FIXED_SSE4_1 silk_sources.mk
silk_sources_fixed_sse4_1)
+get_opus_sources(SILK_SOURCES_ARM_RTCD silk_sources.mk silk_sources_arm_rtcd)
get_opus_sources(SILK_SOURCES_ARM_NEON_INTR silk_sources.mk
silk_sources_arm_neon_intr)
get_opus_sources(SILK_SOURCES_FIXED_ARM_NEON_INTR silk_sources.mk
@@ -23,10 +25,11 @@ get_opus_sources(OPUS_SOURCES_FLOAT opus_sources.mk opus_sources_float)
get_opus_sources(CELT_HEAD celt_headers.mk celt_headers)
get_opus_sources(CELT_SOURCES celt_sources.mk celt_sources)
+get_opus_sources(CELT_SOURCES_X86_RTCD celt_sources.mk celt_sources_x86_rtcd)
get_opus_sources(CELT_SOURCES_SSE celt_sources.mk celt_sources_sse)
get_opus_sources(CELT_SOURCES_SSE2 celt_sources.mk celt_sources_sse2)
get_opus_sources(CELT_SOURCES_SSE4_1 celt_sources.mk celt_sources_sse4_1)
-get_opus_sources(CELT_SOURCES_ARM celt_sources.mk celt_sources_arm)
+get_opus_sources(CELT_SOURCES_ARM_RTCD celt_sources.mk celt_sources_arm_rtcd)
get_opus_sources(CELT_SOURCES_ARM_ASM celt_sources.mk celt_sources_arm_asm)
get_opus_sources(CELT_AM_SOURCES_ARM_ASM celt_sources.mk
celt_am_sources_arm_asm)
diff --git a/cmake/RunTest.cmake b/cmake/RunTest.cmake
new file mode 100644
index 00000000..f6f8b4a2
--- /dev/null
+++ b/cmake/RunTest.cmake
@@ -0,0 +1,61 @@
+# Runs the test binary named by TEST_EXECUTABLE and aborts with FATAL_ERROR when a
+# step fails. The caller must define TEST_EXECUTABLE; CMAKE_SYSTEM_NAME selects the
+# Android (via adb) or iOS (unsupported) branch. This script itself sets neither.
+
+if(NOT EXISTS "${TEST_EXECUTABLE}")
+  message(FATAL_ERROR "Error could not find ${TEST_EXECUTABLE}, ensure that you built the test binary")
+endif()
+
+if(CMAKE_SYSTEM_NAME STREQUAL "Android")
+  # support to run plain old binary on android devices
+  # requires android debug bridge to be installed
+  find_program(adb_executable adb)
+  if(NOT adb_executable)
+    message(FATAL_ERROR "Error could not find adb")
+  endif()
+
+  # check if any device emulator is attached
+  execute_process(COMMAND "${adb_executable}" shell echo RESULT_VARIABLE CMD_RESULT)
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error adb: no devices/emulators found")
+  endif()
+
+  # push binary
+  set(android_path /data/local/tmp)
+  execute_process(COMMAND "${adb_executable}" push "${TEST_EXECUTABLE}" "${android_path}" RESULT_VARIABLE CMD_RESULT)
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error running ${adb_executable} push ${TEST_EXECUTABLE} ${android_path} failed with result ${CMD_RESULT}")
+  endif()
+
+  # set permissions (the on-device path is derived from android_path, not a second literal)
+  get_filename_component(test_executable "${TEST_EXECUTABLE}" NAME)
+  set(test_executable_on_android "${android_path}/${test_executable}")
+  execute_process(COMMAND "${adb_executable}" shell chmod 555 "${test_executable_on_android}" RESULT_VARIABLE CMD_RESULT)
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error running ${adb_executable} shell chmod 555 ${test_executable_on_android} failed with result ${CMD_RESULT}")
+  endif()
+
+  # run executable; the failure check is deferred until after clean-up
+  execute_process(COMMAND "${adb_executable}" shell "${test_executable_on_android}" RESULT_VARIABLE TEST_RESULT)
+
+  # clean up binary, even when the test failed, so nothing is left behind on the device
+  execute_process(COMMAND "${adb_executable}" shell rm "${test_executable_on_android}" RESULT_VARIABLE CMD_RESULT)
+  if(TEST_RESULT)
+    message(FATAL_ERROR "Error running ${adb_executable} shell ${test_executable_on_android} failed with result ${TEST_RESULT}")
+  endif()
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error running ${adb_executable} shell rm ${test_executable_on_android} failed with result ${CMD_RESULT}")
+  endif()
+
+elseif(CMAKE_SYSTEM_NAME STREQUAL "iOS")
+  # CTest doesn't support iOS
+  message(FATAL_ERROR "Error CTest is not supported on iOS")
+
+else()
+  # for other platforms just execute test binary on host
+  execute_process(COMMAND "${TEST_EXECUTABLE}" RESULT_VARIABLE CMD_RESULT)
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error running ${TEST_EXECUTABLE} failed with result ${CMD_RESULT}")
+  endif()
+
+endif()
diff --git a/cmake/cpu_info_by_asm.c b/cmake/cpu_info_by_asm.c
new file mode 100644
index 00000000..1a70a815
--- /dev/null
+++ b/cmake/cpu_info_by_asm.c
@@ -0,0 +1,31 @@
+#include <stdio.h>
+int main() { /* Build-system probe: checks that the GNU-style inline-asm "cpuid"
+   sequence below is accepted by the compiler. The four outputs are never
+   inspected and the program always returns 0.
+   NOTE(review): InfoType is used as the EAX input without being initialized;
+   that is harmless only if this probe is compiled and never executed --
+   confirm how c_feature_check(CPU_INFO_BY_ASM) consumes it. */
+ unsigned int CPUInfo0;
+ unsigned int CPUInfo1;
+ unsigned int CPUInfo2;
+ unsigned int CPUInfo3;
+ unsigned int InfoType;
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx is PIC register in 32-bit, so mustn't clobber it.
+   It is swapped with operand %1 around the cpuid instruction, so the EBX
+   result ends up in a compiler-chosen register ("=r") and %ebx is restored. */
+ __asm__ __volatile__ (
+ "xchg %%ebx, %1\n"
+ "cpuid\n"
+ "xchg %%ebx, %1\n":
+ "=a" (CPUInfo0),
+ "=r" (CPUInfo1),
+ "=c" (CPUInfo2),
+ "=d" (CPUInfo3) :
+ "0" (InfoType), "2" (0) /* inputs: EAX = InfoType, ECX = 0 (tied to outputs 0 and 2) */
+ );
+#else
+ __asm__ __volatile__ (
+ "cpuid":
+ "=a" (CPUInfo0),
+ "=b" (CPUInfo1),
+ "=c" (CPUInfo2),
+ "=d" (CPUInfo3) :
+ "0" (InfoType), "2" (0) /* inputs: EAX = InfoType, ECX = 0 (tied to outputs 0 and 2) */
+ );
+#endif
+ return 0;
+}
diff --git a/cmake/cpu_info_by_c.c b/cmake/cpu_info_by_c.c
new file mode 100644
index 00000000..117084eb
--- /dev/null
+++ b/cmake/cpu_info_by_c.c
@@ -0,0 +1,9 @@
+#include <cpuid.h>
+int main() { /* Build-system probe: checks that a call to __get_cpuid_count()
+   (expected to be provided by <cpuid.h>) compiles and links. The subleaf
+   argument is 0 and the call's return value becomes the exit status.
+   NOTE(review): InfoType is passed uninitialized; harmless only if this
+   probe is compiled and never executed -- confirm how
+   c_feature_check(CPU_INFO_BY_C) consumes it. */
+ unsigned int CPUInfo0;
+ unsigned int CPUInfo1;
+ unsigned int CPUInfo2;
+ unsigned int CPUInfo3;
+ unsigned int InfoType;
+ return __get_cpuid_count(InfoType, 0, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3);
+}
diff --git a/configure.ac b/configure.ac
index f12f0aa9..1d426f27 100644
--- a/configure.ac
+++ b/configure.ac
@@ -195,6 +195,7 @@ AC_ARG_ENABLE([intrinsics],
rtcd_support=no
cpu_arm=no
+cpu_x86=no
AS_IF([test x"${enable_asm}" = x"yes"],[
inline_optimization="No inline ASM for your platform, please send patches"
@@ -535,6 +536,7 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[
],
[i?86|x86_64],
[
+ cpu_x86=yes
OPUS_CHECK_INTRINSICS(
[SSE],
[$X86_SSE_CFLAGS],
@@ -724,7 +726,7 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[
unsigned int CPUInfo2;
unsigned int CPUInfo3;
unsigned int InfoType;
- __get_cpuid(InfoType, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3);
+ __get_cpuid_count(InfoType, 0, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3);
]])],
[AC_MSG_RESULT([C method])
AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c method])],
@@ -744,6 +746,7 @@ AM_CONDITIONAL([HAVE_ARM_NEON_INTR],
[test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"])
AM_CONDITIONAL([HAVE_ARM_NE10],
[test x"$HAVE_ARM_NE10" = x"1"])
+AM_CONDITIONAL([CPU_X86], [test "$cpu_x86" = "yes"])
AM_CONDITIONAL([HAVE_SSE],
[test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"])
AM_CONDITIONAL([HAVE_SSE2],
@@ -753,6 +756,8 @@ AM_CONDITIONAL([HAVE_SSE4_1],
AM_CONDITIONAL([HAVE_AVX],
[test x"$OPUS_X86_MAY_HAVE_AVX" = x"1"])
+AM_CONDITIONAL([HAVE_RTCD],
+ [test x"$enable_rtcd" = x"yes" -a x"$rtcd_support" != x"no"])
AS_IF([test x"$enable_rtcd" = x"yes"],[
AS_IF([test x"$rtcd_support" != x"no"],[
AC_DEFINE([OPUS_HAVE_RTCD], [1],
diff --git a/fuzzer/Android.bp b/fuzzer/Android.bp
index 45ce6ab4..be47f44a 100644
--- a/fuzzer/Android.bp
+++ b/fuzzer/Android.bp
@@ -39,6 +39,14 @@ cc_defaults {
"android-media-fuzzing-reports@google.com",
],
componentid: 155276,
+ hotlists: [
+ "4593311",
+ ],
+ description: "The fuzzer targets the APIs of libopus",
+ vector: "remote",
+ service_privilege: "constrained",
+ users: "multi_user",
+ fuzzed_code_usage: "shipped",
},
}
diff --git a/include/opus.h b/include/opus.h
index d282f21d..0c69c627 100644
--- a/include/opus.h
+++ b/include/opus.h
@@ -198,7 +198,7 @@ OPUS_EXPORT OPUS_WARN_UNUSED_RESULT int opus_encoder_get_size(int channels);
* This must be one of 8000, 12000, 16000,
* 24000, or 48000.
* @param [in] channels <tt>int</tt>: Number of channels (1 or 2) in input signal
- * @param [in] application <tt>int</tt>: Coding mode (@ref OPUS_APPLICATION_VOIP/@ref OPUS_APPLICATION_AUDIO/@ref OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+ * @param [in] application <tt>int</tt>: Coding mode (one of @ref OPUS_APPLICATION_VOIP, @ref OPUS_APPLICATION_AUDIO, or @ref OPUS_APPLICATION_RESTRICTED_LOWDELAY)
* @param [out] error <tt>int*</tt>: @ref opus_errorcodes
* @note Regardless of the sampling rate and number channels selected, the Opus encoder
* can switch to a lower audio bandwidth or number of channels if the bitrate
@@ -222,7 +222,7 @@ OPUS_EXPORT OPUS_WARN_UNUSED_RESULT OpusEncoder *opus_encoder_create(
* This must be one of 8000, 12000, 16000,
* 24000, or 48000.
* @param [in] channels <tt>int</tt>: Number of channels (1 or 2) in input signal
- * @param [in] application <tt>int</tt>: Coding mode (OPUS_APPLICATION_VOIP/OPUS_APPLICATION_AUDIO/OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+ * @param [in] application <tt>int</tt>: Coding mode (one of OPUS_APPLICATION_VOIP, OPUS_APPLICATION_AUDIO, or OPUS_APPLICATION_RESTRICTED_LOWDELAY)
* @retval #OPUS_OK Success or @ref opus_errorcodes
*/
OPUS_EXPORT int opus_encoder_init(
diff --git a/include/opus_custom.h b/include/opus_custom.h
index 2227be01..2f22d4b3 100644
--- a/include/opus_custom.h
+++ b/include/opus_custom.h
@@ -104,7 +104,8 @@ typedef struct OpusCustomDecoder OpusCustomDecoder;
/** The mode contains all the information necessary to create an
encoder. Both the encoder and decoder need to be initialized
with exactly the same mode, otherwise the output will be
- corrupted.
+ corrupted. The mode MUST NOT BE DESTROYED until the encoders and
+ decoders that use it are destroyed as well.
@brief Mode configuration
*/
typedef struct OpusCustomMode OpusCustomMode;
diff --git a/include/opus_defines.h b/include/opus_defines.h
index ceee5b84..94b9e0d9 100644
--- a/include/opus_defines.h
+++ b/include/opus_defines.h
@@ -482,7 +482,8 @@ extern "C" {
* @param[in] x <tt>opus_int32</tt>: Allowed values:
* <dl>
* <dt>0</dt><dd>Disable inband FEC (default).</dd>
- * <dt>1</dt><dd>Enable inband FEC.</dd>
+ * <dt>1</dt><dd>Inband FEC enabled. If the packet loss rate is sufficiently high, Opus will automatically switch to SILK even at high rates to enable use of that FEC.</dd>
+ * <dt>2</dt><dd>Inband FEC enabled, but does not necessarily switch to SILK if we have music.</dd>
* </dl>
* @hideinitializer */
#define OPUS_SET_INBAND_FEC(x) OPUS_SET_INBAND_FEC_REQUEST, __opus_check_int(x)
@@ -491,7 +492,8 @@ extern "C" {
* @param[out] x <tt>opus_int32 *</tt>: Returns one of the following values:
* <dl>
* <dt>0</dt><dd>Inband FEC disabled (default).</dd>
- * <dt>1</dt><dd>Inband FEC enabled.</dd>
+ * <dt>1</dt><dd>Inband FEC enabled. If the packet loss rate is sufficiently high, Opus will automatically switch to SILK even at high rates to enable use of that FEC.</dd>
+ * <dt>2</dt><dd>Inband FEC enabled, but does not necessarily switch to SILK if we have music.</dd>
* </dl>
* @hideinitializer */
#define OPUS_GET_INBAND_FEC(x) OPUS_GET_INBAND_FEC_REQUEST, __opus_check_int_ptr(x)
diff --git a/libopus_blocklist.txt b/libopus_blocklist.txt
index 51db6112..84c19ee4 100644
--- a/libopus_blocklist.txt
+++ b/libopus_blocklist.txt
@@ -24,7 +24,13 @@ fun:ec_decode_bin
fun:silk_noise_shape_quantizer_del_dec
# silk/NSQ.c:265:25: 1318152552 + 1068143768 cannot be represented in type 'int'
fun:silk_noise_shape_quantizer
-
+# silk/x86/NSQ_del_dec_sse4_1.c:571:28: 1162446838 - -1165932966 cannot be represented in type 'int'
+fun:silk_noise_shape_quantizer_del_dec_sse4_1
+# silk/fixed/x86/burg_modified_FIX_sse4_1.c:277: 1940085720 + 252655088 cannot be represented
+# in type 'int'
+fun:silk_burg_modified_sse4_1
+# silk/fixed/burg_modified_FIX.c:181 1940085720 + 252655088 cannot be represented in type 'int'
+fun:silk_burg_modified_c
src:*/celt/kiss_fft.c
# assembly optimizations that know what they are doing
diff --git a/meson.build b/meson.build
index 41f69353..ed66d380 100644
--- a/meson.build
+++ b/meson.build
@@ -532,9 +532,9 @@ if not opt_intrinsics.disabled()
endif # opt_rtcd
else
if opt_intrinsics.enabled()
- error('intrinsics option enabled, but no intrinsics support for ' + host_machine.get_cpu())
+ error('intrinsics option enabled, but no intrinsics support for ' + host_cpu_family)
endif
- warning('No intrinsics support for ' + host_machine.get_cpu())
+ warning('No intrinsics support for ' + host_cpu_family)
endif
endif
diff --git a/meson/get-version.py b/meson/get-version.py
index 0e8b8623..d3835f13 100755
--- a/meson/get-version.py
+++ b/meson/get-version.py
@@ -31,7 +31,7 @@ if __name__ == '__main__':
# check if git checkout
git_dir = os.path.join(srcroot, '.git')
- is_git = os.path.isdir(git_dir)
+ is_git = os.path.isdir(git_dir) or os.path.isfile(git_dir)
have_git = shutil.which('git') is not None
if is_git and have_git:
diff --git a/opus.m4 b/opus.m4
index 47f5ec49..263470d4 100644
--- a/opus.m4
+++ b/opus.m4
@@ -63,7 +63,7 @@ dnl
#include <string.h>
#include <opus.h>
-int main ()
+int main (void)
{
system("touch conf.opustest");
return 0;
diff --git a/silk/LPC_fit.c b/silk/LPC_fit.c
index cdea4f3a..c0690a1f 100644
--- a/silk/LPC_fit.c
+++ b/silk/LPC_fit.c
@@ -31,7 +31,8 @@ POSSIBILITY OF SUCH DAMAGE.
#include "SigProc_FIX.h"
-/* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around */
+/* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around.
+ This logic is reused in _celt_lpc(). Any bug fixes should also be applied there. */
void silk_LPC_fit(
opus_int16 *a_QOUT, /* O Output signal */
opus_int32 *a_QIN, /* I/O Input signal */
diff --git a/silk/MacroDebug.h b/silk/MacroDebug.h
index 8dd4ce2e..3110da9a 100644
--- a/silk/MacroDebug.h
+++ b/silk/MacroDebug.h
@@ -55,7 +55,7 @@ static OPUS_INLINE opus_int16 silk_ADD16_(opus_int16 a, opus_int16 b, char *file
static OPUS_INLINE opus_int32 silk_ADD32_(opus_int32 a, opus_int32 b, char *file, int line){
opus_int32 ret;
- ret = a + b;
+ ret = (opus_int32)((opus_uint32)a + (opus_uint32)b);
if ( ret != silk_ADD_SAT32( a, b ) )
{
fprintf (stderr, "silk_ADD32(%d, %d) in %s: line %d\n", a, b, file, line);
@@ -101,9 +101,9 @@ static OPUS_INLINE opus_int16 silk_SUB16_(opus_int16 a, opus_int16 b, char *file
#undef silk_SUB32
#define silk_SUB32(a,b) silk_SUB32_((a), (b), __FILE__, __LINE__)
static OPUS_INLINE opus_int32 silk_SUB32_(opus_int32 a, opus_int32 b, char *file, int line){
- opus_int32 ret;
+ opus_int64 ret;
- ret = a - b;
+ ret = a - (opus_int64)b;
if ( ret != silk_SUB_SAT32( a, b ) )
{
fprintf (stderr, "silk_SUB32(%d, %d) in %s: line %d\n", a, b, file, line);
@@ -257,7 +257,7 @@ static OPUS_INLINE opus_int64 silk_SUB_SAT64_( opus_int64 a64, opus_int64 b64, c
static OPUS_INLINE opus_int32 silk_MUL_(opus_int32 a32, opus_int32 b32, char *file, int line){
opus_int32 ret;
opus_int64 ret64;
- ret = a32 * b32;
+ ret = (opus_int32)((opus_uint32)a32 * (opus_uint32)b32);
ret64 = (opus_int64)a32 * (opus_int64)b32;
if ( (opus_int64)ret != ret64 )
{
@@ -333,8 +333,8 @@ static OPUS_INLINE opus_int32 silk_SMULWB_(opus_int32 a32, opus_int32 b32, char
#define silk_SMLAWB(a,b,c) silk_SMLAWB_((a), (b), (c), __FILE__, __LINE__)
static OPUS_INLINE opus_int32 silk_SMLAWB_(opus_int32 a32, opus_int32 b32, opus_int32 c32, char *file, int line){
opus_int32 ret;
- ret = silk_ADD32( a32, silk_SMULWB( b32, c32 ) );
- if ( silk_ADD32( a32, silk_SMULWB( b32, c32 ) ) != silk_ADD_SAT32( a32, silk_SMULWB( b32, c32 ) ) )
+ ret = silk_ADD32_ovflw( a32, silk_SMULWB( b32, c32 ) );
+ if ( ret != silk_ADD_SAT32( a32, silk_SMULWB( b32, c32 ) ) )
{
fprintf (stderr, "silk_SMLAWB(%d, %d, %d) in %s: line %d\n", a32, b32, c32, file, line);
#ifdef FIXED_DEBUG_ASSERT
@@ -465,7 +465,7 @@ static OPUS_INLINE opus_int32 silk_SMULWW_(opus_int32 a32, opus_int32 b32, char
if ( fail )
{
- fprintf (stderr, "silk_SMULWT(%d, %d) in %s: line %d\n", a32, b32, file, line);
+ fprintf (stderr, "silk_SMULWW(%d, %d) in %s: line %d\n", a32, b32, file, line);
#ifdef FIXED_DEBUG_ASSERT
silk_assert( 0 );
#endif
@@ -491,12 +491,6 @@ static OPUS_INLINE opus_int32 silk_SMLAWW_(opus_int32 a32, opus_int32 b32, opus_
return ret;
}
-/* Multiply-accumulate macros that allow overflow in the addition (ie, no asserts in debug mode) */
-#undef silk_MLA_ovflw
-#define silk_MLA_ovflw(a32, b32, c32) ((a32) + ((b32) * (c32)))
-#undef silk_SMLABB_ovflw
-#define silk_SMLABB_ovflw(a32, b32, c32) ((a32) + ((opus_int32)((opus_int16)(b32))) * (opus_int32)((opus_int16)(c32)))
-
/* no checking needed for silk_SMULL
no checking needed for silk_SMLAL
no checking needed for silk_SMLALBB
@@ -546,10 +540,10 @@ static OPUS_INLINE opus_int32 silk_DIV32_16_(opus_int32 a32, opus_int32 b32, cha
static OPUS_INLINE opus_int8 silk_LSHIFT8_(opus_int8 a, opus_int32 shift, char *file, int line){
opus_int8 ret;
int fail = 0;
- ret = a << shift;
+ ret = (opus_int8)((opus_uint8)a << shift);
fail |= shift < 0;
fail |= shift >= 8;
- fail |= (opus_int64)ret != ((opus_int64)a) << shift;
+ fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift);
if ( fail )
{
fprintf (stderr, "silk_LSHIFT8(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -565,10 +559,10 @@ static OPUS_INLINE opus_int8 silk_LSHIFT8_(opus_int8 a, opus_int32 shift, char *
static OPUS_INLINE opus_int16 silk_LSHIFT16_(opus_int16 a, opus_int32 shift, char *file, int line){
opus_int16 ret;
int fail = 0;
- ret = a << shift;
+ ret = (opus_int16)((opus_uint16)a << shift);
fail |= shift < 0;
fail |= shift >= 16;
- fail |= (opus_int64)ret != ((opus_int64)a) << shift;
+ fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift);
if ( fail )
{
fprintf (stderr, "silk_LSHIFT16(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -584,10 +578,10 @@ static OPUS_INLINE opus_int16 silk_LSHIFT16_(opus_int16 a, opus_int32 shift, cha
static OPUS_INLINE opus_int32 silk_LSHIFT32_(opus_int32 a, opus_int32 shift, char *file, int line){
opus_int32 ret;
int fail = 0;
- ret = a << shift;
+ ret = (opus_int32)((opus_uint32)a << shift);
fail |= shift < 0;
fail |= shift >= 32;
- fail |= (opus_int64)ret != ((opus_int64)a) << shift;
+ fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift);
if ( fail )
{
fprintf (stderr, "silk_LSHIFT32(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -603,7 +597,7 @@ static OPUS_INLINE opus_int32 silk_LSHIFT32_(opus_int32 a, opus_int32 shift, cha
static OPUS_INLINE opus_int64 silk_LSHIFT64_(opus_int64 a, opus_int shift, char *file, int line){
opus_int64 ret;
int fail = 0;
- ret = a << shift;
+ ret = (opus_int64)((opus_uint64)a << shift);
fail |= shift < 0;
fail |= shift >= 64;
fail |= (ret>>shift) != ((opus_int64)a);
@@ -714,8 +708,8 @@ static OPUS_INLINE opus_uint32 silk_RSHIFT_uint_(opus_uint32 a, opus_int32 shift
#define silk_ADD_LSHIFT(a,b,c) silk_ADD_LSHIFT_((a), (b), (c), __FILE__, __LINE__)
static OPUS_INLINE int silk_ADD_LSHIFT_(int a, int b, int shift, char *file, int line){
opus_int16 ret;
- ret = a + (b << shift);
- if ( (shift < 0) || (shift>15) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) )
+ ret = a + (opus_int16)((opus_uint16)b << shift);
+ if ( (shift < 0) || (shift>15) || ((opus_int64)ret != (opus_int64)a + (opus_int64)(((opus_uint64)b) << shift)) )
{
fprintf (stderr, "silk_ADD_LSHIFT(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
#ifdef FIXED_DEBUG_ASSERT
@@ -729,8 +723,8 @@ static OPUS_INLINE int silk_ADD_LSHIFT_(int a, int b, int shift, char *file, int
#define silk_ADD_LSHIFT32(a,b,c) silk_ADD_LSHIFT32_((a), (b), (c), __FILE__, __LINE__)
static OPUS_INLINE opus_int32 silk_ADD_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
opus_int32 ret;
- ret = a + (b << shift);
- if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) )
+ ret = silk_ADD32_ovflw(a, (opus_int32)((opus_uint32)b << shift));
+ if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (opus_int64)(((opus_uint64)b) << shift)) )
{
fprintf (stderr, "silk_ADD_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
#ifdef FIXED_DEBUG_ASSERT
@@ -774,7 +768,7 @@ static OPUS_INLINE int silk_ADD_RSHIFT_(int a, int b, int shift, char *file, int
#define silk_ADD_RSHIFT32(a,b,c) silk_ADD_RSHIFT32_((a), (b), (c), __FILE__, __LINE__)
static OPUS_INLINE opus_int32 silk_ADD_RSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
opus_int32 ret;
- ret = a + (b >> shift);
+ ret = silk_ADD32_ovflw(a, (b >> shift));
if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) >> shift)) )
{
fprintf (stderr, "silk_ADD_RSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
@@ -804,8 +798,8 @@ static OPUS_INLINE opus_uint32 silk_ADD_RSHIFT_uint_(opus_uint32 a, opus_uint32
#define silk_SUB_LSHIFT32(a,b,c) silk_SUB_LSHIFT32_((a), (b), (c), __FILE__, __LINE__)
static OPUS_INLINE opus_int32 silk_SUB_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
opus_int32 ret;
- ret = a - (b << shift);
- if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (((opus_int64)b) << shift)) )
+ ret = silk_SUB32_ovflw(a, (opus_int32)((opus_uint32)b << shift));
+ if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (opus_int64)(((opus_uint64)b) << shift)) )
{
fprintf (stderr, "silk_SUB_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
#ifdef FIXED_DEBUG_ASSERT
@@ -819,7 +813,7 @@ static OPUS_INLINE opus_int32 silk_SUB_LSHIFT32_(opus_int32 a, opus_int32 b, opu
#define silk_SUB_RSHIFT32(a,b,c) silk_SUB_RSHIFT32_((a), (b), (c), __FILE__, __LINE__)
static OPUS_INLINE opus_int32 silk_SUB_RSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
opus_int32 ret;
- ret = a - (b >> shift);
+ ret = silk_SUB32_ovflw(a, (b >> shift));
if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (((opus_int64)b) >> shift)) )
{
fprintf (stderr, "silk_SUB_RSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
@@ -835,7 +829,7 @@ static OPUS_INLINE opus_int32 silk_SUB_RSHIFT32_(opus_int32 a, opus_int32 b, opu
static OPUS_INLINE opus_int32 silk_RSHIFT_ROUND_(opus_int32 a, opus_int32 shift, char *file, int line){
opus_int32 ret;
ret = shift == 1 ? (a >> 1) + (a & 1) : ((a >> (shift - 1)) + 1) >> 1;
- /* the marco definition can't handle a shift of zero */
+ /* the macro definition can't handle a shift of zero */
if ( (shift <= 0) || (shift>31) || ((opus_int64)ret != ((opus_int64)a + ((opus_int64)1 << (shift - 1))) >> shift) )
{
fprintf (stderr, "silk_RSHIFT_ROUND(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -850,7 +844,7 @@ static OPUS_INLINE opus_int32 silk_RSHIFT_ROUND_(opus_int32 a, opus_int32 shift,
#define silk_RSHIFT_ROUND64(a,b) silk_RSHIFT_ROUND64_((a), (b), __FILE__, __LINE__)
static OPUS_INLINE opus_int64 silk_RSHIFT_ROUND64_(opus_int64 a, opus_int32 shift, char *file, int line){
opus_int64 ret;
- /* the marco definition can't handle a shift of zero */
+ /* the macro definition can't handle a shift of zero */
if ( (shift <= 0) || (shift>=64) )
{
fprintf (stderr, "silk_RSHIFT_ROUND64(%lld, %d) in %s: line %d\n", (long long)a, shift, file, line);
diff --git a/silk/NSQ.c b/silk/NSQ.c
index 1d64d8e2..45dd45ce 100644
--- a/silk/NSQ.c
+++ b/silk/NSQ.c
@@ -75,21 +75,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer(
void silk_NSQ_c
(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int16 x16[], /* I Input */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
)
{
opus_int k, lag, start_idx, LSF_interpolation_flag;
@@ -173,9 +173,9 @@ void silk_NSQ_c
RESTORE_STACK;
}
-/***********************************/
-/* silk_noise_shape_quantizer */
-/***********************************/
+/******************************/
+/* silk_noise_shape_quantizer */
+/******************************/
#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
static OPUS_INLINE
@@ -262,7 +262,7 @@ void silk_noise_shape_quantizer(
tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */
if( lag > 0 ) {
/* Symmetric, packed FIR coefficients */
- n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+ n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
shp_lag_ptr++;
diff --git a/silk/NSQ_del_dec.c b/silk/NSQ_del_dec.c
index 00e749c3..41f3fc93 100644
--- a/silk/NSQ_del_dec.c
+++ b/silk/NSQ_del_dec.c
@@ -115,21 +115,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec(
);
void silk_NSQ_del_dec_c(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int16 x16[], /* I Input */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
)
{
opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index a9068908..65fe6a07 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -381,7 +381,7 @@ opus_int32 silk_inner_prod_aligned_scale(
const opus_int len /* I vector lengths */
);
-opus_int64 silk_inner_prod16_aligned_64_c(
+opus_int64 silk_inner_prod16_c(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */
@@ -630,12 +630,14 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b)
/* the following seems faster on x86 */
#define silk_SMMUL(a32, b32) (opus_int32)silk_RSHIFT64(silk_SMULL((a32), (b32)), 32)
-#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#if !defined(OVERRIDE_silk_burg_modified)
#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+#endif
-#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
- ((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len))
+#if !defined(OVERRIDE_silk_inner_prod16)
+#define silk_inner_prod16(inVec1, inVec2, len, arch) \
+ ((void)(arch),silk_inner_prod16_c(inVec1, inVec2, len))
#endif
#include "Inlines.h"
diff --git a/silk/VQ_WMat_EC.c b/silk/VQ_WMat_EC.c
index 0f3d545c..245a7e4b 100644
--- a/silk/VQ_WMat_EC.c
+++ b/silk/VQ_WMat_EC.c
@@ -64,7 +64,7 @@ void silk_VQ_WMat_EC_c(
*rate_dist_Q8 = silk_int32_MAX;
*res_nrg_Q15 = silk_int32_MAX;
cb_row_Q7 = cb_Q7;
- /* In things go really bad, at least *ind is set to something safe. */
+ /* If things go really bad, at least *ind is set to something safe. */
*ind = 0;
for( k = 0; k < L; k++ ) {
opus_int32 penalty;
@@ -115,7 +115,7 @@ void silk_VQ_WMat_EC_c(
if( sum1_Q15 >= 0 ) {
/* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
- /* In the following line we reduce the codelength component by half ("-1"); seems to slghtly improve quality */
+ /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
if( bits_tot_Q8 <= *rate_dist_Q8 ) {
*rate_dist_Q8 = bits_tot_Q8;
diff --git a/silk/bwexpander_32.c b/silk/bwexpander_32.c
index d0010f73..0f32b9df 100644
--- a/silk/bwexpander_32.c
+++ b/silk/bwexpander_32.c
@@ -31,7 +31,8 @@ POSSIBILITY OF SUCH DAMAGE.
#include "SigProc_FIX.h"
-/* Chirp (bandwidth expand) LP AR filter */
+/* Chirp (bandwidth expand) LP AR filter.
+ This logic is reused in _celt_lpc(). Any bug fixes should also be applied there. */
void silk_bwexpander_32(
opus_int32 *ar, /* I/O AR filter to be expanded (without leading 1) */
const opus_int d, /* I Length of ar */
diff --git a/silk/control_codec.c b/silk/control_codec.c
index 52aa8fde..784ffe66 100644
--- a/silk/control_codec.c
+++ b/silk/control_codec.c
@@ -415,7 +415,7 @@ static OPUS_INLINE opus_int silk_setup_LBRR(
/* Previous packet did not have LBRR, and was therefore coded at a higher bitrate */
psEncC->LBRR_GainIncreases = 7;
} else {
- psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.4, 16 ) ), 2 );
+ psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.2, 16 ) ), 3 );
}
}
diff --git a/silk/enc_API.c b/silk/enc_API.c
index 55a33f37..548e0736 100644
--- a/silk/enc_API.c
+++ b/silk/enc_API.c
@@ -270,6 +270,7 @@ opus_int silk_Encode( /* O Returns error co
psEnc->state_Fxx[ 0 ].sCmn.fs_kHz * 1000 );
ALLOC( buf, nSamplesFromInputMax, opus_int16 );
while( 1 ) {
+ int curr_nBitsUsedLBRR = 0;
nSamplesToBuffer = psEnc->state_Fxx[ 0 ].sCmn.frame_length - psEnc->state_Fxx[ 0 ].sCmn.inputBufIx;
nSamplesToBuffer = silk_min( nSamplesToBuffer, nSamplesToBufferMax );
nSamplesFromInput = silk_DIV32_16( nSamplesToBuffer * psEnc->state_Fxx[ 0 ].sCmn.API_fs_Hz, psEnc->state_Fxx[ 0 ].sCmn.fs_kHz * 1000 );
@@ -342,6 +343,7 @@ opus_int silk_Encode( /* O Returns error co
opus_uint8 iCDF[ 2 ] = { 0, 0 };
iCDF[ 0 ] = 256 - silk_RSHIFT( 256, ( psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket + 1 ) * encControl->nChannelsInternal );
ec_enc_icdf( psRangeEnc, 0, iCDF, 8 );
+ curr_nBitsUsedLBRR = ec_tell( psRangeEnc );
/* Encode any LBRR data from previous packet */
/* Encode LBRR flags */
@@ -386,8 +388,7 @@ opus_int silk_Encode( /* O Returns error co
for( n = 0; n < encControl->nChannelsInternal; n++ ) {
silk_memset( psEnc->state_Fxx[ n ].sCmn.LBRR_flags, 0, sizeof( psEnc->state_Fxx[ n ].sCmn.LBRR_flags ) );
}
-
- psEnc->nBitsUsedLBRR = ec_tell( psRangeEnc );
+ curr_nBitsUsedLBRR = ec_tell( psRangeEnc ) - curr_nBitsUsedLBRR;
}
silk_HP_variable_cutoff( psEnc->state_Fxx );
@@ -396,6 +397,16 @@ opus_int silk_Encode( /* O Returns error co
nBits = silk_DIV32_16( silk_MUL( encControl->bitRate, encControl->payloadSize_ms ), 1000 );
/* Subtract bits used for LBRR */
if( !prefillFlag ) {
+ /* psEnc->nBitsUsedLBRR is an exponential moving average of the LBRR usage,
+ except that for the first LBRR frame it does no averaging and for the first
+ frame after LBRR, it goes back to zero immediately. */
+ if ( curr_nBitsUsedLBRR < 10 ) {
+ psEnc->nBitsUsedLBRR = 0;
+ } else if ( psEnc->nBitsUsedLBRR < 10) {
+ psEnc->nBitsUsedLBRR = curr_nBitsUsedLBRR;
+ } else {
+ psEnc->nBitsUsedLBRR = ( psEnc->nBitsUsedLBRR + curr_nBitsUsedLBRR ) / 2;
+ }
nBits -= psEnc->nBitsUsedLBRR;
}
/* Divide by number of uncoded frames left in packet */
diff --git a/silk/fixed/LTP_scale_ctrl_FIX.c b/silk/fixed/LTP_scale_ctrl_FIX.c
index 3dcedef8..db1016e0 100644
--- a/silk/fixed/LTP_scale_ctrl_FIX.c
+++ b/silk/fixed/LTP_scale_ctrl_FIX.c
@@ -42,9 +42,14 @@ void silk_LTP_scale_ctrl_FIX(
if( condCoding == CODE_INDEPENDENTLY ) {
/* Only scale if first frame in packet */
- round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket;
- psEnc->sCmn.indices.LTP_scaleIndex = (opus_int8)silk_LIMIT(
- silk_SMULWB( silk_SMULBB( round_loss, psEncCtrl->LTPredCodGain_Q7 ), SILK_FIX_CONST( 0.1, 9 ) ), 0, 2 );
+ round_loss = psEnc->sCmn.PacketLoss_perc * psEnc->sCmn.nFramesPerPacket;
+ if ( psEnc->sCmn.LBRR_flag ) {
+ /* LBRR reduces the effective loss. In practice, it does not square the loss because
+ losses aren't independent, but that still seems to work best. We also never go below 2%. */
+ round_loss = 2 + silk_SMULBB( round_loss, round_loss ) / 100;
+ }
+ psEnc->sCmn.indices.LTP_scaleIndex = silk_SMULBB( psEncCtrl->LTPredCodGain_Q7, round_loss ) > silk_log2lin( 128*7 + 2900-psEnc->sCmn.SNR_dB_Q7 );
+ psEnc->sCmn.indices.LTP_scaleIndex += silk_SMULBB( psEncCtrl->LTPredCodGain_Q7, round_loss ) > silk_log2lin( 128*7 + 3900-psEnc->sCmn.SNR_dB_Q7 );
} else {
/* Default is minimum scaling */
psEnc->sCmn.indices.LTP_scaleIndex = 0;
diff --git a/silk/fixed/burg_modified_FIX.c b/silk/fixed/burg_modified_FIX.c
index 274d4b28..185a12b1 100644
--- a/silk/fixed/burg_modified_FIX.c
+++ b/silk/fixed/burg_modified_FIX.c
@@ -68,7 +68,7 @@ void silk_burg_modified_c(
celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
/* Compute autocorrelations, added over subframes */
- C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch );
+ C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
lz = silk_CLZ64(C0_64);
rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
@@ -87,7 +87,7 @@ void silk_burg_modified_c(
x_ptr = x + s * subfr_length;
for( n = 1; n < D + 1; n++ ) {
C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
- silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
+ silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
}
}
} else {
@@ -150,7 +150,7 @@ void silk_burg_modified_c(
C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
- /* We sometimes have get overflows in the multiplications (even beyond +/- 2^32),
+ /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
but they cancel each other and the real result seems to always fit in a 32-bit
signed integer. This was determined experimentally, not theoretically (unfortunately). */
tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
@@ -253,7 +253,7 @@ void silk_burg_modified_c(
if( rshifts > 0 ) {
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
- C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
+ C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
}
} else {
for( s = 0; s < nb_subfr; s++ ) {
diff --git a/silk/fixed/find_pred_coefs_FIX.c b/silk/fixed/find_pred_coefs_FIX.c
index 606d8633..ad363fb7 100644
--- a/silk/fixed/find_pred_coefs_FIX.c
+++ b/silk/fixed/find_pred_coefs_FIX.c
@@ -42,7 +42,8 @@ void silk_find_pred_coefs_FIX(
{
opus_int i;
opus_int32 invGains_Q16[ MAX_NB_SUBFR ], local_gains[ MAX_NB_SUBFR ];
- opus_int16 NLSF_Q15[ MAX_LPC_ORDER ];
+ /* Set NLSF_Q15 to zero so we don't copy junk to the state. */
+ opus_int16 NLSF_Q15[ MAX_LPC_ORDER ]={0};
const opus_int16 *x_ptr;
opus_int16 *x_pre_ptr;
VARDECL( opus_int16, LPC_in_pre );
diff --git a/silk/fixed/vector_ops_FIX.c b/silk/fixed/vector_ops_FIX.c
index d9498001..dcf84070 100644
--- a/silk/fixed/vector_ops_FIX.c
+++ b/silk/fixed/vector_ops_FIX.c
@@ -87,7 +87,7 @@ opus_int32 silk_inner_prod_aligned(
#endif
}
-opus_int64 silk_inner_prod16_aligned_64_c(
+opus_int64 silk_inner_prod16_c(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */
diff --git a/silk/fixed/x86/burg_modified_FIX_sse4_1.c b/silk/fixed/x86/burg_modified_FIX_sse4_1.c
index bbb1ce0f..e58bf079 100644
--- a/silk/fixed/x86/burg_modified_FIX_sse4_1.c
+++ b/silk/fixed/x86/burg_modified_FIX_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
- Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -42,7 +42,7 @@
#define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */
#define QA 25
-#define N_BITS_HEAD_ROOM 2
+#define N_BITS_HEAD_ROOM 3
#define MIN_RSHIFTS -16
#define MAX_RSHIFTS (32 - QA)
@@ -59,7 +59,7 @@ void silk_burg_modified_sse4_1(
int arch /* I Run-time architecture */
)
{
- opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
+ opus_int k, n, s, lz, rshifts, reached_max_gain;
opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
const opus_int16 *x_ptr;
opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ];
@@ -68,6 +68,7 @@ void silk_burg_modified_sse4_1(
opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ];
opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ];
opus_int32 xcorr[ SILK_MAX_ORDER_LPC ];
+ opus_int64 C0_64;
__m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
__m128i CONST1 = _mm_set1_epi32(1);
@@ -75,23 +76,18 @@ void silk_burg_modified_sse4_1(
celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
/* Compute autocorrelations, added over subframes */
- silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
- if( rshifts > MAX_RSHIFTS ) {
- C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
- silk_assert( C0 > 0 );
- rshifts = MAX_RSHIFTS;
+ C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
+ lz = silk_CLZ64(C0_64);
+ rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
+ if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
+ if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS;
+
+ if (rshifts > 0) {
+ C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts );
} else {
- lz = silk_CLZ32( C0 ) - 1;
- rshifts_extra = N_BITS_HEAD_ROOM - lz;
- if( rshifts_extra > 0 ) {
- rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
- C0 = silk_RSHIFT32( C0, rshifts_extra );
- } else {
- rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
- C0 = silk_LSHIFT32( C0, -rshifts_extra );
- }
- rshifts += rshifts_extra;
+ C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts );
}
+
CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */
silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
if( rshifts > 0 ) {
@@ -99,7 +95,7 @@ void silk_burg_modified_sse4_1(
x_ptr = x + s * subfr_length;
for( n = 1; n < D + 1; n++ ) {
C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
- silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
+ silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
}
}
} else {
@@ -203,8 +199,11 @@ void silk_burg_modified_sse4_1(
C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
- tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
- tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
+ /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
+ but they cancel each other and the real result seems to always fit in a 32-bit
+ signed integer. This was determined experimentally, not theoretically (unfortunately). */
+ tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
+ tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
}
tmp1 = -tmp1; /* Q17 */
@@ -350,7 +349,7 @@ void silk_burg_modified_sse4_1(
if( rshifts > 0 ) {
for( s = 0; s < nb_subfr; s++ ) {
x_ptr = x + s * subfr_length;
- C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
+ C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
}
} else {
for( s = 0; s < nb_subfr; s++ ) {
@@ -374,4 +373,28 @@ void silk_burg_modified_sse4_1(
*res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
*res_nrg_Q = -rshifts;
}
+
+#ifdef OPUS_CHECK_ASM
+ {
+ opus_int32 res_nrg_c = 0;
+ opus_int res_nrg_Q_c = 0;
+ opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0};
+
+ silk_burg_modified_c(
+ &res_nrg_c,
+ &res_nrg_Q_c,
+ A_Q16_c,
+ x,
+ minInvGain_Q30,
+ subfr_length,
+ nb_subfr,
+ D,
+ 0
+ );
+
+ silk_assert( *res_nrg == res_nrg_c );
+ silk_assert( *res_nrg_Q == res_nrg_Q_c );
+ silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) );
+ }
+#endif
}
diff --git a/silk/fixed/x86/prefilter_FIX_sse.c b/silk/fixed/x86/prefilter_FIX_sse.c
deleted file mode 100644
index 555432cd..00000000
--- a/silk/fixed/x86/prefilter_FIX_sse.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/* Copyright (c) 2014, Cisco Systems, INC
- Written by XiangMingZhu WeiZhou MinPeng YanWang
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
- OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#include <smmintrin.h>
-#include "main.h"
-#include "celt/x86/x86cpu.h"
-
-void silk_warped_LPC_analysis_filter_FIX_sse4_1(
- opus_int32 state[], /* I/O State [order + 1] */
- opus_int32 res_Q2[], /* O Residual signal [length] */
- const opus_int16 coef_Q13[], /* I Coefficients [order] */
- const opus_int16 input[], /* I Input signal [length] */
- const opus_int16 lambda_Q16, /* I Warping factor */
- const opus_int length, /* I Length of input signal */
- const opus_int order /* I Filter order (even) */
-)
-{
- opus_int n, i;
- opus_int32 acc_Q11, tmp1, tmp2;
-
- /* Order must be even */
- celt_assert( ( order & 1 ) == 0 );
-
- if (order == 10)
- {
- if (0 == lambda_Q16)
- {
- __m128i coef_Q13_3210, coef_Q13_7654;
- __m128i coef_Q13_0123, coef_Q13_4567;
- __m128i state_0123, state_4567;
- __m128i xmm_product1, xmm_product2;
- __m128i xmm_tempa, xmm_tempb;
-
- register opus_int32 sum;
- register opus_int32 state_8, state_9, state_a;
- register opus_int64 coef_Q13_8, coef_Q13_9;
-
- celt_assert( length > 0 );
-
- coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );
- coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );
-
- coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
- coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-
- coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];
- coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];
-
- state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );
- state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );
-
- state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
- state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-
- state_8 = state[ 8 ];
- state_9 = state[ 9 ];
- state_a = 0;
-
- for( n = 0; n < length; n++ )
- {
- xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */
- xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );
-
- xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
- xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-
- xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */
- xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );
-
- xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );
- xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );
-
- xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );
- xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );
-
- xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );
- xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );
- xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );
-
- sum = (opus_int32)((coef_Q13_8 * state_8) >> 16);
- sum += (opus_int32)((coef_Q13_9 * state_9) >> 16);
-
- xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
- sum += _mm_cvtsi128_si32( xmm_tempa);
- res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9);
-
- /* move right */
- state_a = state_9;
- state_9 = state_8;
- state_8 = _mm_cvtsi128_si32( state_4567 );
- state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );
-
- state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 );
- }
-
- _mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
- _mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
- state[ 8 ] = state_8;
- state[ 9 ] = state_9;
- state[ 10 ] = state_a;
-
- return;
- }
- }
-
- for( n = 0; n < length; n++ ) {
- /* Output of lowpass section */
- tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
- state[ 0 ] = silk_LSHIFT( input[ n ], 14 );
- /* Output of allpass section */
- tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
- state[ 1 ] = tmp2;
- acc_Q11 = silk_RSHIFT( order, 1 );
- acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
- /* Loop over allpass sections */
- for( i = 2; i < order; i += 2 ) {
- /* Output of allpass section */
- tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
- state[ i ] = tmp1;
- acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
- /* Output of allpass section */
- tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
- state[ i + 1 ] = tmp2;
- acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
- }
- state[ order ] = tmp1;
- acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
- res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
- }
-}
diff --git a/silk/fixed/x86/vector_ops_FIX_sse4_1.c b/silk/fixed/x86/vector_ops_FIX_sse4_1.c
index c1e90564..a46289bb 100644
--- a/silk/fixed/x86/vector_ops_FIX_sse4_1.c
+++ b/silk/fixed/x86/vector_ops_FIX_sse4_1.c
@@ -36,40 +36,38 @@
#include "SigProc_FIX.h"
#include "pitch.h"
+#include "celt/x86/x86cpu.h"
-opus_int64 silk_inner_prod16_aligned_64_sse4_1(
+opus_int64 silk_inner_prod16_sse4_1(
const opus_int16 *inVec1, /* I input vector 1 */
const opus_int16 *inVec2, /* I input vector 2 */
const opus_int len /* I vector lengths */
)
{
- opus_int i, dataSize8;
+ opus_int i, dataSize4;
opus_int64 sum;
- __m128i xmm_tempa;
- __m128i inVec1_76543210, acc1;
- __m128i inVec2_76543210, acc2;
+ __m128i xmm_prod_20, xmm_prod_31;
+ __m128i inVec1_3210, acc1;
+ __m128i inVec2_3210, acc2;
sum = 0;
- dataSize8 = len & ~7;
+ dataSize4 = len & ~3;
acc1 = _mm_setzero_si128();
acc2 = _mm_setzero_si128();
- for( i = 0; i < dataSize8; i += 8 ) {
- inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) );
- inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) );
+ for( i = 0; i < dataSize4; i += 4 ) {
+ inVec1_3210 = OP_CVTEPI16_EPI32_M64( &inVec1[i + 0] );
+ inVec2_3210 = OP_CVTEPI16_EPI32_M64( &inVec2[i + 0] );
+ xmm_prod_20 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
- /* only when all 4 operands are -32768 (0x8000), this results in wrap around */
- inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );
+ inVec1_3210 = _mm_shuffle_epi32( inVec1_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+ inVec2_3210 = _mm_shuffle_epi32( inVec2_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+ xmm_prod_31 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
- xmm_tempa = _mm_cvtepi32_epi64( inVec1_76543210 );
- /* equal shift right 8 bytes */
- inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
- inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );
-
- acc1 = _mm_add_epi64( acc1, xmm_tempa );
- acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
+ acc1 = _mm_add_epi64( acc1, xmm_prod_20 );
+ acc2 = _mm_add_epi64( acc2, xmm_prod_31 );
}
acc1 = _mm_add_epi64( acc1, acc2 );
@@ -81,8 +79,15 @@ opus_int64 silk_inner_prod16_aligned_64_sse4_1(
_mm_storel_epi64( (__m128i *)&sum, acc1 );
for( ; i < len; i++ ) {
- sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
+ sum = silk_SMLALBB( sum, inVec1[ i ], inVec2[ i ] );
+ }
+
+#ifdef OPUS_CHECK_ASM
+ {
+ opus_int64 sum_c = silk_inner_prod16_c( inVec1, inVec2, len );
+ silk_assert( sum == sum_c );
}
+#endif
return sum;
}
diff --git a/silk/float/LTP_scale_ctrl_FLP.c b/silk/float/LTP_scale_ctrl_FLP.c
index 8dbe29d0..6f30ff09 100644
--- a/silk/float/LTP_scale_ctrl_FLP.c
+++ b/silk/float/LTP_scale_ctrl_FLP.c
@@ -41,8 +41,14 @@ void silk_LTP_scale_ctrl_FLP(
if( condCoding == CODE_INDEPENDENTLY ) {
/* Only scale if first frame in packet */
- round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket;
- psEnc->sCmn.indices.LTP_scaleIndex = (opus_int8)silk_LIMIT( round_loss * psEncCtrl->LTPredCodGain * 0.1f, 0.0f, 2.0f );
+ round_loss = psEnc->sCmn.PacketLoss_perc * psEnc->sCmn.nFramesPerPacket;
+ if ( psEnc->sCmn.LBRR_flag ) {
+ /* LBRR reduces the effective loss. In practice, it does not square the loss because
+ losses aren't independent, but that still seems to work best. We also never go below 2%. */
+ round_loss = 2 + silk_SMULBB( round_loss, round_loss) / 100;
+ }
+ psEnc->sCmn.indices.LTP_scaleIndex = silk_SMULBB( psEncCtrl->LTPredCodGain, round_loss ) > silk_log2lin( 2900 - psEnc->sCmn.SNR_dB_Q7 );
+ psEnc->sCmn.indices.LTP_scaleIndex += silk_SMULBB( psEncCtrl->LTPredCodGain, round_loss ) > silk_log2lin( 3900 - psEnc->sCmn.SNR_dB_Q7 );
} else {
/* Default is minimum scaling */
psEnc->sCmn.indices.LTP_scaleIndex = 0;
diff --git a/silk/float/find_pred_coefs_FLP.c b/silk/float/find_pred_coefs_FLP.c
index dcf7c520..6f790788 100644
--- a/silk/float/find_pred_coefs_FLP.c
+++ b/silk/float/find_pred_coefs_FLP.c
@@ -44,7 +44,8 @@ void silk_find_pred_coefs_FLP(
silk_float XXLTP[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ];
silk_float xXLTP[ MAX_NB_SUBFR * LTP_ORDER ];
silk_float invGains[ MAX_NB_SUBFR ];
- opus_int16 NLSF_Q15[ MAX_LPC_ORDER ];
+ /* Set NLSF_Q15 to zero so we don't copy junk to the state. */
+ opus_int16 NLSF_Q15[ MAX_LPC_ORDER ]={0};
const silk_float *x_ptr;
silk_float *x_pre_ptr, LPC_in_pre[ MAX_NB_SUBFR * MAX_LPC_ORDER + MAX_FRAME_LENGTH ];
silk_float minInvGain;
diff --git a/silk/float/wrappers_FLP.c b/silk/float/wrappers_FLP.c
index ad90b874..c0c183e3 100644
--- a/silk/float/wrappers_FLP.c
+++ b/silk/float/wrappers_FLP.c
@@ -190,12 +190,14 @@ void silk_quant_LTP_gains_FLP(
opus_int32 XX_Q17[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ];
opus_int32 xX_Q17[ MAX_NB_SUBFR * LTP_ORDER ];
- for( i = 0; i < nb_subfr * LTP_ORDER * LTP_ORDER; i++ ) {
+ i = 0;
+ do {
XX_Q17[ i ] = (opus_int32)silk_float2int( XX[ i ] * 131072.0f );
- }
- for( i = 0; i < nb_subfr * LTP_ORDER; i++ ) {
+ } while ( ++i < nb_subfr * LTP_ORDER * LTP_ORDER );
+ i = 0;
+ do {
xX_Q17[ i ] = (opus_int32)silk_float2int( xX[ i ] * 131072.0f );
- }
+ } while ( ++i < nb_subfr * LTP_ORDER );
silk_quant_LTP_gains( B_Q14, cbk_index, periodicity_index, sum_log_gain_Q7, &pred_gain_dB_Q7, XX_Q17, xX_Q17, subfr_len, nb_subfr, arch );
diff --git a/silk/main.h b/silk/main.h
index 1a33eed5..a5f56875 100644
--- a/silk/main.h
+++ b/silk/main.h
@@ -247,21 +247,21 @@ void silk_VQ_WMat_EC_c(
/************************************/
void silk_NSQ_c(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int16 x16[], /* I Input */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
#if !defined(OVERRIDE_silk_NSQ)
@@ -273,21 +273,21 @@ void silk_NSQ_c(
/* Noise shaping using delayed decision */
void silk_NSQ_del_dec_c(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int16 x16[], /* I Input */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
#if !defined(OVERRIDE_silk_NSQ_del_dec)
diff --git a/silk/meson.build b/silk/meson.build
index 70692372..917048b2 100644
--- a/silk/meson.build
+++ b/silk/meson.build
@@ -21,6 +21,16 @@ endif
silk_includes = [opus_includes, include_directories('float', 'fixed')]
silk_static_libs = []
+if host_cpu_family in ['x86', 'x86_64'] and opus_conf.has('OPUS_HAVE_RTCD')
+ silk_sources += sources['SILK_SOURCES_X86_RTCD']
+endif
+
+if host_cpu_family in ['arm', 'aarch64'] and have_arm_intrinsics_or_asm
+ if opus_conf.has('OPUS_HAVE_RTCD')
+ silk_sources += sources['SILK_SOURCES_ARM_RTCD']
+ endif
+endif
+
foreach intr_name : ['sse4_1', 'neon_intr']
have_intr = get_variable('have_' + intr_name)
if not have_intr
diff --git a/silk/stereo_LR_to_MS.c b/silk/stereo_LR_to_MS.c
index c8226663..751452cb 100644
--- a/silk/stereo_LR_to_MS.c
+++ b/silk/stereo_LR_to_MS.c
@@ -77,7 +77,7 @@ void silk_stereo_LR_to_MS(
ALLOC( LP_mid, frame_length, opus_int16 );
ALLOC( HP_mid, frame_length, opus_int16 );
for( n = 0; n < frame_length; n++ ) {
- sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 2 );
+ sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 2 );
LP_mid[ n ] = sum;
HP_mid[ n ] = mid[ n + 1 ] - sum;
}
@@ -86,7 +86,7 @@ void silk_stereo_LR_to_MS(
ALLOC( LP_side, frame_length, opus_int16 );
ALLOC( HP_side, frame_length, opus_int16 );
for( n = 0; n < frame_length; n++ ) {
- sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT( side[ n ] + (opus_int32)side[ n + 2 ], side[ n + 1 ], 1 ), 2 );
+ sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT32( side[ n ] + (opus_int32)side[ n + 2 ], side[ n + 1 ], 1 ), 2 );
LP_side[ n ] = sum;
HP_side[ n ] = side[ n + 1 ] - sum;
}
@@ -207,7 +207,7 @@ void silk_stereo_LR_to_MS(
pred0_Q13 += delta0_Q13;
pred1_Q13 += delta1_Q13;
w_Q24 += deltaw_Q24;
- sum = silk_LSHIFT( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 ); /* Q11 */
+ sum = silk_LSHIFT( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 ); /* Q11 */
sum = silk_SMLAWB( silk_SMULWB( w_Q24, side[ n + 1 ] ), sum, pred0_Q13 ); /* Q8 */
sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)mid[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */
x2[ n - 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
@@ -217,7 +217,7 @@ void silk_stereo_LR_to_MS(
pred1_Q13 = -pred_Q13[ 1 ];
w_Q24 = silk_LSHIFT( width_Q14, 10 );
for( n = STEREO_INTERP_LEN_MS * fs_kHz; n < frame_length; n++ ) {
- sum = silk_LSHIFT( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 ); /* Q11 */
+ sum = silk_LSHIFT( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 ); /* Q11 */
sum = silk_SMLAWB( silk_SMULWB( w_Q24, side[ n + 1 ] ), sum, pred0_Q13 ); /* Q8 */
sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)mid[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */
x2[ n - 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
diff --git a/silk/stereo_MS_to_LR.c b/silk/stereo_MS_to_LR.c
index 62521a4f..1e01bb6e 100644
--- a/silk/stereo_MS_to_LR.c
+++ b/silk/stereo_MS_to_LR.c
@@ -59,7 +59,7 @@ void silk_stereo_MS_to_LR(
for( n = 0; n < STEREO_INTERP_LEN_MS * fs_kHz; n++ ) {
pred0_Q13 += delta0_Q13;
pred1_Q13 += delta1_Q13;
- sum = silk_LSHIFT( silk_ADD_LSHIFT( x1[ n ] + x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 ); /* Q11 */
+ sum = silk_LSHIFT( silk_ADD_LSHIFT32( x1[ n ] + (opus_int32)x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 ); /* Q11 */
sum = silk_SMLAWB( silk_LSHIFT( (opus_int32)x2[ n + 1 ], 8 ), sum, pred0_Q13 ); /* Q8 */
sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)x1[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */
x2[ n + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
@@ -67,7 +67,7 @@ void silk_stereo_MS_to_LR(
pred0_Q13 = pred_Q13[ 0 ];
pred1_Q13 = pred_Q13[ 1 ];
for( n = STEREO_INTERP_LEN_MS * fs_kHz; n < frame_length; n++ ) {
- sum = silk_LSHIFT( silk_ADD_LSHIFT( x1[ n ] + x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 ); /* Q11 */
+ sum = silk_LSHIFT( silk_ADD_LSHIFT32( x1[ n ] + (opus_int32)x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 ); /* Q11 */
sum = silk_SMLAWB( silk_LSHIFT( (opus_int32)x2[ n + 1 ], 8 ), sum, pred0_Q13 ); /* Q8 */
sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)x1[ n + 1 ], 11 ), pred1_Q13 ); /* Q8 */
x2[ n + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
diff --git a/silk/tests/test_unit_LPC_inv_pred_gain.c b/silk/tests/test_unit_LPC_inv_pred_gain.c
index 67067cea..7ca902ad 100644
--- a/silk/tests/test_unit_LPC_inv_pred_gain.c
+++ b/silk/tests/test_unit_LPC_inv_pred_gain.c
@@ -43,6 +43,7 @@ int check_stability(opus_int16 *A_Q12, int order) {
int i;
int j;
int sum_a, sum_abs_a;
+ double y[SILK_MAX_ORDER_LPC] = {0};
sum_a = sum_abs_a = 0;
for( j = 0; j < order; j++ ) {
sum_a += A_Q12[ j ];
@@ -57,7 +58,6 @@ int check_stability(opus_int16 *A_Q12, int order) {
if( sum_abs_a < 4096 ) {
return 1;
}
- double y[SILK_MAX_ORDER_LPC] = {0};
y[0] = 1;
for( i = 0; i < 10000; i++ ) {
double sum = 0;
diff --git a/silk/x86/NSQ_del_dec_sse4_1.c b/silk/x86/NSQ_del_dec_sse4_1.c
index 2c75ede2..a58a76cd 100644
--- a/silk/x86/NSQ_del_dec_sse4_1.c
+++ b/silk/x86/NSQ_del_dec_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
- Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -46,6 +46,7 @@ typedef struct {
opus_int32 Shape_Q14[ DECISION_DELAY ];
opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];
opus_int32 LF_AR_Q14;
+ opus_int32 Diff_Q14;
opus_int32 Seed;
opus_int32 SeedInit;
opus_int32 RD_Q10;
@@ -56,6 +57,7 @@ typedef struct {
opus_int32 RD_Q10;
opus_int32 xq_Q14;
opus_int32 LF_AR_Q14;
+ opus_int32 Diff_Q14;
opus_int32 sLTP_shp_Q14;
opus_int32 LPC_exc_Q14;
} NSQ_sample_struct;
@@ -66,7 +68,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
- const opus_int32 x_Q3[], /* I Input in Q3 */
+ const opus_int16 x16[], /* I Input */
opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
@@ -112,21 +114,21 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
);
void silk_NSQ_del_dec_sse4_1(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
)
{
opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
@@ -142,8 +144,39 @@ void silk_NSQ_del_dec_sse4_1(
VARDECL( opus_int32, delayedGain_Q10 );
VARDECL( NSQ_del_dec_struct, psDelDec );
NSQ_del_dec_struct *psDD;
+#ifdef OPUS_CHECK_ASM
+ silk_nsq_state NSQ_c;
+ SideInfoIndices psIndices_c;
+ opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
+ const opus_int8 *const pulses_a = pulses;
+#endif
SAVE_STACK;
+#ifdef OPUS_CHECK_ASM
+ ( void )pulses_a;
+ silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
+ silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
+ silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
+ silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
+ silk_NSQ_del_dec_c(
+ psEncC,
+ &NSQ_c,
+ &psIndices_c,
+ x16,
+ pulses_c,
+ PredCoef_Q12,
+ LTPCoef_Q14,
+ AR_Q13,
+ HarmShapeGain_Q14,
+ Tilt_Q14,
+ LF_shp_Q14,
+ Gains_Q16,
+ pitchL,
+ Lambda_Q10,
+ LTP_scale_Q14
+ );
+#endif
+
/* Set unvoiced lag to the previous one, overwrite later for voiced */
lag = NSQ->lagPrev;
@@ -158,6 +191,7 @@ void silk_NSQ_del_dec_sse4_1(
psDD->SeedInit = psDD->Seed;
psDD->RD_Q10 = 0;
psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14;
+ psDD->Diff_Q14 = NSQ->sDiff_shp_Q14;
psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
@@ -185,8 +219,7 @@ void silk_NSQ_del_dec_sse4_1(
LSF_interpolation_flag = 1;
}
- ALLOC( sLTP_Q15,
- psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
+ ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
@@ -198,7 +231,7 @@ void silk_NSQ_del_dec_sse4_1(
for( k = 0; k < psEncC->nb_subfr; k++ ) {
A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
- AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];
+ AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
/* Noise shape parameters */
silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
@@ -257,7 +290,7 @@ void silk_NSQ_del_dec_sse4_1(
}
}
- silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
+ silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
@@ -265,7 +298,7 @@ void silk_NSQ_del_dec_sse4_1(
Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
- x_Q3 += psEncC->subfr_length;
+ x16 += psEncC->subfr_length;
pulses += psEncC->subfr_length;
pxq += psEncC->subfr_length;
}
@@ -288,6 +321,7 @@ void silk_NSQ_del_dec_sse4_1(
for( i = 0; i < decisionDelay; i++ ) {
last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
+
pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
@@ -298,11 +332,19 @@ void silk_NSQ_del_dec_sse4_1(
/* Update states */
NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
+ NSQ->sDiff_shp_Q14 = psDD->Diff_Q14;
NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
/* Save quantized speech signal */
silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
+
+#ifdef OPUS_CHECK_ASM
+ silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
+ silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
+ silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
+#endif
+
RESTORE_STACK;
}
@@ -345,6 +387,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
+ int rdo_offset;
+
VARDECL( NSQ_sample_pair, psSampleState );
NSQ_del_dec_struct *psDD;
NSQ_sample_struct *psSS;
@@ -356,6 +400,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
celt_assert( nStatesDelayedDecision > 0 );
ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
+ rdo_offset = (Lambda_Q10 >> 1) - 512;
+
shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
@@ -407,8 +453,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
/* Long-term shaping */
if( lag > 0 ) {
/* Symmetric, packed FIR coefficients */
- n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
- n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
+ n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+ n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */
shp_lag_ptr++;
} else {
@@ -478,7 +524,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
- /* setp 4 */
+ /* step 4 */
psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
@@ -511,9 +557,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
/* Noise shape feedback */
- silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
+ celt_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
/* Output of lowpass section */
- tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
+ tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 );
/* Output of allpass section */
tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
psDD->sAR2_Q14[ 0 ] = tmp2;
@@ -543,9 +589,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
/* Input minus prediction plus noise feedback */
/* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */
- tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 ); /* Q14 */
+ tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 ); /* Q14 */
tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */
- tmp1 = silk_SUB32( tmp2, tmp1 ); /* Q13 */
+ tmp1 = silk_SUB_SAT32( tmp2, tmp1 ); /* Q13 */
tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */
r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */
@@ -559,6 +605,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
/* Find two quantization level candidates and measure their rate-distortion */
q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+ if (Lambda_Q10 > 2048) {
+ /* For aggressive RDO, the bias becomes more than one pulse. */
+ if (q1_Q10 > rdo_offset) {
+ q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
+ } else if (q1_Q10 < -rdo_offset) {
+ q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
+ } else if (q1_Q10 < 0) {
+ q1_Q0 = -1;
+ } else {
+ q1_Q0 = 0;
+ }
+ }
if( q1_Q0 > 0 ) {
q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 );
@@ -612,8 +670,9 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
/* Update states */
- sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );
- psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+ psSS[ 0 ].Diff_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
+ sLF_AR_shp_Q14 = silk_SUB32( psSS[ 0 ].Diff_Q14, n_AR_Q14 );
+ psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14;
psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14;
psSS[ 0 ].xq_Q14 = xq_Q14;
@@ -626,14 +685,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
exc_Q14 = -exc_Q14;
}
-
/* Add predictions */
LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
/* Update states */
- sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );
- psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+ psSS[ 1 ].Diff_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
+ sLF_AR_shp_Q14 = silk_SUB32( psSS[ 1 ].Diff_Q14, n_AR_Q14 );
+ psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14;
psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14;
psSS[ 1 ].xq_Q14 = xq_Q14;
@@ -705,6 +764,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
psDD = &psDelDec[ k ];
psSS = &psSampleState[ k ][ 0 ];
psDD->LF_AR_Q14 = psSS->LF_AR_Q14;
+ psDD->Diff_Q14 = psSS->Diff_Q14;
psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14;
psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10;
@@ -728,7 +788,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
- const opus_int32 x_Q3[], /* I Input in Q3 */
+ const opus_int16 x16[], /* I Input */
opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
@@ -742,51 +802,41 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
)
{
opus_int i, k, lag;
- opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
+ opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
NSQ_del_dec_struct *psDD;
- __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
+ __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
lag = pitchL[ subfr ];
inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
-
silk_assert( inv_gain_Q31 != 0 );
- /* Calculate gain adjustment factor */
- if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
- gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
- } else {
- gain_adj_Q16 = (opus_int32)1 << 16;
- }
-
/* Scale input */
- inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
+ inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
- /* prepare inv_gain_Q23 in packed 4 32-bits */
- xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
+ /* prepare inv_gain_Q26 in packed 4 32-bits */
+ xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
- xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
+ xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
+
/* equal shift right 4 bytes*/
- xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+ xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
- xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
- xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
+ xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
+ xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
- xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
- xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
+ xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
+ xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
- xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
+ xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
- _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );
+ _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
}
for( ; i < psEncC->subfr_length; i++ ) {
- x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
+ x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
}
- /* Save inverse gain */
- NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
-
/* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
if( NSQ->rewhite_flag ) {
if( subfr == 0 ) {
@@ -800,7 +850,9 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
}
/* Adjust for changing gain */
- if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
+ if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
+ gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
+
/* Scale long-term shaping state */
{
__m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
@@ -841,6 +893,7 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
/* Scale scalar states */
psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
+ psDD->Diff_Q14 = silk_SMULWW( gain_adj_Q16, psDD->Diff_Q14 );
/* Scale short-term prediction and shaping states */
for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
@@ -855,5 +908,8 @@ static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
}
}
}
+
+ /* Save inverse gain */
+ NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
}
}
diff --git a/silk/x86/NSQ_sse4_1.c b/silk/x86/NSQ_sse4_1.c
index b0315e35..d5ae1d3b 100644
--- a/silk/x86/NSQ_sse4_1.c
+++ b/silk/x86/NSQ_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
- Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -37,17 +37,17 @@
#include "stack_alloc.h"
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- const opus_int32 x_Q3[], /* I input in Q3 */
- opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
- const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
- opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
- opus_int subfr, /* I subframe number */
- const opus_int LTP_scale_Q14, /* I */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
- const opus_int signal_type /* I Signal type */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ const opus_int16 x16[], /* I input */
+ opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
+ const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
+ opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
+ opus_int subfr, /* I subframe number */
+ const opus_int LTP_scale_Q14, /* I */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
+ const opus_int signal_type /* I Signal type */
);
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
@@ -65,27 +65,28 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I */
opus_int32 Gain_Q16, /* I */
+ opus_int Lambda_Q10, /* I */
opus_int offset_Q10, /* I */
opus_int length, /* I Input length */
opus_int32 table[][4] /* I */
);
void silk_NSQ_sse4_1(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
)
{
opus_int k, lag, start_idx, LSF_interpolation_flag;
@@ -101,8 +102,41 @@ void silk_NSQ_sse4_1(
opus_int32 tmp1;
opus_int32 q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;
+#ifdef OPUS_CHECK_ASM
+ silk_nsq_state NSQ_c;
+ SideInfoIndices psIndices_c;
+ opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
+ const opus_int8 *const pulses_a = pulses;
+#endif
+
SAVE_STACK;
+#ifdef OPUS_CHECK_ASM
+ ( void )pulses_a;
+ silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
+ silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
+ silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
+ silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
+
+ silk_NSQ_c(
+ psEncC,
+ &NSQ_c,
+ &psIndices_c,
+ x16,
+ pulses_c,
+ PredCoef_Q12,
+ LTPCoef_Q14,
+ AR_Q13,
+ HarmShapeGain_Q14,
+ Tilt_Q14,
+ LF_shp_Q14,
+ Gains_Q16,
+ pitchL,
+ Lambda_Q10,
+ LTP_scale_Q14
+ );
+#endif
+
NSQ->rand_seed = psIndices->Seed;
/* Set unvoiced lag to the previous one, overwrite later for voiced */
@@ -172,8 +206,7 @@ void silk_NSQ_sse4_1(
LSF_interpolation_flag = 1;
}
- ALLOC( sLTP_Q15,
- psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
+ ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
/* Set up pointers to start of sub frame */
@@ -183,7 +216,7 @@ void silk_NSQ_sse4_1(
for( k = 0; k < psEncC->nb_subfr; k++ ) {
A_Q12 = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
- AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];
+ AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
/* Noise shape parameters */
silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
@@ -209,12 +242,12 @@ void silk_NSQ_sse4_1(
}
}
- silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
+ silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
{
silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
- AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
+ AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
offset_Q10, psEncC->subfr_length, &(table[32]) );
}
else
@@ -224,7 +257,7 @@ void silk_NSQ_sse4_1(
offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
}
- x_Q3 += psEncC->subfr_length;
+ x16 += psEncC->subfr_length;
pulses += psEncC->subfr_length;
pxq += psEncC->subfr_length;
}
@@ -235,12 +268,19 @@ void silk_NSQ_sse4_1(
/* Save quantized speech and noise shaping signals */
silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
+
+#ifdef OPUS_CHECK_ASM
+ silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
+ silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
+ silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
+#endif
+
RESTORE_STACK;
}
-/***********************************/
-/* silk_noise_shape_quantizer_10_16 */
-/***********************************/
+/************************************/
+/* silk_noise_shape_quantizer_10_16 */
+/************************************/
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
silk_nsq_state *NSQ, /* I/O NSQ state */
opus_int signalType, /* I Signal type */
@@ -256,6 +296,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
opus_int Tilt_Q14, /* I Spectral tilt */
opus_int32 LF_shp_Q14, /* I */
opus_int32 Gain_Q16, /* I */
+ opus_int Lambda_Q10, /* I */
opus_int offset_Q10, /* I */
opus_int length, /* I Input length */
opus_int32 table[][4] /* I */
@@ -264,7 +305,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
opus_int i;
opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
opus_int32 n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
- opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
+ opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14;
opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
@@ -279,6 +320,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
__m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
__m128i AR_shp_Q13_76543210;
+ int rdo_offset = (Lambda_Q10 >> 1) - 512;
+
shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
@@ -288,6 +331,7 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
xq_Q14 = psLPC_Q14[ 0 ];
+ sDiff_shp_Q14 = NSQ->sDiff_shp_Q14;
LTP_pred_Q13 = 0;
/* load a_Q12 */
@@ -430,8 +474,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
- sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
- sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14), 0 );
+ sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 );
+ sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14), 0 );
/* high part, use pmaddwd, results in 4 32-bit */
xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
@@ -462,14 +506,14 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
- silk_assert( lag > 0 || signalType != TYPE_VOICED );
+ celt_assert( lag > 0 || signalType != TYPE_VOICED );
/* Combine prediction and noise shaping signals */
tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */
tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */
if( lag > 0 ) {
/* Symmetric, packed FIR coefficients */
- n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+ n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
shp_lag_ptr++;
@@ -495,6 +539,18 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
/* Find two quantization level candidates and measure their rate-distortion */
q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+ if (Lambda_Q10 > 2048) {
+ /* For aggressive RDO, the bias becomes more than one pulse. */
+ if (q1_Q10 > rdo_offset) {
+ q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
+ } else if (q1_Q10 < -rdo_offset) {
+ q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
+ } else if (q1_Q10 < 0) {
+ q1_Q0 = -1;
+ } else {
+ q1_Q0 = 0;
+ }
+ }
q1_Q10 = table[q1_Q0][0];
q2_Q10 = table[q1_Q0][1];
@@ -519,7 +575,8 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
/* Update states */
psLPC_Q14++;
*psLPC_Q14 = xq_Q14;
- sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );
+ NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 );
+ sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 );
NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
@@ -600,64 +657,54 @@ static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
}
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- const opus_int32 x_Q3[], /* I input in Q3 */
- opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
- const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
- opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
- opus_int subfr, /* I subframe number */
- const opus_int LTP_scale_Q14, /* I */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
- const opus_int signal_type /* I Signal type */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ const opus_int16 x16[], /* I input */
+ opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
+ const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
+ opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
+ opus_int subfr, /* I subframe number */
+ const opus_int LTP_scale_Q14, /* I */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
+ const opus_int signal_type /* I Signal type */
)
{
opus_int i, lag;
- opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
- __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
+ opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
+ __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
lag = pitchL[ subfr ];
inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
silk_assert( inv_gain_Q31 != 0 );
- /* Calculate gain adjustment factor */
- if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
- gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
- } else {
- gain_adj_Q16 = (opus_int32)1 << 16;
- }
-
/* Scale input */
- inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
+ inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
- /* prepare inv_gain_Q23 in packed 4 32-bits */
- xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
+ /* prepare inv_gain_Q26 in packed 4 32-bits */
+ xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
- xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
+ xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
/* equal shift right 4 bytes*/
- xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+ xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
- xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
- xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
+ xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
+ xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
- xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
- xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
+ xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
+ xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
- xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
+ xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
- _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
+ _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
}
for( ; i < psEncC->subfr_length; i++ ) {
- x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
+ x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
}
- /* Save inverse gain */
- NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
-
/* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
if( NSQ->rewhite_flag ) {
if( subfr == 0 ) {
@@ -671,9 +718,11 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
}
/* Adjust for changing gain */
- if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
- /* Scale long-term shaping state */
+ if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
__m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
+ gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
+
+ /* Scale long-term shaping state */
/* prepare gain_adj_Q16 in packed 4 32-bits */
xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16);
@@ -707,6 +756,7 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
}
NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
+ NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 );
/* Scale short-term prediction and shaping states */
for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
@@ -715,5 +765,8 @@ static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
}
+
+ /* Save inverse gain */
+ NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
}
}
diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h
index 61efa8da..89a5ec88 100644
--- a/silk/x86/SigProc_FIX_sse.h
+++ b/silk/x86/SigProc_FIX_sse.h
@@ -26,13 +26,13 @@
*/
#ifndef SIGPROC_FIX_SSE_H
-#define SIGPROC_FIX_SSE_H
+# define SIGPROC_FIX_SSE_H
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
+# ifdef HAVE_CONFIG_H
+# include "config.h"
+# endif
-#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
void silk_burg_modified_sse4_1(
opus_int32 *res_nrg, /* O Residual energy */
opus_int *res_nrg_Q, /* O Residual energy Q value */
@@ -45,11 +45,13 @@ void silk_burg_modified_sse4_1(
int arch /* I Run-time architecture */
);
-#if defined(OPUS_X86_PRESUME_SSE4_1)
-#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
- ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+# if defined(OPUS_X86_PRESUME_SSE4_1)
+
+# define OVERRIDE_silk_burg_modified
+# define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
+ ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
-#else
+# elif defined(OPUS_HAVE_RTCD)
extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
opus_int32 *res_nrg, /* O Residual energy */
@@ -62,33 +64,36 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
const opus_int D, /* I Order */
int arch /* I Run-time architecture */);
-# define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
- ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+# define OVERRIDE_silk_burg_modified
+# define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
+ ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
-#endif
+# endif
-opus_int64 silk_inner_prod16_aligned_64_sse4_1(
+opus_int64 silk_inner_prod16_sse4_1(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len
);
-#if defined(OPUS_X86_PRESUME_SSE4_1)
+# if defined(OPUS_X86_PRESUME_SSE4_1)
-#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
- ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len))
+# define OVERRIDE_silk_inner_prod16
+# define silk_inner_prod16(inVec1, inVec2, len, arch) \
+ ((void)(arch),silk_inner_prod16_sse4_1(inVec1, inVec2, len))
-#else
+# elif defined(OPUS_HAVE_RTCD)
-extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
+extern opus_int64 (*const SILK_INNER_PROD16_IMPL[OPUS_ARCHMASK + 1])(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len);
-# define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
- ((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
+# define OVERRIDE_silk_inner_prod16
+# define silk_inner_prod16(inVec1, inVec2, len, arch) \
+ ((*SILK_INNER_PROD16_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
-#endif
-#endif
+# endif
+# endif
#endif
diff --git a/silk/x86/VAD_sse4_1.c b/silk/x86/VAD_sse4_1.c
index d02ddf4a..e7eaf971 100644
--- a/silk/x86/VAD_sse4_1.c
+++ b/silk/x86/VAD_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
- Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -63,6 +63,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
SAVE_STACK;
+#ifdef OPUS_CHECK_ASM
+ silk_encoder_state psEncC_c;
+ opus_int ret_c;
+
+ silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
+ ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
+#endif
+
/* Safety checks */
silk_assert( VAD_N_BANDS == 4 );
celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
@@ -233,15 +241,14 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
}
+ if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
+ speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
+ }
/* Power scaling */
if( speech_nrg <= 0 ) {
SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
- } else if( speech_nrg < 32768 ) {
- if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
- speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
- } else {
- speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
- }
+ } else if( speech_nrg < 16384 ) {
+ speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
/* square-root */
speech_nrg = silk_SQRT_APPROX( speech_nrg );
@@ -272,6 +279,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( /* O Return value, 0 if s
psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
}
+#ifdef OPUS_CHECK_ASM
+ silk_assert( ret == ret_c );
+ silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
+#endif
+
RESTORE_STACK;
return( ret );
}
diff --git a/silk/x86/VQ_WMat_EC_sse4_1.c b/silk/x86/VQ_WMat_EC_sse4_1.c
index 74d6c6d0..2c7d18d0 100644
--- a/silk/x86/VQ_WMat_EC_sse4_1.c
+++ b/silk/x86/VQ_WMat_EC_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
- Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -38,105 +38,136 @@
/* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */
void silk_VQ_WMat_EC_sse4_1(
opus_int8 *ind, /* O index of best codebook vector */
- opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
+ opus_int32 *res_nrg_Q15, /* O best residual energy */
+ opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
- const opus_int16 *in_Q14, /* I input vector to be quantized */
- const opus_int32 *W_Q18, /* I weighting matrix */
+ const opus_int32 *XX_Q17, /* I correlation matrix */
+ const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
- const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
+ const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
- opus_int L /* I number of vectors in codebook */
+ const opus_int L /* I number of vectors in codebook */
)
{
opus_int k, gain_tmp_Q7;
const opus_int8 *cb_row_Q7;
- opus_int16 diff_Q14[ 5 ];
- opus_int32 sum1_Q14, sum2_Q16;
+ opus_int32 neg_xX_Q24[ 5 ];
+ opus_int32 sum1_Q15, sum2_Q24;
+ opus_int32 bits_res_Q8, bits_tot_Q8;
+ __m128i v_XX_31_Q17, v_XX_42_Q17, v_cb_row_31_Q7, v_cb_row_42_Q7, v_acc1_Q24, v_acc2_Q24;
+
+ /* Negate and convert to new Q domain */
+ neg_xX_Q24[ 0 ] = -silk_LSHIFT32( xX_Q17[ 0 ], 7 );
+ neg_xX_Q24[ 1 ] = -silk_LSHIFT32( xX_Q17[ 1 ], 7 );
+ neg_xX_Q24[ 2 ] = -silk_LSHIFT32( xX_Q17[ 2 ], 7 );
+ neg_xX_Q24[ 3 ] = -silk_LSHIFT32( xX_Q17[ 3 ], 7 );
+ neg_xX_Q24[ 4 ] = -silk_LSHIFT32( xX_Q17[ 4 ], 7 );
+
+ v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(&XX_Q17[ 1 ] ) );
+ v_XX_42_Q17 = _mm_shuffle_epi32( v_XX_31_Q17, _MM_SHUFFLE( 0, 3, 2, 1 ) );
- __m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5;
/* Loop over codebook */
- *rate_dist_Q14 = silk_int32_MAX;
+ *rate_dist_Q8 = silk_int32_MAX;
+ *res_nrg_Q15 = silk_int32_MAX;
cb_row_Q7 = cb_Q7;
+ /* If things go really bad, at least *ind is set to something safe. */
+ *ind = 0;
for( k = 0; k < L; k++ ) {
+ opus_int32 penalty;
gain_tmp_Q7 = cb_gain_Q7[k];
-
- diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );
-
- C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );
- C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
- C_tmp2 = _mm_slli_epi32( C_tmp2, 7 );
- C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 );
-
- diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 );
- diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 );
- diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 );
- diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 );
-
/* Weighted rate */
- sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );
+ /* Quantization error: 1 - 2 * xX * cb + cb' * XX * cb */
+ sum1_Q15 = SILK_FIX_CONST( 1.001, 15 );
/* Penalty for too large gain */
- sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );
-
- silk_assert( sum1_Q14 >= 0 );
-
- /* first row of W_Q18 */
- C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) );
- C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 );
- C_tmp4 = _mm_srli_si128( C_tmp4, 2 );
-
- C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
- C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
-
- C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 );
- C_tmp5 = _mm_srli_si128( C_tmp5, 2 );
-
- C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 );
- C_tmp5 = _mm_slli_epi32( C_tmp5, 1 );
-
- C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
- sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 );
-
- sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 0 ], diff_Q14[ 0 ] );
- sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 0 ] );
-
- /* second row of W_Q18 */
- sum2_Q16 = silk_SMULWB( W_Q18[ 7 ], diff_Q14[ 2 ] );
- sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 8 ], diff_Q14[ 3 ] );
- sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 9 ], diff_Q14[ 4 ] );
- sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
- sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 6 ], diff_Q14[ 1 ] );
- sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 1 ] );
-
- /* third row of W_Q18 */
- sum2_Q16 = silk_SMULWB( W_Q18[ 13 ], diff_Q14[ 3 ] );
- sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] );
- sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
- sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] );
- sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 2 ] );
-
- /* fourth row of W_Q18 */
- sum2_Q16 = silk_SMULWB( W_Q18[ 19 ], diff_Q14[ 4 ] );
- sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
- sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] );
- sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 3 ] );
-
- /* last row of W_Q18 */
- sum2_Q16 = silk_SMULWB( W_Q18[ 24 ], diff_Q14[ 4 ] );
- sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 4 ] );
-
- silk_assert( sum1_Q14 >= 0 );
+ penalty = silk_LSHIFT32( silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 11 );
+
+ /* first row of XX_Q17 */
+ v_cb_row_31_Q7 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
+ v_cb_row_42_Q7 = _mm_shuffle_epi32( v_cb_row_31_Q7, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+ v_cb_row_31_Q7 = _mm_mul_epi32( v_XX_31_Q17, v_cb_row_31_Q7 );
+ v_cb_row_42_Q7 = _mm_mul_epi32( v_XX_42_Q17, v_cb_row_42_Q7 );
+ v_acc1_Q24 = _mm_add_epi64( v_cb_row_31_Q7, v_cb_row_42_Q7);
+ v_acc2_Q24 = _mm_shuffle_epi32( v_acc1_Q24, _MM_SHUFFLE( 1, 0, 3, 2 ) );
+ v_acc1_Q24 = _mm_add_epi64( v_acc1_Q24, v_acc2_Q24);
+ sum2_Q24 = _mm_cvtsi128_si32( v_acc1_Q24 );
+ sum2_Q24 = silk_ADD32( neg_xX_Q24[ 0 ], sum2_Q24 );
+ sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 0 ], cb_row_Q7[ 0 ] );
+ sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 0 ] );
+
+ /* second row of XX_Q17 */
+ sum2_Q24 = silk_MLA( neg_xX_Q24[ 1 ], XX_Q17[ 7 ], cb_row_Q7[ 2 ] );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 8 ], cb_row_Q7[ 3 ] );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 9 ], cb_row_Q7[ 4 ] );
+ sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 6 ], cb_row_Q7[ 1 ] );
+ sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 1 ] );
+
+ /* third row of XX_Q17 */
+ sum2_Q24 = silk_MLA( neg_xX_Q24[ 2 ], XX_Q17[ 13 ], cb_row_Q7[ 3 ] );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 14 ], cb_row_Q7[ 4 ] );
+ sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 12 ], cb_row_Q7[ 2 ] );
+ sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 2 ] );
+
+ /* fourth row of XX_Q17 */
+ sum2_Q24 = silk_MLA( neg_xX_Q24[ 3 ], XX_Q17[ 19 ], cb_row_Q7[ 4 ] );
+ sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 18 ], cb_row_Q7[ 3 ] );
+ sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 3 ] );
+
+ /* last row of XX_Q17 */
+ sum2_Q24 = silk_LSHIFT32( neg_xX_Q24[ 4 ], 1 );
+ sum2_Q24 = silk_MLA( sum2_Q24, XX_Q17[ 24 ], cb_row_Q7[ 4 ] );
+ sum1_Q15 = silk_SMLAWB( sum1_Q15, sum2_Q24, cb_row_Q7[ 4 ] );
/* find best */
- if( sum1_Q14 < *rate_dist_Q14 ) {
- *rate_dist_Q14 = sum1_Q14;
- *ind = (opus_int8)k;
- *gain_Q7 = gain_tmp_Q7;
+ if( sum1_Q15 >= 0 ) {
+ /* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
+ bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
+ /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
+ bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
+ if( bits_tot_Q8 <= *rate_dist_Q8 ) {
+ *rate_dist_Q8 = bits_tot_Q8;
+ *res_nrg_Q15 = sum1_Q15 + penalty;
+ *ind = (opus_int8)k;
+ *gain_Q7 = gain_tmp_Q7;
+ }
}
/* Go to next cbk vector */
cb_row_Q7 += LTP_ORDER;
}
+
+#ifdef OPUS_CHECK_ASM
+ {
+ opus_int8 ind_c = 0;
+ opus_int32 res_nrg_Q15_c = 0;
+ opus_int32 rate_dist_Q8_c = 0;
+ opus_int gain_Q7_c = 0;
+
+ silk_VQ_WMat_EC_c(
+ &ind_c,
+ &res_nrg_Q15_c,
+ &rate_dist_Q8_c,
+ &gain_Q7_c,
+ XX_Q17,
+ xX_Q17,
+ cb_Q7,
+ cb_gain_Q7,
+ cl_Q5,
+ subfr_len,
+ max_gain_Q7,
+ L
+ );
+
+ silk_assert( *ind == ind_c );
+ silk_assert( *res_nrg_Q15 == res_nrg_Q15_c );
+ silk_assert( *rate_dist_Q8 == rate_dist_Q8_c );
+ silk_assert( *gain_Q7 == gain_Q7_c );
+ }
+#endif
}
diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h
index 2f15d448..a01d7f6c 100644
--- a/silk/x86/main_sse.h
+++ b/silk/x86/main_sse.h
@@ -26,171 +26,169 @@
*/
#ifndef MAIN_SSE_H
-#define MAIN_SSE_H
+# define MAIN_SSE_H
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
+# ifdef HAVE_CONFIG_H
+# include "config.h"
+# endif
# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
-#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
-# define OVERRIDE_silk_VQ_WMat_EC
-
void silk_VQ_WMat_EC_sse4_1(
opus_int8 *ind, /* O index of best codebook vector */
- opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
+ opus_int32 *res_nrg_Q15, /* O best residual energy */
+ opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
- const opus_int16 *in_Q14, /* I input vector to be quantized */
- const opus_int32 *W_Q18, /* I weighting matrix */
+ const opus_int32 *XX_Q17, /* I correlation matrix */
+ const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
- const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
+ const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
- opus_int L /* I number of vectors in codebook */
+ const opus_int L /* I number of vectors in codebook */
);
-#if defined OPUS_X86_PRESUME_SSE4_1
+# if defined OPUS_X86_PRESUME_SSE4_1
-#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
- mu_Q9, max_gain_Q7, L, arch) \
- ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
- mu_Q9, max_gain_Q7, L))
+# define OVERRIDE_silk_VQ_WMat_EC
+# define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+ subfr_len, max_gain_Q7, L, arch) \
+ ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+ subfr_len, max_gain_Q7, L))
-#else
+# elif defined(OPUS_HAVE_RTCD)
extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
opus_int8 *ind, /* O index of best codebook vector */
- opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
+ opus_int32 *res_nrg_Q15, /* O best residual energy */
+ opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
- const opus_int16 *in_Q14, /* I input vector to be quantized */
- const opus_int32 *W_Q18, /* I weighting matrix */
+ const opus_int32 *XX_Q17, /* I correlation matrix */
+ const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
- const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
+ const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
- opus_int L /* I number of vectors in codebook */
+ const opus_int L /* I number of vectors in codebook */
);
-# define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
- mu_Q9, max_gain_Q7, L, arch) \
- ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
- mu_Q9, max_gain_Q7, L))
+# define OVERRIDE_silk_VQ_WMat_EC
+# define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+ subfr_len, max_gain_Q7, L, arch) \
+ ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+ subfr_len, max_gain_Q7, L))
-#endif
-#endif
-
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
-# define OVERRIDE_silk_NSQ
+# endif
void silk_NSQ_sse4_1(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
-#if defined OPUS_X86_PRESUME_SSE4_1
+# if defined OPUS_X86_PRESUME_SSE4_1
-#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
- HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+# define OVERRIDE_silk_NSQ
+# define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
-#else
+# elif defined(OPUS_HAVE_RTCD)
extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
-# define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
- HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+# define OVERRIDE_silk_NSQ
+# define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
-#endif
-
-# define OVERRIDE_silk_NSQ_del_dec
+# endif
void silk_NSQ_del_dec_sse4_1(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
-#if defined OPUS_X86_PRESUME_SSE4_1
+# if defined OPUS_X86_PRESUME_SSE4_1
-#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
- HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
- ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+# define OVERRIDE_silk_NSQ_del_dec
+# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+ ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
-#else
+# elif defined(OPUS_HAVE_RTCD)
extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
-# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
- HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
- ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+# define OVERRIDE_silk_NSQ_del_dec
+# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+ ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
-#endif
-#endif
+# endif
void silk_noise_shape_quantizer(
silk_nsq_state *NSQ, /* I/O NSQ state */
@@ -223,26 +221,27 @@ void silk_VAD_GetNoiseLevels(
silk_VAD_state *psSilk_VAD /* I/O Pointer to Silk VAD state */
);
-# define OVERRIDE_silk_VAD_GetSA_Q8
-
opus_int silk_VAD_GetSA_Q8_sse4_1(
silk_encoder_state *psEnC,
const opus_int16 pIn[]
);
-#if defined(OPUS_X86_PRESUME_SSE4_1)
-#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))
+# if defined(OPUS_X86_PRESUME_SSE4_1)
-#else
+# define OVERRIDE_silk_VAD_GetSA_Q8
+# define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))
-# define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
- ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
+# elif defined(OPUS_HAVE_RTCD)
extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])(
silk_encoder_state *psEnC,
const opus_int16 pIn[]);
-#endif
+# define OVERRIDE_silk_VAD_GetSA_Q8
+# define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
+ ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
+
+# endif
# endif
#endif
diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c
index 32dcc3ca..70f60078 100644
--- a/silk/x86/x86_silk_map.c
+++ b/silk/x86/x86_silk_map.c
@@ -35,22 +35,22 @@
#include "pitch.h"
#include "main.h"
-#if !defined(OPUS_X86_PRESUME_SSE4_1)
+#if defined(OPUS_HAVE_RTCD) && !defined(OPUS_X86_PRESUME_SSE4_1)
#if defined(FIXED_POINT)
#include "fixed/main_FIX.h"
-opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
+opus_int64 (*const SILK_INNER_PROD16_IMPL[ OPUS_ARCHMASK + 1 ] )(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len
) = {
- silk_inner_prod16_aligned_64_c, /* non-sse */
- silk_inner_prod16_aligned_64_c,
- silk_inner_prod16_aligned_64_c,
- MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
- MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ) /* avx */
+ silk_inner_prod16_c, /* non-sse */
+ silk_inner_prod16_c,
+ silk_inner_prod16_c,
+ MAY_HAVE_SSE4_1( silk_inner_prod16 ), /* sse4.1 */
+ MAY_HAVE_SSE4_1( silk_inner_prod16 ) /* avx */
};
#endif
@@ -66,23 +66,22 @@ opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ) /* avx */
};
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
) = {
silk_NSQ_c, /* non-sse */
silk_NSQ_c,
@@ -90,21 +89,20 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_NSQ ) /* avx */
};
-#endif
-#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
opus_int8 *ind, /* O index of best codebook vector */
- opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
+ opus_int32 *res_nrg_Q15, /* O best residual energy */
+ opus_int32 *rate_dist_Q8, /* O best total bitrate */
opus_int *gain_Q7, /* O sum of absolute LTP coefficients */
- const opus_int16 *in_Q14, /* I input vector to be quantized */
- const opus_int32 *W_Q18, /* I weighting matrix */
+ const opus_int32 *XX_Q17, /* I correlation matrix */
+ const opus_int32 *xX_Q17, /* I correlation vector */
const opus_int8 *cb_Q7, /* I codebook */
const opus_uint8 *cb_gain_Q7, /* I codebook effective gain */
const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
- const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
+ const opus_int subfr_len, /* I number of samples per subframe */
const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
- opus_int L /* I number of vectors in codebook */
+ const opus_int L /* I number of vectors in codebook */
) = {
silk_VQ_WMat_EC_c, /* non-sse */
silk_VQ_WMat_EC_c,
@@ -112,25 +110,23 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ) /* avx */
};
-#endif
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
- const silk_encoder_state *psEncC, /* I Encoder State */
- silk_nsq_state *NSQ, /* I/O NSQ state */
- SideInfoIndices *psIndices, /* I/O Quantization Indices */
- const opus_int32 x_Q3[], /* I Prefiltered input signal */
- opus_int8 pulses[], /* O Quantized pulse signal */
- const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
- const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
- const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
- const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
- const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
- const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
- const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
- const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
- const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
- const opus_int LTP_scale_Q14 /* I LTP state scaling */
+ const silk_encoder_state *psEncC, /* I Encoder State */
+ silk_nsq_state *NSQ, /* I/O NSQ state */
+ SideInfoIndices *psIndices, /* I/O Quantization Indices */
+ const opus_int16 x16[], /* I Input */
+ opus_int8 pulses[], /* O Quantized pulse signal */
+ const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
+ const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
+ const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
+ const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
+ const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
+ const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
+ const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
+ const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
+ const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
+ const opus_int LTP_scale_Q14 /* I LTP state scaling */
) = {
silk_NSQ_del_dec_c, /* non-sse */
silk_NSQ_del_dec_c,
@@ -138,7 +134,6 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ) /* avx */
};
-#endif
#if defined(FIXED_POINT)
diff --git a/silk_sources.mk b/silk_sources.mk
index d2666e66..3df24816 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -77,15 +77,19 @@ silk/stereo_find_predictor.c \
silk/stereo_quant_pred.c \
silk/LPC_fit.c
-SILK_SOURCES_SSE4_1 = \
+SILK_SOURCES_X86_RTCD = \
+silk/x86/x86_silk_map.c
+
+SILK_SOURCES_SSE4_1 = \
silk/x86/NSQ_sse4_1.c \
silk/x86/NSQ_del_dec_sse4_1.c \
-silk/x86/x86_silk_map.c \
silk/x86/VAD_sse4_1.c \
silk/x86/VQ_WMat_EC_sse4_1.c
+SILK_SOURCES_ARM_RTCD = \
+silk/arm/arm_silk_map.c
+
SILK_SOURCES_ARM_NEON_INTR = \
-silk/arm/arm_silk_map.c \
silk/arm/biquad_alt_neon_intr.c \
silk/arm/LPC_inv_pred_gain_neon_intr.c \
silk/arm/NSQ_del_dec_neon_intr.c \
diff --git a/src/opus_decoder.c b/src/opus_decoder.c
index 9113638a..6520e748 100644
--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@@ -278,7 +278,8 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
ec_dec_init(&dec,(unsigned char*)data,len);
} else {
audiosize = frame_size;
- mode = st->prev_mode;
+ /* Run PLC using last used mode (CELT if we ended with CELT redundancy) */
+ mode = st->prev_redundancy ? MODE_CELT_ONLY : st->prev_mode;
bandwidth = 0;
if (mode == 0)
@@ -419,7 +420,7 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
start_band = 0;
if (!decode_fec && mode != MODE_CELT_ONLY && data != NULL
- && ec_tell(&dec)+17+20*(st->mode == MODE_HYBRID) <= 8*len)
+ && ec_tell(&dec)+17+20*(mode == MODE_HYBRID) <= 8*len)
{
/* Check if we have a redundant 0-8 kHz band */
if (mode == MODE_HYBRID)
@@ -499,6 +500,11 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
/* 5 ms redundant frame for CELT->SILK*/
if (redundancy && celt_to_silk)
{
+ /* If the previous frame did not use CELT (the first redundancy frame in
+ a transition from SILK may have been lost) then the CELT decoder is
+ stale at this point and the redundancy audio is not useful, however
+ the final range is still needed (for testing), so the redundancy is
+ always decoded but the decoded audio may not be used */
MUST_SUCCEED(celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0)));
celt_decode_with_ec(celt_dec, data+len, redundancy_bytes,
redundant_audio, F5, NULL, 0);
@@ -561,7 +567,10 @@ static int opus_decode_frame(OpusDecoder *st, const unsigned char *data,
smooth_fade(pcm+st->channels*(frame_size-F2_5), redundant_audio+st->channels*F2_5,
pcm+st->channels*(frame_size-F2_5), F2_5, st->channels, window, st->Fs);
}
- if (redundancy && celt_to_silk)
+ /* 5ms redundant frame for CELT->SILK; ignore if the previous frame did not
+ use CELT (the first redundancy frame in a transition from SILK may have
+ been lost) */
+ if (redundancy && celt_to_silk && (st->prev_mode != MODE_SILK_ONLY || st->prev_redundancy))
{
for (c=0;c<st->channels;c++)
{
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 7b5f0abf..8c8db5a5 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -87,6 +87,7 @@ struct OpusEncoder {
int lfe;
int arch;
int use_dtx; /* general DTX for both SILK and CELT */
+ int fec_config;
#ifndef DISABLE_FLOAT_API
TonalityAnalysisState analysis;
#endif
@@ -112,7 +113,7 @@ struct OpusEncoder {
opus_val16 delay_buffer[MAX_ENCODER_BUFFER*2];
#ifndef DISABLE_FLOAT_API
int detected_bandwidth;
- int nb_no_activity_frames;
+ int nb_no_activity_ms_Q1;
opus_val32 peak_signal_energy;
#endif
int nonfinal_frame; /* current frame is not the final in a packet */
@@ -893,24 +894,28 @@ static opus_val32 compute_frame_energy(const opus_val16 *pcm, int frame_size, in
/* Decides if DTX should be turned on (=1) or off (=0) */
static int decide_dtx_mode(opus_int activity, /* indicates if this frame contains speech/music */
- int *nb_no_activity_frames /* number of consecutive frames with no activity */
+ int *nb_no_activity_ms_Q1, /* number of consecutive milliseconds with no activity, in Q1 */
+ int frame_size_ms_Q1 /* number of milliseconds in this update, in Q1 */
)
{
if (!activity)
{
- /* The number of consecutive DTX frames should be within the allowed bounds */
- (*nb_no_activity_frames)++;
- if (*nb_no_activity_frames > NB_SPEECH_FRAMES_BEFORE_DTX)
+ /* The number of consecutive DTX frames should be within the allowed bounds.
+ Note that the allowed bound is defined in the SILK headers and assumes 20 ms
+ frames. As this function can be called with any frame length, a conversion to
+ milliseconds is done before the comparisons. */
+ (*nb_no_activity_ms_Q1) += frame_size_ms_Q1;
+ if (*nb_no_activity_ms_Q1 > NB_SPEECH_FRAMES_BEFORE_DTX*20*2)
{
- if (*nb_no_activity_frames <= (NB_SPEECH_FRAMES_BEFORE_DTX + MAX_CONSECUTIVE_DTX))
+ if (*nb_no_activity_ms_Q1 <= (NB_SPEECH_FRAMES_BEFORE_DTX + MAX_CONSECUTIVE_DTX)*20*2)
/* Valid frame for DTX! */
return 1;
else
- (*nb_no_activity_frames) = NB_SPEECH_FRAMES_BEFORE_DTX;
+ (*nb_no_activity_ms_Q1) = NB_SPEECH_FRAMES_BEFORE_DTX*20*2;
}
} else
- (*nb_no_activity_frames) = 0;
+ (*nb_no_activity_ms_Q1) = 0;
return 0;
}
@@ -1310,6 +1315,8 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
st->stream_channels = st->force_channels;
} else {
#ifdef FUZZING
+ (void)stereo_music_threshold;
+ (void)stereo_voice_threshold;
/* Random mono/stereo decision */
if (st->channels == 2 && (rand()&0x1F)==0)
st->stream_channels = 3-st->stream_channels;
@@ -1348,6 +1355,8 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
} else if (st->user_forced_mode == OPUS_AUTO)
{
#ifdef FUZZING
+ (void)stereo_width;
+ (void)mode_thresholds;
/* Random mode switching */
if ((rand()&0xF)==0)
{
@@ -1385,8 +1394,9 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
st->mode = (equiv_rate >= threshold) ? MODE_CELT_ONLY: MODE_SILK_ONLY;
- /* When FEC is enabled and there's enough packet loss, use SILK */
- if (st->silk_mode.useInBandFEC && st->silk_mode.packetLossPercentage > (128-voice_est)>>4)
+ /* When FEC is enabled and there's enough packet loss, use SILK.
+ Unless the FEC is set to 2, in which case we don't switch to SILK if we're confident we have music. */
+ if (st->silk_mode.useInBandFEC && st->silk_mode.packetLossPercentage > (128-voice_est)>>4 && (st->fec_config != 2 || voice_est > 25))
st->mode = MODE_SILK_ONLY;
/* When encoding voice and DTX is enabled but the generalized DTX cannot be used,
use SILK in order to make use of its DTX. */
@@ -2132,7 +2142,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
#ifndef DISABLE_FLOAT_API
if (st->use_dtx && (analysis_info.valid || is_silence))
{
- if (decide_dtx_mode(activity, &st->nb_no_activity_frames))
+ if (decide_dtx_mode(activity, &st->nb_no_activity_ms_Q1, 2*1000*frame_size/st->Fs))
{
st->rangeFinal = 0;
data[0] = gen_toc(st->mode, st->Fs/frame_size, curr_bandwidth, st->stream_channels);
@@ -2140,7 +2150,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
return 1;
}
} else {
- st->nb_no_activity_frames = 0;
+ st->nb_no_activity_ms_Q1 = 0;
}
#endif
@@ -2435,11 +2445,12 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
case OPUS_SET_INBAND_FEC_REQUEST:
{
opus_int32 value = va_arg(ap, opus_int32);
- if(value<0 || value>1)
+ if(value<0 || value>2)
{
goto bad_arg;
}
- st->silk_mode.useInBandFEC = value;
+ st->fec_config = value;
+ st->silk_mode.useInBandFEC = (value != 0);
}
break;
case OPUS_GET_INBAND_FEC_REQUEST:
@@ -2449,7 +2460,7 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
{
goto bad_arg;
}
- *value = st->silk_mode.useInBandFEC;
+ *value = st->fec_config;
}
break;
case OPUS_SET_PACKET_LOSS_PERC_REQUEST:
@@ -2733,7 +2744,7 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...)
#ifndef DISABLE_FLOAT_API
else if (st->use_dtx) {
/* DTX determined by Opus. */
- *value = st->nb_no_activity_frames >= NB_SPEECH_FRAMES_BEFORE_DTX;
+ *value = st->nb_no_activity_ms_Q1 >= NB_SPEECH_FRAMES_BEFORE_DTX*20*2;
}
#endif
else {
diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c
index 93204a14..213e3eb2 100644
--- a/src/opus_multistream_encoder.c
+++ b/src/opus_multistream_encoder.c
@@ -443,7 +443,8 @@ static int opus_multistream_encoder_init_impl(
char *ptr;
if ((channels>255) || (channels<1) || (coupled_streams>streams) ||
- (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams))
+ (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams) ||
+ (streams+coupled_streams>channels))
return OPUS_BAD_ARG;
st->arch = opus_select_arch();
@@ -459,8 +460,7 @@ static int opus_multistream_encoder_init_impl(
st->layout.mapping[i] = mapping[i];
if (!validate_layout(&st->layout))
return OPUS_BAD_ARG;
- if (mapping_type == MAPPING_TYPE_SURROUND &&
- !validate_encoder_layout(&st->layout))
+ if (!validate_encoder_layout(&st->layout))
return OPUS_BAD_ARG;
if (mapping_type == MAPPING_TYPE_AMBISONICS &&
!validate_ambisonics(st->layout.nb_channels, NULL, NULL))
@@ -595,7 +595,8 @@ OpusMSEncoder *opus_multistream_encoder_create(
int ret;
OpusMSEncoder *st;
if ((channels>255) || (channels<1) || (coupled_streams>streams) ||
- (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams))
+ (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams) ||
+ (streams+coupled_streams>channels))
{
if (error)
*error = OPUS_BAD_ARG;
diff --git a/tests/opus_build_test.sh b/tests/opus_build_test.sh
new file mode 100755
index 00000000..573f4473
--- /dev/null
+++ b/tests/opus_build_test.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+tarball=`realpath "$1"`
+nb_tests="$2"
+oldvectors=`realpath "$3"`
+newvectors=`realpath "$4"`
+base=`basename "$tarball" .tar.gz`
+
+tar xvf "$tarball" > /dev/null 2>&1
+cd "$base"
+
+if [ $? -ne 0 ]
+then
+ echo cannot go to "$base"
+ exit 1
+fi
+
+mkdir build_tests
+
+configure_dir=`pwd`
+seq -w "$nb_tests" | parallel --halt now,fail=10 -j +2 -q ../random_config.sh "build_tests/run_{}" "$configure_dir" "$oldvectors" "$newvectors"
+
+if [ $? -ne 0 ]
+then
+ echo Check found errors
+ exit 1
+else
+ echo No error found
+fi
diff --git a/tests/opus_encode_regressions.c b/tests/opus_encode_regressions.c
index 29234730..4d506eb6 100644
--- a/tests/opus_encode_regressions.c
+++ b/tests/opus_encode_regressions.c
@@ -35,7 +35,6 @@
#include <stdint.h>
#include <math.h>
#include <string.h>
-#include <assert.h>
#include "opus_multistream.h"
#include "opus.h"
#include "test_opus_common.h"
@@ -106,7 +105,7 @@ static int celt_ec_internal_error(void)
1799, 1799, 1799, 1799, -9721
};
err = opus_multistream_encode(enc, pcm, 320, data, 2460);
- assert(err > 0);
+ opus_test_assert(err > 0);
}
opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -144,7 +143,7 @@ static int celt_ec_internal_error(void)
-9510, -9510, -9510, -9510, -9510, -9510, -9510
};
err = opus_multistream_encode(enc, pcm, 160, data, 2460);
- assert(err > 0);
+ opus_test_assert(err > 0);
}
opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -182,7 +181,7 @@ static int celt_ec_internal_error(void)
-9510, -9510, -9510, -9510, -9510, -9510, -9510
};
err = opus_multistream_encode(enc, pcm, 160, data, 2460);
- assert(err > 0);
+ opus_test_assert(err > 0);
}
opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -220,7 +219,7 @@ static int celt_ec_internal_error(void)
-9510, -9510, -9510, -9510, -9510, -9510, -9510
};
err = opus_multistream_encode(enc, pcm, 160, data, 2460);
- assert(err > 0);
+ opus_test_assert(err > 0);
}
opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -256,7 +255,7 @@ static int celt_ec_internal_error(void)
5632
};
err = opus_multistream_encode(enc, pcm, 160, data, 2460);
- assert(err > 0);
+ opus_test_assert(err > 0);
}
opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_VOICE));
opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -281,7 +280,7 @@ static int celt_ec_internal_error(void)
0, 0, -256, 226
};
err = opus_multistream_encode(enc, pcm, 40, data, 2460);
- assert(err > 0);
+ opus_test_assert(err > 0);
/* returns -3 */
}
opus_multistream_encoder_destroy(enc);
@@ -334,7 +333,7 @@ static int mscbr_encode_fail10(void)
0
};
err = opus_multistream_encode(enc, pcm, 20, data, 627300);
- assert(err > 0);
+ opus_test_assert(err > 0);
/* returns -1 */
}
opus_multistream_encoder_destroy(enc);
@@ -384,7 +383,7 @@ static int mscbr_encode_fail(void)
0
};
err = opus_multistream_encode(enc, pcm, 20, data, 472320);
- assert(err > 0);
+ opus_test_assert(err > 0);
/* returns -1 */
}
opus_multistream_encoder_destroy(enc);
@@ -740,7 +739,7 @@ static int surround_analysis_uninit(void)
-20992, 25859, 5372, 12040, 13307, -4355,-30213, -9, -6019
};
err = opus_multistream_encode(enc, pcm, 960, data, 7380);
- assert(err > 0);
+ opus_test_assert(err > 0);
}
opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(1));
@@ -885,7 +884,7 @@ static int surround_analysis_uninit(void)
};
err = opus_multistream_encode(enc, pcm, 1440, data, 7380);
/* reads uninitialized data at src/opus_multistream_encoder.c:293 */
- assert(err > 0);
+ opus_test_assert(err > 0);
}
opus_multistream_encoder_destroy(enc);
return 0;
@@ -935,7 +934,7 @@ static int ec_enc_shrink_assert(void)
opus_encoder_ctl(enc, OPUS_SET_PACKET_LOSS_PERC(6));
opus_encoder_ctl(enc, OPUS_SET_BITRATE(6000));
data_len = opus_encode(enc, pcm1, 960, data, 2000);
- assert(data_len > 0);
+ opus_test_assert(data_len > 0);
opus_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_VOICE));
opus_encoder_ctl(enc, OPUS_SET_PREDICTION_DISABLED(1));
@@ -943,12 +942,12 @@ static int ec_enc_shrink_assert(void)
opus_encoder_ctl(enc, OPUS_SET_INBAND_FEC(1));
opus_encoder_ctl(enc, OPUS_SET_BITRATE(15600));
data_len = opus_encode(enc, pcm2, 2880, data, 122);
- assert(data_len > 0);
+ opus_test_assert(data_len > 0);
opus_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
opus_encoder_ctl(enc, OPUS_SET_BITRATE(27000));
data_len = opus_encode(enc, pcm3, 2880, data, 122); /* assertion failure */
- assert(data_len > 0);
+ opus_test_assert(data_len > 0);
opus_encoder_destroy(enc);
return 0;
@@ -970,7 +969,7 @@ static int ec_enc_shrink_assert2(void)
{
static const short pcm[960] = { 0 };
data_len = opus_encode(enc, pcm, 960, data, 2000);
- assert(data_len > 0);
+ opus_test_assert(data_len > 0);
}
opus_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
{
@@ -980,7 +979,7 @@ static int ec_enc_shrink_assert2(void)
-32768, -32768, 0, 0, -32768, -32768, 0, 0, -32768, -32768
};
data_len = opus_encode(enc, pcm, 480, data, 19);
- assert(data_len > 0);
+ opus_test_assert(data_len > 0);
}
opus_encoder_destroy(enc);
return 0;
@@ -1009,14 +1008,14 @@ static int silk_gain_assert(void)
opus_encoder_ctl(enc, OPUS_SET_MAX_BANDWIDTH(OPUS_BANDWIDTH_NARROWBAND));
opus_encoder_ctl(enc, OPUS_SET_BITRATE(6000));
data_len = opus_encode(enc, pcm1, 160, data, 1000);
- assert(data_len > 0);
+ opus_test_assert(data_len > 0);
opus_encoder_ctl(enc, OPUS_SET_VBR(0));
opus_encoder_ctl(enc, OPUS_SET_COMPLEXITY(0));
opus_encoder_ctl(enc, OPUS_SET_MAX_BANDWIDTH(OPUS_BANDWIDTH_MEDIUMBAND));
opus_encoder_ctl(enc, OPUS_SET_BITRATE(2867));
data_len = opus_encode(enc, pcm2, 960, data, 1000);
- assert(data_len > 0);
+ opus_test_assert(data_len > 0);
opus_encoder_destroy(enc);
return 0;
diff --git a/tests/random_config.sh b/tests/random_config.sh
new file mode 100755
index 00000000..0cdd855f
--- /dev/null
+++ b/tests/random_config.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+
+dir="$1"
+mkdir "$dir"
+if [ $? -ne 0 ]
+then
+ exit 1
+fi
+
+cd "$dir"
+if [ $? -ne 0 ]
+then
+ exit 1
+fi
+
+
+configure_path="$2"
+config="random_config.txt"
+
+case `seq 3 | shuf -n1` in
+1)
+approx=--enable-float-approx
+math=-ffast-math
+;;
+2)
+approx=--enable-float-approx
+;;
+*)
+approx=
+math=
+;;
+esac
+
+CFLAGS='-g'
+
+opt=`echo -e "-O1\n-O2\n-O3" | shuf -n1`
+
+#arch=-march=`echo -e "core2\nsandybridge\nbroadwell\nskylake" | shuf -n1`
+arch=`echo -e "\n-march=core2\n-march=sandybridge\n-march=broadwell\n-march=skylake\n-march=native" | shuf -n1`
+
+footprint=`echo -e "\n-DSMALL_FOOTPRINT" | shuf -n1`
+std=`echo -e "\n-std=c90\n-std=c99\n-std=c11\n-std=c17" | shuf -n1`
+sanitize=`echo -e "\n-fsanitize=address -fno-sanitize-recover=all\n-fsanitize=undefined -fno-sanitize-recover=all -fsanitize-recover=signed-integer-overflow" | shuf -n1`
+
+
+CFLAGS="$CFLAGS $std $opt $arch $footprint $math $sanitize"
+
+echo "CFLAGS=$CFLAGS" > "$config"
+
+lib=`echo -e "\n--disable-static\n--disable-shared" | shuf -n1`
+
+arithmetic=`echo -e "\n--enable-fixed-point\n--enable-fixed-point --enable-fixed-point-debug\n--enable-fixed-point --disable-float-api\n--enable-fixed-point --enable-fixed-point-debug --disable-float-api" | shuf -n1`
+
+custom=`echo -e "\n--enable-custom-modes" | shuf -n1`
+
+asm=`echo -e "\n--disable-asm\n--disable-rtcd\n--disable-intrinsics" | shuf -n1`
+#asm=`echo -e "\n--disable-asm\n--disable-intrinsics" | shuf -n1`
+
+assert=`echo -e "\n--enable-assertions" | shuf -n1`
+harden=`echo -e "\n--enable-hardening" | shuf -n1`
+fuzz=`echo -e "\n--enable-fuzzing" | shuf -n1`
+checkasm=`echo -e "\n--enable-check-asm" | shuf -n1`
+rfc8251=`echo -e "\n--disable-rfc8251" | shuf -n1`
+
+if [ "$rfc8251" = --disable-rfc8251 ]
+then
+ vectors="$3"
+else
+ vectors="$4"
+fi
+echo using testvectors at "$vectors" >> "$config"
+
+
+config_opt="$lib $arithmetic $custom $asm $assert $harden $fuzz $checkasm $rfc8251 $approx"
+
+echo configure $config_opt >> "$config"
+
+export CFLAGS
+"$configure_path/configure" $config_opt > configure_output.txt 2>&1
+
+if [ $? -ne 0 ]
+then
+ echo configure FAIL >> "$config"
+ exit 1
+fi
+
+make > make_output.txt 2>&1
+
+if [ $? -ne 0 ]
+then
+ echo make FAIL >> "$config"
+ exit 1
+fi
+
+#Run valgrind 5% of the time (minus the asan cases)
+if [ "`seq 20 | shuf -n1`" -ne 1 -o "$sanitize" = "-fsanitize=address -fno-sanitize-recover=all" ]
+then
+ make check > makecheck_output.txt 2>&1
+else
+ echo valgrind enabled >> "$config"
+ valgrind --trace-children=yes --error-exitcode=128 make check > makecheck_output.txt 2>&1
+fi
+
+if [ $? -ne 0 ]
+then
+ echo check FAIL >> "$config"
+ exit 1
+fi
+
+
+rate=`echo -e "8000\n12000\n16000\n24000\n48000" | shuf -n1`
+echo testvectors for "$rate" Hz > testvectors_output.txt
+../../../run_vectors.sh . "$vectors" "$rate" >> testvectors_output.txt 2>&1
+
+if [ $? -ne 0 ]
+then
+ echo testvectors FAIL >> "$config"
+ exit 1
+fi
+
+echo all tests PASS >> "$config"
+
+#When everything's good, do some cleaning up to save space
+make distclean > /dev/null 2>&1
+rm -f tmp.out
+gzip make_output.txt
diff --git a/tests/test_opus_api.c b/tests/test_opus_api.c
index fb385c63..0e7ed2cc 100644
--- a/tests/test_opus_api.c
+++ b/tests/test_opus_api.c
@@ -1298,7 +1298,7 @@ opus_int32 test_enc_api(void)
err=opus_encoder_ctl(enc,OPUS_GET_INBAND_FEC(null_int_ptr));
if(err!=OPUS_BAD_ARG)test_failed();
cfgs++;
- CHECK_SETGET(OPUS_SET_INBAND_FEC(i),OPUS_GET_INBAND_FEC(&i),-1,2,
+ CHECK_SETGET(OPUS_SET_INBAND_FEC(i),OPUS_GET_INBAND_FEC(&i),-1,3,
1,0,
" OPUS_SET_INBAND_FEC .......................... OK.\n",
" OPUS_GET_INBAND_FEC .......................... OK.\n")
diff --git a/tests/test_opus_common.h b/tests/test_opus_common.h
index d96c7d84..5fb924f4 100644
--- a/tests/test_opus_common.h
+++ b/tests/test_opus_common.h
@@ -81,5 +81,5 @@ static OPUS_INLINE void _test_failed(const char *file, int line)
abort();
}
#define test_failed() _test_failed(__FILE__, __LINE__);
-
+#define opus_test_assert(cond) do {if (!(cond)) {test_failed();}} while (0) /* aborts via test_failed() when cond is false; do/while(0) keeps it a single statement so it is safe in unbraced if/else */
void regression_test(void);
diff --git a/tests/test_opus_encode.c b/tests/test_opus_encode.c
index 00795a1e..d6e8e2d3 100644
--- a/tests/test_opus_encode.c
+++ b/tests/test_opus_encode.c
@@ -297,6 +297,7 @@ int run_test1(int no_fuzz)
/*FIXME: encoder api tests, fs!=48k, mono, VBR*/
fprintf(stdout," Encode+Decode tests.\n");
+ fflush(stdout);
enc = opus_encoder_create(48000, 2, OPUS_APPLICATION_VOIP, &err);
if(err != OPUS_OK || enc==NULL)test_failed();
@@ -466,6 +467,7 @@ int run_test1(int no_fuzz)
count++;
}while(i<(SSAMPLES-MAX_FRAME_SAMP));
fprintf(stdout," Mode %s FB encode %s, %6d bps OK.\n",mstrings[modes[j]],rc==0?" VBR":rc==1?"CVBR":" CBR",rate);
+ fflush(stdout);
}
}
@@ -543,6 +545,7 @@ int run_test1(int no_fuzz)
count++;
}while(i<(SSAMPLES/12-MAX_FRAME_SAMP));
fprintf(stdout," Mode %s NB dual-mono MS encode %s, %6d bps OK.\n",mstrings[modes[j]],rc==0?" VBR":rc==1?"CVBR":" CBR",rate);
+ fflush(stdout);
}
}
@@ -612,6 +615,7 @@ int run_test1(int no_fuzz)
i+=frame_size;
}while(i<SAMPLES*4);
fprintf(stdout," All framesize pairs switching encode, %d frames OK.\n",count);
+ fflush(stdout);
if(opus_encoder_ctl(enc, OPUS_RESET_STATE)!=OPUS_OK)test_failed();
opus_encoder_destroy(enc);
diff --git a/tests/test_opus_padding.c b/tests/test_opus_padding.c
index c22e8f0d..c9ef7375 100644
--- a/tests/test_opus_padding.c
+++ b/tests/test_opus_padding.c
@@ -39,7 +39,7 @@
#define CHANNELS 2
#define FRAMESIZE 5760
-int test_overflow(void)
+void test_overflow(void)
{
OpusDecoder *decoder;
int result;
@@ -51,7 +51,7 @@ int test_overflow(void)
fprintf(stderr, " Checking for padding overflow... ");
if (!in || !out) {
fprintf(stderr, "FAIL (out of memory)\n");
- return -1;
+ test_failed();
}
in[0] = 0xff;
in[1] = 0x41;
@@ -71,21 +71,18 @@ int test_overflow(void)
}
fprintf(stderr, "OK.\n");
-
- return 1;
}
int main(void)
{
const char *oversion;
- int tests = 0;;
iseed = 0;
oversion = opus_get_version_string();
if (!oversion) test_failed();
fprintf(stderr, "Testing %s padding.\n", oversion);
- tests += test_overflow();
+ test_overflow();
fprintf(stderr, "All padding tests passed.\n");
diff --git a/tests/test_opus_projection.c b/tests/test_opus_projection.c
index 5f0d672c..4e06613e 100644
--- a/tests/test_opus_projection.c
+++ b/tests/test_opus_projection.c
@@ -29,7 +29,6 @@
#include "config.h"
#endif
-#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>