diff options
Diffstat (limited to 'src/common_audio/vad/main/source/vad_core.c')
-rw-r--r-- | src/common_audio/vad/main/source/vad_core.c | 685 |
1 files changed, 685 insertions, 0 deletions
diff --git a/src/common_audio/vad/main/source/vad_core.c b/src/common_audio/vad/main/source/vad_core.c new file mode 100644 index 0000000000..e8829993d5 --- /dev/null +++ b/src/common_audio/vad/main/source/vad_core.c @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +/* + * This file includes the implementation of the core functionality in VAD. + * For function description, see vad_core.h. + */ + +#include "vad_core.h" +#include "vad_const.h" +#include "vad_defines.h" +#include "vad_filterbank.h" +#include "vad_gmm.h" +#include "vad_sp.h" +#include "signal_processing_library.h" + +static const int kInitCheck = 42; + +// Initialize VAD +int WebRtcVad_InitCore(VadInstT *inst, short mode) +{ + int i; + + // Initialization of struct + inst->vad = 1; + inst->frame_counter = 0; + inst->over_hang = 0; + inst->num_of_speech = 0; + + // Initialization of downsampling filter state + inst->downsampling_filter_states[0] = 0; + inst->downsampling_filter_states[1] = 0; + inst->downsampling_filter_states[2] = 0; + inst->downsampling_filter_states[3] = 0; + + // Read initial PDF parameters + for (i = 0; i < NUM_TABLE_VALUES; i++) + { + inst->noise_means[i] = kNoiseDataMeans[i]; + inst->speech_means[i] = kSpeechDataMeans[i]; + inst->noise_stds[i] = kNoiseDataStds[i]; + inst->speech_stds[i] = kSpeechDataStds[i]; + } + + // Index and Minimum value vectors are initialized + for (i = 0; i < 16 * NUM_CHANNELS; i++) + { + inst->low_value_vector[i] = 10000; + inst->index_vector[i] = 0; + } + + for (i = 0; i < 5; i++) + { + inst->upper_state[i] = 0; + inst->lower_state[i] = 0; + } + + for (i = 0; i < 4; i++) + { + inst->hp_filter_state[i] = 0; + } + + // Init mean value memory, for FindMin function + inst->mean_value[0] = 1600; + inst->mean_value[1] = 1600; + inst->mean_value[2] = 1600; + inst->mean_value[3] = 1600; + inst->mean_value[4] = 1600; + inst->mean_value[5] = 1600; + + if (mode == 0) + { + // Quality mode + inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_Q; + inst->individual[1] = INDIVIDUAL_20MS_Q; + inst->individual[2] = INDIVIDUAL_30MS_Q; + + inst->total[0] = TOTAL_10MS_Q; + inst->total[1] = TOTAL_20MS_Q; + inst->total[2] = TOTAL_30MS_Q; + } else if (mode == 1) + { + // Low bitrate mode + inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_LBR; + inst->individual[1] = INDIVIDUAL_20MS_LBR; + inst->individual[2] = INDIVIDUAL_30MS_LBR; + + inst->total[0] = TOTAL_10MS_LBR; + inst->total[1] = TOTAL_20MS_LBR; + inst->total[2] = TOTAL_30MS_LBR; + } else if (mode == 2) + { + // Aggressive mode + inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_AGG; + inst->individual[1] = INDIVIDUAL_20MS_AGG; + inst->individual[2] = INDIVIDUAL_30MS_AGG; + + inst->total[0] = TOTAL_10MS_AGG; + inst->total[1] = TOTAL_20MS_AGG; + inst->total[2] = TOTAL_30MS_AGG; + } else + { + // Very aggressive mode + inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_VAG; + inst->individual[1] = INDIVIDUAL_20MS_VAG; + inst->individual[2] = INDIVIDUAL_30MS_VAG; + + inst->total[0] = TOTAL_10MS_VAG; + inst->total[1] = TOTAL_20MS_VAG; + inst->total[2] = TOTAL_30MS_VAG; + } + + inst->init_flag = kInitCheck; + + return 0; +} + +// Set aggressiveness mode +int WebRtcVad_set_mode_core(VadInstT *inst, short mode) +{ + + if (mode == 0) + { + // Quality mode + inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_Q; + inst->individual[1] = INDIVIDUAL_20MS_Q; + inst->individual[2] = INDIVIDUAL_30MS_Q; + + inst->total[0] = TOTAL_10MS_Q; + inst->total[1] = TOTAL_20MS_Q; + inst->total[2] = TOTAL_30MS_Q; + } else if (mode == 1) + { + // Low bitrate mode + inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_LBR; + inst->individual[1] = INDIVIDUAL_20MS_LBR; + inst->individual[2] = INDIVIDUAL_30MS_LBR; + + inst->total[0] = TOTAL_10MS_LBR; + inst->total[1] = TOTAL_20MS_LBR; + inst->total[2] = TOTAL_30MS_LBR; + } else if (mode == 2) + { + // Aggressive mode + inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_AGG; + inst->individual[1] = INDIVIDUAL_20MS_AGG; + inst->individual[2] = INDIVIDUAL_30MS_AGG; + + inst->total[0] = TOTAL_10MS_AGG; + inst->total[1] = TOTAL_20MS_AGG; + inst->total[2] = TOTAL_30MS_AGG; + } else if (mode == 3) + { + // Very aggressive mode + inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_VAG; + inst->individual[1] = INDIVIDUAL_20MS_VAG; + inst->individual[2] = INDIVIDUAL_30MS_VAG; + + inst->total[0] = TOTAL_10MS_VAG; + inst->total[1] = TOTAL_20MS_VAG; + inst->total[2] = TOTAL_30MS_VAG; + } else + { + return -1; + } + + return 0; +} + +// Calculate VAD decision by first extracting feature values and then calculate +// probability for both speech and background noise. + +WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame, + int frame_length) +{ + WebRtc_Word16 len, vad; + WebRtc_Word16 speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB) + WebRtc_Word16 speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) + + + // Downsample signal 32->16->8 before doing VAD + WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]), + frame_length); + len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1); + + WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len); + len = WEBRTC_SPL_RSHIFT_W16(len, 1); + + // Do VAD on an 8 kHz signal + vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); + + return vad; +} + +WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame, + int frame_length) +{ + WebRtc_Word16 len, vad; + WebRtc_Word16 speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) + + // Wideband: Downsample signal before doing VAD + WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states, + frame_length); + + len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1); + vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); + + return vad; +} + +WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame, + int frame_length) +{ + WebRtc_Word16 feature_vector[NUM_CHANNELS], total_power; + + // Get power in the bands + total_power = WebRtcVad_get_features(inst, speech_frame, frame_length, feature_vector); + + // Make a VAD + inst->vad = WebRtcVad_GmmProbability(inst, feature_vector, total_power, frame_length); + + return inst->vad; +} + +// Calculate probability for both speech and background noise, and perform a +// hypothesis-test. +WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector, + WebRtc_Word16 total_power, int frame_length) +{ + int n, k; + WebRtc_Word16 backval; + WebRtc_Word16 h0, h1; + WebRtc_Word16 ratvec, xval; + WebRtc_Word16 vadflag; + WebRtc_Word16 shifts0, shifts1; + WebRtc_Word16 tmp16, tmp16_1, tmp16_2; + WebRtc_Word16 diff, nr, pos; + WebRtc_Word16 nmk, nmk2, nmk3, smk, smk2, nsk, ssk; + WebRtc_Word16 delt, ndelt; + WebRtc_Word16 maxspe, maxmu; + WebRtc_Word16 deltaN[NUM_TABLE_VALUES], deltaS[NUM_TABLE_VALUES]; + WebRtc_Word16 ngprvec[NUM_TABLE_VALUES], sgprvec[NUM_TABLE_VALUES]; + WebRtc_Word32 h0test, h1test; + WebRtc_Word32 tmp32_1, tmp32_2; + WebRtc_Word32 dotVal; + WebRtc_Word32 nmid, smid; + WebRtc_Word32 probn[NUM_MODELS], probs[NUM_MODELS]; + WebRtc_Word16 *nmean1ptr, *nmean2ptr, *smean1ptr, *smean2ptr, *nstd1ptr, *nstd2ptr, + *sstd1ptr, *sstd2ptr; + WebRtc_Word16 overhead1, overhead2, individualTest, totalTest; + + // Set the thresholds to different values based on frame length + if (frame_length == 80) + { + // 80 input samples + overhead1 = inst->over_hang_max_1[0]; + overhead2 = inst->over_hang_max_2[0]; + individualTest = inst->individual[0]; + totalTest = inst->total[0]; + } else if (frame_length == 160) + { + // 160 input samples + overhead1 = inst->over_hang_max_1[1]; + overhead2 = inst->over_hang_max_2[1]; + individualTest = inst->individual[1]; + totalTest = inst->total[1]; + } else + { + // 240 input samples + overhead1 = inst->over_hang_max_1[2]; + overhead2 = inst->over_hang_max_2[2]; + individualTest = inst->individual[2]; + totalTest = inst->total[2]; + } + + if (total_power > MIN_ENERGY) + { // If signal present at all + + // Set pointers to the gaussian parameters + nmean1ptr = &inst->noise_means[0]; + nmean2ptr = &inst->noise_means[NUM_CHANNELS]; + smean1ptr = &inst->speech_means[0]; + smean2ptr = &inst->speech_means[NUM_CHANNELS]; + nstd1ptr = &inst->noise_stds[0]; + nstd2ptr = &inst->noise_stds[NUM_CHANNELS]; + sstd1ptr = &inst->speech_stds[0]; + sstd2ptr = &inst->speech_stds[NUM_CHANNELS]; + + vadflag = 0; + dotVal = 0; + for (n = 0; n < NUM_CHANNELS; n++) + { // For all channels + + pos = WEBRTC_SPL_LSHIFT_W16(n, 1); + xval = feature_vector[n]; + + // Probability for Noise, Q7 * Q20 = Q27 + tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean1ptr++, *nstd1ptr++, + &deltaN[pos]); + probn[0] = (WebRtc_Word32)(kNoiseDataWeights[n] * tmp32_1); + tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean2ptr++, *nstd2ptr++, + &deltaN[pos + 1]); + probn[1] = (WebRtc_Word32)(kNoiseDataWeights[n + NUM_CHANNELS] * tmp32_1); + h0test = probn[0] + probn[1]; // Q27 + h0 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h0test, 12); // Q15 + + // Probability for Speech + tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean1ptr++, *sstd1ptr++, + &deltaS[pos]); + probs[0] = (WebRtc_Word32)(kSpeechDataWeights[n] * tmp32_1); + tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean2ptr++, *sstd2ptr++, + &deltaS[pos + 1]); + probs[1] = (WebRtc_Word32)(kSpeechDataWeights[n + NUM_CHANNELS] * tmp32_1); + h1test = probs[0] + probs[1]; // Q27 + h1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h1test, 12); // Q15 + + // Get likelihood ratio. Approximate log2(H1/H0) with shifts0 - shifts1 + shifts0 = WebRtcSpl_NormW32(h0test); + shifts1 = WebRtcSpl_NormW32(h1test); + + if ((h0test > 0) && (h1test > 0)) + { + ratvec = shifts0 - shifts1; + } else if (h1test > 0) + { + ratvec = 31 - shifts1; + } else if (h0test > 0) + { + ratvec = shifts0 - 31; + } else + { + ratvec = 0; + } + + // VAD decision with spectrum weighting + dotVal += WEBRTC_SPL_MUL_16_16(ratvec, kSpectrumWeight[n]); + + // Individual channel test + if ((ratvec << 2) > individualTest) + { + vadflag = 1; + } + + // Probabilities used when updating model + if (h0 > 0) + { + tmp32_1 = probn[0] & 0xFFFFF000; // Q27 + tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2); // Q29 + ngprvec[pos] = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, h0); + ngprvec[pos + 1] = 16384 - ngprvec[pos]; + } else + { + ngprvec[pos] = 16384; + ngprvec[pos + 1] = 0; + } + + // Probabilities used when updating model + if (h1 > 0) + { + tmp32_1 = probs[0] & 0xFFFFF000; + tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2); + sgprvec[pos] = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, h1); + sgprvec[pos + 1] = 16384 - sgprvec[pos]; + } else + { + sgprvec[pos] = 0; + sgprvec[pos + 1] = 0; + } + } + + // Overall test + if (dotVal >= totalTest) + { + vadflag |= 1; + } + + // Set pointers to the means and standard deviations. + nmean1ptr = &inst->noise_means[0]; + smean1ptr = &inst->speech_means[0]; + nstd1ptr = &inst->noise_stds[0]; + sstd1ptr = &inst->speech_stds[0]; + + maxspe = 12800; + + // Update the model's parameters + for (n = 0; n < NUM_CHANNELS; n++) + { + + pos = WEBRTC_SPL_LSHIFT_W16(n, 1); + + // Get min value in past which is used for long term correction + backval = WebRtcVad_FindMinimum(inst, feature_vector[n], n); // Q4 + + // Compute the "global" mean, that is the sum of the two means weighted + nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); // Q7 * Q7 + nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS], + *(nmean1ptr+NUM_CHANNELS)); + tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 6); // Q8 + + for (k = 0; k < NUM_MODELS; k++) + { + + nr = pos + k; + + nmean2ptr = nmean1ptr + k * NUM_CHANNELS; + smean2ptr = smean1ptr + k * NUM_CHANNELS; + nstd2ptr = nstd1ptr + k * NUM_CHANNELS; + sstd2ptr = sstd1ptr + k * NUM_CHANNELS; + nmk = *nmean2ptr; + smk = *smean2ptr; + nsk = *nstd2ptr; + ssk = *sstd2ptr; + + // Update noise mean vector if the frame consists of noise only + nmk2 = nmk; + if (!vadflag) + { + // deltaN = (x-mu)/sigma^2 + // ngprvec[k] = probn[k]/(probn[0] + probn[1]) + + delt = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[nr], + deltaN[nr], 11); // Q14*Q11 + nmk2 = nmk + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delt, + kNoiseUpdateConst, + 22); // Q7+(Q14*Q15>>22) + } + + // Long term correction of the noise mean + ndelt = WEBRTC_SPL_LSHIFT_W16(backval, 4); + ndelt -= tmp16_1; // Q8 - Q8 + nmk3 = nmk2 + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(ndelt, + kBackEta, + 9); // Q7+(Q8*Q8)>>9 + + // Control that the noise mean does not drift to much + tmp16 = WEBRTC_SPL_LSHIFT_W16(k+5, 7); + if (nmk3 < tmp16) + nmk3 = tmp16; + tmp16 = WEBRTC_SPL_LSHIFT_W16(72+k-n, 7); + if (nmk3 > tmp16) + nmk3 = tmp16; + *nmean2ptr = nmk3; + + if (vadflag) + { + // Update speech mean vector: + // deltaS = (x-mu)/sigma^2 + // sgprvec[k] = probn[k]/(probn[0] + probn[1]) + + delt = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[nr], + deltaS[nr], + 11); // (Q14*Q11)>>11=Q14 + tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delt, + kSpeechUpdateConst, + 21) + 1; + smk2 = smk + (tmp16 >> 1); // Q7 + (Q14 * Q15 >> 22) + + // Control that the speech mean does not drift to much + maxmu = maxspe + 640; + if (smk2 < kMinimumMean[k]) + smk2 = kMinimumMean[k]; + if (smk2 > maxmu) + smk2 = maxmu; + + *smean2ptr = smk2; + + // (Q7>>3) = Q4 + tmp16 = WEBRTC_SPL_RSHIFT_W16((smk + 4), 3); + + tmp16 = feature_vector[n] - tmp16; // Q4 + tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[nr], tmp16, 3); + tmp32_2 = tmp32_1 - (WebRtc_Word32)4096; // Q12 + tmp16 = WEBRTC_SPL_RSHIFT_W16((sgprvec[nr]), 2); + tmp32_1 = (WebRtc_Word32)(tmp16 * tmp32_2);// (Q15>>3)*(Q14>>2)=Q12*Q12=Q24 + + tmp32_2 = WEBRTC_SPL_RSHIFT_W32(tmp32_1, 4); // Q20 + + // 0.1 * Q20 / Q7 = Q13 + if (tmp32_2 > 0) + tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, ssk * 10); + else + { + tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(-tmp32_2, ssk * 10); + tmp16 = -tmp16; + } + // divide by 4 giving an update factor of 0.025 + tmp16 += 128; // Rounding + ssk += WEBRTC_SPL_RSHIFT_W16(tmp16, 8); + // Division with 8 plus Q7 + if (ssk < MIN_STD) + ssk = MIN_STD; + *sstd2ptr = ssk; + } else + { + // Update GMM variance vectors + // deltaN * (feature_vector[n] - nmk) - 1, Q11 * Q4 + tmp16 = feature_vector[n] - WEBRTC_SPL_RSHIFT_W16(nmk, 3); + + // (Q15>>3) * (Q14>>2) = Q12 * Q12 = Q24 + tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[nr], tmp16, 3) - 4096; + tmp16 = WEBRTC_SPL_RSHIFT_W16((ngprvec[nr]+2), 2); + tmp32_2 = (WebRtc_Word32)(tmp16 * tmp32_1); + tmp32_1 = WEBRTC_SPL_RSHIFT_W32(tmp32_2, 14); + // Q20 * approx 0.001 (2^-10=0.0009766) + + // Q20 / Q7 = Q13 + tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_1, nsk); + if (tmp32_1 > 0) + tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_1, nsk); + else + { + tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(-tmp32_1, nsk); + tmp16 = -tmp16; + } + tmp16 += 32; // Rounding + nsk += WEBRTC_SPL_RSHIFT_W16(tmp16, 6); + + if (nsk < MIN_STD) + nsk = MIN_STD; + + *nstd2ptr = nsk; + } + } + + // Separate models if they are too close - nmid in Q14 + nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); + nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS], *nmean2ptr); + + // smid in Q14 + smid = WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n], *smean1ptr); + smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n+NUM_CHANNELS], *smean2ptr); + + // diff = "global" speech mean - "global" noise mean + diff = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 9); + tmp16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 9); + diff -= tmp16; + + if (diff < kMinimumDifference[n]) + { + + tmp16 = kMinimumDifference[n] - diff; // Q5 + + // tmp16_1 = ~0.8 * (kMinimumDifference - diff) in Q7 + // tmp16_2 = ~0.2 * (kMinimumDifference - diff) in Q7 + tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(13, tmp16, 2); + tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(3, tmp16, 2); + + // First Gauss, speech model + tmp16 = tmp16_1 + *smean1ptr; + *smean1ptr = tmp16; + smid = WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n]); + + // Second Gauss, speech model + tmp16 = tmp16_1 + *smean2ptr; + *smean2ptr = tmp16; + smid += WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n+NUM_CHANNELS]); + + // First Gauss, noise model + tmp16 = *nmean1ptr - tmp16_2; + *nmean1ptr = tmp16; + + nmid = WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n]); + + // Second Gauss, noise model + tmp16 = *nmean2ptr - tmp16_2; + *nmean2ptr = tmp16; + nmid += WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n+NUM_CHANNELS]); + } + + // Control that the speech & noise means do not drift to much + maxspe = kMaximumSpeech[n]; + tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 7); + if (tmp16_2 > maxspe) + { // Upper limit of speech model + tmp16_2 -= maxspe; + + *smean1ptr -= tmp16_2; + *smean2ptr -= tmp16_2; + } + + tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 7); + if (tmp16_2 > kMaximumNoise[n]) + { + tmp16_2 -= kMaximumNoise[n]; + + *nmean1ptr -= tmp16_2; + *nmean2ptr -= tmp16_2; + } + + *nmean1ptr++; + *smean1ptr++; + *nstd1ptr++; + *sstd1ptr++; + } + inst->frame_counter++; + } else + { + vadflag = 0; + } + + // Hangover smoothing + if (!vadflag) + { + if (inst->over_hang > 0) + { + vadflag = 2 + inst->over_hang; + inst->over_hang = inst->over_hang - 1; + } + inst->num_of_speech = 0; + } else + { + inst->num_of_speech = inst->num_of_speech + 1; + if (inst->num_of_speech > NSP_MAX) + { + inst->num_of_speech = NSP_MAX; + inst->over_hang = overhead2; + } else + inst->over_hang = overhead1; + } + return vadflag; +} |