diff options
author | Philipp Hancke <fippo@sip-communicator.org> | 2020-07-16 09:47:24 +0200 |
---|---|---|
committer | Commit Bot <commit-bot@chromium.org> | 2020-07-16 11:37:35 +0000 |
commit | 686a3709acfedcf0a4c798dd1c5902787c4a266b (patch) | |
tree | 41c3108d761aa966afa2472ff04872b3f404c6dc /modules/audio_coding | |
parent | 3592839896082c72c8e92f9ebde788ef77b88043 (diff) | |
download | webrtc-686a3709acfedcf0a4c798dd1c5902787c4a266b.tar.gz |
opus: take SILK vad result into account for voice detection
BUG=webrtc:11643
Change-Id: Idc3a9b6bb7bd1a33f905843e5d6067ae19d5172c
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/176508
Commit-Queue: Minyue Li <minyue@webrtc.org>
Reviewed-by: Minyue Li <minyue@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#31743}
Diffstat (limited to 'modules/audio_coding')
-rw-r--r-- | modules/audio_coding/codecs/opus/audio_encoder_opus.cc | 22 |
-rw-r--r-- | modules/audio_coding/codecs/opus/audio_encoder_opus.h | 1 |
-rw-r--r-- | modules/audio_coding/codecs/opus/opus_interface.cc | 2 |
-rw-r--r-- | modules/audio_coding/codecs/opus/opus_unittest.cc | 17 |
-rw-r--r-- | modules/audio_coding/test/TestVADDTX.cc | 41 |
5 files changed, 60 insertions, 23 deletions
diff --git a/modules/audio_coding/codecs/opus/audio_encoder_opus.cc b/modules/audio_coding/codecs/opus/audio_encoder_opus.cc index 220e96f1b7..2b16920714 100644 --- a/modules/audio_coding/codecs/opus/audio_encoder_opus.cc +++ b/modules/audio_coding/codecs/opus/audio_encoder_opus.cc @@ -367,8 +367,7 @@ AudioEncoderOpusImpl::AudioEncoderOpusImpl( inst_(nullptr), packet_loss_fraction_smoother_(new PacketLossFractionSmoother()), audio_network_adaptor_creator_(audio_network_adaptor_creator), - bitrate_smoother_(std::move(bitrate_smoother)), - consecutive_dtx_frames_(0) { + bitrate_smoother_(std::move(bitrate_smoother)) { RTC_DCHECK(0 <= payload_type && payload_type <= 127); // Sanity check of the redundant payload type field that we want to get rid @@ -590,6 +589,7 @@ AudioEncoder::EncodedInfo AudioEncoderOpusImpl::EncodeImpl( Num10msFramesPerPacket() * SamplesPer10msFrame()); const size_t max_encoded_bytes = SufficientOutputBufferSize(); + const size_t start_offset_bytes = encoded->size(); EncodedInfo info; info.encoded_bytes = encoded->AppendData( max_encoded_bytes, [&](rtc::ArrayView<uint8_t> encoded) { @@ -604,8 +604,6 @@ AudioEncoder::EncodedInfo AudioEncoderOpusImpl::EncodeImpl( }); input_buffer_.clear(); - bool dtx_frame = (info.encoded_bytes <= 2); - // Will use new packet size for next encoding. config_.frame_size_ms = next_frame_length_ms_; @@ -620,14 +618,18 @@ AudioEncoder::EncodedInfo AudioEncoderOpusImpl::EncodeImpl( info.encoded_timestamp = first_timestamp_in_buffer_; info.payload_type = payload_type_; info.send_even_if_empty = true; // Allows Opus to send empty packets. - // After 20 DTX frames (MAX_CONSECUTIVE_DTX) Opus will send a frame - // coding the background noise. Avoid flagging this frame as speech - // (even though there is a probability of the frame being speech). - info.speech = !dtx_frame && (consecutive_dtx_frames_ != 20); info.encoder_type = CodecType::kOpus; - // Increase or reset DTX counter. - consecutive_dtx_frames_ = (dtx_frame) ? 
(consecutive_dtx_frames_ + 1) : (0); + // Extract the VAD result from the encoded packet. + int has_voice = WebRtcOpus_PacketHasVoiceActivity( + &encoded->data()[start_offset_bytes], info.encoded_bytes); + if (has_voice == -1) { + // CELT mode packet or there was an error. This had set the speech flag to + // true historically. + info.speech = true; + } else { + info.speech = has_voice; + } return info; } diff --git a/modules/audio_coding/codecs/opus/audio_encoder_opus.h b/modules/audio_coding/codecs/opus/audio_encoder_opus.h index ab954feba7..dc955cec23 100644 --- a/modules/audio_coding/codecs/opus/audio_encoder_opus.h +++ b/modules/audio_coding/codecs/opus/audio_encoder_opus.h @@ -172,7 +172,6 @@ class AudioEncoderOpusImpl final : public AudioEncoder { absl::optional<size_t> overhead_bytes_per_packet_; const std::unique_ptr<SmoothingFilter> bitrate_smoother_; absl::optional<int64_t> bitrate_smoother_last_update_time_; - int consecutive_dtx_frames_; friend struct AudioEncoderOpus; RTC_DISALLOW_COPY_AND_ASSIGN(AudioEncoderOpusImpl); diff --git a/modules/audio_coding/codecs/opus/opus_interface.cc b/modules/audio_coding/codecs/opus/opus_interface.cc index ca39ed8235..455f175464 100644 --- a/modules/audio_coding/codecs/opus/opus_interface.cc +++ b/modules/audio_coding/codecs/opus/opus_interface.cc @@ -767,7 +767,7 @@ int WebRtcOpus_PacketHasVoiceActivity(const uint8_t* payload, int silk_frames = WebRtcOpus_NumSilkFrames(payload); if (silk_frames == 0) - return -1; + return 0; const int channels = opus_packet_get_nb_channels(payload); RTC_DCHECK(channels == 1 || channels == 2); diff --git a/modules/audio_coding/codecs/opus/opus_unittest.cc b/modules/audio_coding/codecs/opus/opus_unittest.cc index 80cab50137..66ac5e7346 100644 --- a/modules/audio_coding/codecs/opus/opus_unittest.cc +++ b/modules/audio_coding/codecs/opus/opus_unittest.cc @@ -975,4 +975,21 @@ TEST(OpusVadTest, TwoOpusMonoFramesVadOnSecond) { EXPECT_TRUE(WebRtcOpus_PacketHasVoiceActivity(twoMonoFrames, 
3)); } +TEST(OpusVadTest, DtxEmptyPacket) { + const uint8_t dtx[] = {0x78}; + EXPECT_FALSE(WebRtcOpus_PacketHasVoiceActivity(dtx, 1)); +} + +TEST(OpusVadTest, DtxBackgroundNoisePacket) { + // DTX sends a frame coding background noise every 20 packets: + // https://tools.ietf.org/html/rfc6716#section-2.1.9 + // The packet below represents such a frame and was captured using + // Wireshark while disabling encryption. + const uint8_t dtx[] = {0x78, 0x07, 0xc9, 0x79, 0xc8, 0xc9, 0x57, 0xc0, 0xa2, + 0x12, 0x23, 0xfa, 0xef, 0x67, 0xf3, 0x2e, 0xe3, 0xd3, + 0xd5, 0xe9, 0xec, 0xdb, 0x3e, 0xbc, 0x80, 0xb6, 0x6e, + 0x2a, 0xb7, 0x8c, 0x83, 0xcd, 0x83, 0xcd, 0x00}; + EXPECT_FALSE(WebRtcOpus_PacketHasVoiceActivity(dtx, 35)); +} + } // namespace webrtc diff --git a/modules/audio_coding/test/TestVADDTX.cc b/modules/audio_coding/test/TestVADDTX.cc index c493e64ee0..6c9b14ddb7 100644 --- a/modules/audio_coding/test/TestVADDTX.cc +++ b/modules/audio_coding/test/TestVADDTX.cc @@ -166,11 +166,13 @@ void TestVadDtx::Run(std::string in_filename, int i = &st - stats; // Calculate the current position in stats. switch (expects[i]) { case 0: { - EXPECT_EQ(0u, st) << "stats[" << i << "] error."; + EXPECT_EQ(0u, st) << "stats[" << i << "] error. Output file " + << out_filename; break; } case 1: { - EXPECT_GT(st, 0u) << "stats[" << i << "] error."; + EXPECT_GT(st, 0u) << "stats[" << i << "] error. Output file " + << out_filename; break; } } @@ -189,25 +191,29 @@ void TestWebRtcVadDtx::Perform() { // Test various configurations on VAD/DTX. 
void TestWebRtcVadDtx::RunTestCases(const SdpAudioFormat& codec_format) { + RegisterCodec(codec_format, absl::nullopt); Test(/*new_outfile=*/true, - /*expect_dtx_enabled=*/RegisterCodec(codec_format, absl::nullopt)); + /*expect_vad_packets=*/codec_format.name == "opus"); + RegisterCodec(codec_format, Vad::kVadAggressive); Test(/*new_outfile=*/false, - /*expect_dtx_enabled=*/RegisterCodec(codec_format, Vad::kVadAggressive)); + /*expect_vad_packets=*/true); + RegisterCodec(codec_format, Vad::kVadLowBitrate); Test(/*new_outfile=*/false, - /*expect_dtx_enabled=*/RegisterCodec(codec_format, Vad::kVadLowBitrate)); + /*expect_vad_packets=*/true); - Test(/*new_outfile=*/false, /*expect_dtx_enabled=*/RegisterCodec( - codec_format, Vad::kVadVeryAggressive)); + RegisterCodec(codec_format, Vad::kVadVeryAggressive); + Test(/*new_outfile=*/false, /*expect_vad_packets=*/true); + RegisterCodec(codec_format, Vad::kVadNormal); Test(/*new_outfile=*/false, - /*expect_dtx_enabled=*/RegisterCodec(codec_format, Vad::kVadNormal)); + /*expect_vad_packets=*/true); } // Set the expectation and run the test. -void TestWebRtcVadDtx::Test(bool new_outfile, bool expect_dtx_enabled) { - int expects[] = {-1, 1, expect_dtx_enabled, 0, 0}; +void TestWebRtcVadDtx::Test(bool new_outfile, bool expect_vad_packets) { + int expects[] = {-1, 1, expect_vad_packets ? 1 : -1, 0, 0}; if (new_outfile) { output_file_num_++; } @@ -220,16 +226,20 @@ void TestWebRtcVadDtx::Test(bool new_outfile, bool expect_dtx_enabled) { // Following is the implementation of TestOpusDtx. 
void TestOpusDtx::Perform() { - int expects[] = {0, 1, 0, 0, 0}; + int expects[] = {0, 0, 0, 0, 0}; // Register Opus as send codec std::string out_filename = webrtc::test::OutputPath() + "testOpusDtx_outFile_mono.pcm"; RegisterCodec({"opus", 48000, 2}, absl::nullopt); + acm_send_->ModifyEncoder([](std::unique_ptr<AudioEncoder>* encoder_ptr) { (*encoder_ptr)->SetDtx(false); }); + expects[static_cast<int>(AudioFrameType::kEmptyFrame)] = 0; + expects[static_cast<int>(AudioFrameType::kAudioFrameSpeech)] = 1; + expects[static_cast<int>(AudioFrameType::kAudioFrameCN)] = 1; Run(webrtc::test::ResourcePath("audio_coding/testfile32kHz", "pcm"), 32000, 1, out_filename, false, expects); @@ -237,6 +247,7 @@ void TestOpusDtx::Perform() { (*encoder_ptr)->SetDtx(true); }); expects[static_cast<int>(AudioFrameType::kEmptyFrame)] = 1; + expects[static_cast<int>(AudioFrameType::kAudioFrameSpeech)] = 1; expects[static_cast<int>(AudioFrameType::kAudioFrameCN)] = 1; Run(webrtc::test::ResourcePath("audio_coding/testfile32kHz", "pcm"), 32000, 1, out_filename, true, expects); @@ -244,10 +255,12 @@ void TestOpusDtx::Perform() { // Register stereo Opus as send codec out_filename = webrtc::test::OutputPath() + "testOpusDtx_outFile_stereo.pcm"; RegisterCodec({"opus", 48000, 2, {{"stereo", "1"}}}, absl::nullopt); + acm_send_->ModifyEncoder([](std::unique_ptr<AudioEncoder>* encoder_ptr) { (*encoder_ptr)->SetDtx(false); }); expects[static_cast<int>(AudioFrameType::kEmptyFrame)] = 0; + expects[static_cast<int>(AudioFrameType::kAudioFrameSpeech)] = 1; expects[static_cast<int>(AudioFrameType::kAudioFrameCN)] = 0; Run(webrtc::test::ResourcePath("audio_coding/teststereo32kHz", "pcm"), 32000, 2, out_filename, false, expects); @@ -257,7 +270,13 @@ void TestOpusDtx::Perform() { }); expects[static_cast<int>(AudioFrameType::kEmptyFrame)] = 1; + expects[static_cast<int>(AudioFrameType::kAudioFrameSpeech)] = 1; + // Android and iOS behave different with respect to the number of CN frames. 
+#if defined(WEBRTC_IOS) || defined(WEBRTC_ANDROID) expects[static_cast<int>(AudioFrameType::kAudioFrameCN)] = 1; +#else + expects[static_cast<int>(AudioFrameType::kAudioFrameCN)] = 0; +#endif Run(webrtc::test::ResourcePath("audio_coding/teststereo32kHz", "pcm"), 32000, 2, out_filename, true, expects); } |