opus: take SILK vad result into account for voice detection

BUG=webrtc:11643
Change-Id: Idc3a9b6bb7bd1a33f905843e5d6067ae19d5172c
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/176508
Commit-Queue: Minyue Li <minyue@webrtc.org>
Reviewed-by: Minyue Li <minyue@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#31743}
author: Philipp Hancke <fippo@sip-communicator.org> 2020-07-16 09:47:24 +0200
committer: Commit Bot <commit-bot@chromium.org> 2020-07-16 11:37:35 +0000
commit: 686a3709acfedcf0a4c798dd1c5902787c4a266b (patch)
tree: 41c3108d761aa966afa2472ff04872b3f404c6dc /modules/audio_coding
parent: 3592839896082c72c8e92f9ebde788ef77b88043 (diff)
download: webrtc-686a3709acfedcf0a4c798dd1c5902787c4a266b.tar.gz
5 files changed, 60 insertions, 23 deletions
diff --git a/modules/audio_coding/codecs/opus/audio_encoder_opus.cc b/modules/audio_coding/codecs/opus/audio_encoder_opus.cc
index 220e96f1b7..2b16920714 100644
--- a/modules/audio_coding/codecs/opus/audio_encoder_opus.cc
+++ b/modules/audio_coding/codecs/opus/audio_encoder_opus.cc
@@ -367,8 +367,7 @@ AudioEncoderOpusImpl::AudioEncoderOpusImpl(
       inst_(nullptr),
       packet_loss_fraction_smoother_(new PacketLossFractionSmoother()),
       audio_network_adaptor_creator_(audio_network_adaptor_creator),
-      bitrate_smoother_(std::move(bitrate_smoother)),
-      consecutive_dtx_frames_(0) {
+      bitrate_smoother_(std::move(bitrate_smoother)) {
   RTC_DCHECK(0 <= payload_type && payload_type <= 127);
 
   // Sanity check of the redundant payload type field that we want to get rid
@@ -590,6 +589,7 @@ AudioEncoder::EncodedInfo AudioEncoderOpusImpl::EncodeImpl(
                Num10msFramesPerPacket() * SamplesPer10msFrame());
 
   const size_t max_encoded_bytes = SufficientOutputBufferSize();
+  const size_t start_offset_bytes = encoded->size();
   EncodedInfo info;
   info.encoded_bytes = encoded->AppendData(
       max_encoded_bytes, [&](rtc::ArrayView<uint8_t> encoded) {
@@ -604,8 +604,6 @@ AudioEncoder::EncodedInfo AudioEncoderOpusImpl::EncodeImpl(
       });
   input_buffer_.clear();
 
-  bool dtx_frame = (info.encoded_bytes <= 2);
-
   // Will use new packet size for next encoding.
   config_.frame_size_ms = next_frame_length_ms_;
 
@@ -620,14 +618,18 @@ AudioEncoder::EncodedInfo AudioEncoderOpusImpl::EncodeImpl(
   info.encoded_timestamp = first_timestamp_in_buffer_;
   info.payload_type = payload_type_;
   info.send_even_if_empty = true;  // Allows Opus to send empty packets.
-  // After 20 DTX frames (MAX_CONSECUTIVE_DTX) Opus will send a frame
-  // coding the background noise. Avoid flagging this frame as speech
-  // (even though there is a probability of the frame being speech).
-  info.speech = !dtx_frame && (consecutive_dtx_frames_ != 20);
   info.encoder_type = CodecType::kOpus;
 
-  // Increase or reset DTX counter.
-  consecutive_dtx_frames_ = (dtx_frame) ? (consecutive_dtx_frames_ + 1) : (0);
+  // Extract the VAD result from the encoded packet.
+  int has_voice = WebRtcOpus_PacketHasVoiceActivity(
+      &encoded->data()[start_offset_bytes], info.encoded_bytes);
+  if (has_voice == -1) {
+    // CELT mode packet or there was an error. Historically, the speech flag
+    // was set to true in this case.
+    info.speech = true;
+  } else {
+    info.speech = has_voice;
+  }
 
   return info;
 }
diff --git a/modules/audio_coding/codecs/opus/audio_encoder_opus.h b/modules/audio_coding/codecs/opus/audio_encoder_opus.h
index ab954feba7..dc955cec23 100644
--- a/modules/audio_coding/codecs/opus/audio_encoder_opus.h
+++ b/modules/audio_coding/codecs/opus/audio_encoder_opus.h
@@ -172,7 +172,6 @@ class AudioEncoderOpusImpl final : public AudioEncoder {
   absl::optional<size_t> overhead_bytes_per_packet_;
   const std::unique_ptr<SmoothingFilter> bitrate_smoother_;
   absl::optional<int64_t> bitrate_smoother_last_update_time_;
-  int consecutive_dtx_frames_;
 
   friend struct AudioEncoderOpus;
   RTC_DISALLOW_COPY_AND_ASSIGN(AudioEncoderOpusImpl);
diff --git a/modules/audio_coding/codecs/opus/opus_interface.cc b/modules/audio_coding/codecs/opus/opus_interface.cc
index ca39ed8235..455f175464 100644
--- a/modules/audio_coding/codecs/opus/opus_interface.cc
+++ b/modules/audio_coding/codecs/opus/opus_interface.cc
@@ -767,7 +767,7 @@ int WebRtcOpus_PacketHasVoiceActivity(const uint8_t* payload,
 
   int silk_frames = WebRtcOpus_NumSilkFrames(payload);
   if (silk_frames == 0)
-    return -1;
+    return 0;
 
   const int channels = opus_packet_get_nb_channels(payload);
   RTC_DCHECK(channels == 1 || channels == 2);
diff --git a/modules/audio_coding/codecs/opus/opus_unittest.cc b/modules/audio_coding/codecs/opus/opus_unittest.cc
index 80cab50137..66ac5e7346 100644
--- a/modules/audio_coding/codecs/opus/opus_unittest.cc
+++ b/modules/audio_coding/codecs/opus/opus_unittest.cc
@@ -975,4 +975,21 @@ TEST(OpusVadTest, TwoOpusMonoFramesVadOnSecond) {
   EXPECT_TRUE(WebRtcOpus_PacketHasVoiceActivity(twoMonoFrames, 3));
 }
 
+TEST(OpusVadTest, DtxEmptyPacket) {
+  const uint8_t dtx[] = {0x78};
+  EXPECT_FALSE(WebRtcOpus_PacketHasVoiceActivity(dtx, 1));
+}
+
+TEST(OpusVadTest, DtxBackgroundNoisePacket) {
+  // DTX sends a frame coding background noise every 20 packets:
+  //   https://tools.ietf.org/html/rfc6716#section-2.1.9
+  // The packet below represents such a frame and was captured using
+  // Wireshark while disabling encryption.
+  const uint8_t dtx[] = {0x78, 0x07, 0xc9, 0x79, 0xc8, 0xc9, 0x57, 0xc0, 0xa2,
+                         0x12, 0x23, 0xfa, 0xef, 0x67, 0xf3, 0x2e, 0xe3, 0xd3,
+                         0xd5, 0xe9, 0xec, 0xdb, 0x3e, 0xbc, 0x80, 0xb6, 0x6e,
+                         0x2a, 0xb7, 0x8c, 0x83, 0xcd, 0x83, 0xcd, 0x00};
+  EXPECT_FALSE(WebRtcOpus_PacketHasVoiceActivity(dtx, 35));
+}
+
 }  // namespace webrtc
diff --git a/modules/audio_coding/test/TestVADDTX.cc b/modules/audio_coding/test/TestVADDTX.cc
index c493e64ee0..6c9b14ddb7 100644
--- a/modules/audio_coding/test/TestVADDTX.cc
+++ b/modules/audio_coding/test/TestVADDTX.cc
@@ -166,11 +166,13 @@ void TestVadDtx::Run(std::string in_filename,
     int i = &st - stats;  // Calculate the current position in stats.
     switch (expects[i]) {
       case 0: {
-        EXPECT_EQ(0u, st) << "stats[" << i << "] error.";
+        EXPECT_EQ(0u, st) << "stats[" << i << "] error. Output file "
+                          << out_filename;
         break;
       }
       case 1: {
-        EXPECT_GT(st, 0u) << "stats[" << i << "] error.";
+        EXPECT_GT(st, 0u) << "stats[" << i << "] error. Output file "
+                          << out_filename;
         break;
       }
     }
@@ -189,25 +191,29 @@ void TestWebRtcVadDtx::Perform() {
 
 // Test various configurations on VAD/DTX.
 void TestWebRtcVadDtx::RunTestCases(const SdpAudioFormat& codec_format) {
+  RegisterCodec(codec_format, absl::nullopt);
   Test(/*new_outfile=*/true,
-       /*expect_dtx_enabled=*/RegisterCodec(codec_format, absl::nullopt));
+       /*expect_vad_packets=*/codec_format.name == "opus");
 
+  RegisterCodec(codec_format, Vad::kVadAggressive);
   Test(/*new_outfile=*/false,
-       /*expect_dtx_enabled=*/RegisterCodec(codec_format, Vad::kVadAggressive));
+       /*expect_vad_packets=*/true);
 
+  RegisterCodec(codec_format, Vad::kVadLowBitrate);
   Test(/*new_outfile=*/false,
-       /*expect_dtx_enabled=*/RegisterCodec(codec_format, Vad::kVadLowBitrate));
+       /*expect_vad_packets=*/true);
 
-  Test(/*new_outfile=*/false, /*expect_dtx_enabled=*/RegisterCodec(
-           codec_format, Vad::kVadVeryAggressive));
+  RegisterCodec(codec_format, Vad::kVadVeryAggressive);
+  Test(/*new_outfile=*/false, /*expect_vad_packets=*/true);
 
+  RegisterCodec(codec_format, Vad::kVadNormal);
   Test(/*new_outfile=*/false,
-       /*expect_dtx_enabled=*/RegisterCodec(codec_format, Vad::kVadNormal));
+       /*expect_vad_packets=*/true);
 }
 
 // Set the expectation and run the test.
-void TestWebRtcVadDtx::Test(bool new_outfile, bool expect_dtx_enabled) {
-  int expects[] = {-1, 1, expect_dtx_enabled, 0, 0};
+void TestWebRtcVadDtx::Test(bool new_outfile, bool expect_vad_packets) {
+  int expects[] = {-1, 1, expect_vad_packets ? 1 : -1, 0, 0};
   if (new_outfile) {
     output_file_num_++;
   }
@@ -220,16 +226,20 @@ void TestWebRtcVadDtx::Test(bool new_outfile, bool expect_dtx_enabled) {
 
 // Following is the implementation of TestOpusDtx.
 void TestOpusDtx::Perform() {
-  int expects[] = {0, 1, 0, 0, 0};
+  int expects[] = {0, 0, 0, 0, 0};
 
   // Register Opus as send codec
   std::string out_filename =
       webrtc::test::OutputPath() + "testOpusDtx_outFile_mono.pcm";
   RegisterCodec({"opus", 48000, 2}, absl::nullopt);
+
   acm_send_->ModifyEncoder([](std::unique_ptr<AudioEncoder>* encoder_ptr) {
     (*encoder_ptr)->SetDtx(false);
   });
 
+  expects[static_cast<int>(AudioFrameType::kEmptyFrame)] = 0;
+  expects[static_cast<int>(AudioFrameType::kAudioFrameSpeech)] = 1;
+  expects[static_cast<int>(AudioFrameType::kAudioFrameCN)] = 1;
   Run(webrtc::test::ResourcePath("audio_coding/testfile32kHz", "pcm"), 32000, 1,
       out_filename, false, expects);
 
@@ -237,6 +247,7 @@ void TestOpusDtx::Perform() {
     (*encoder_ptr)->SetDtx(true);
   });
   expects[static_cast<int>(AudioFrameType::kEmptyFrame)] = 1;
+  expects[static_cast<int>(AudioFrameType::kAudioFrameSpeech)] = 1;
   expects[static_cast<int>(AudioFrameType::kAudioFrameCN)] = 1;
   Run(webrtc::test::ResourcePath("audio_coding/testfile32kHz", "pcm"), 32000, 1,
       out_filename, true, expects);
@@ -244,10 +255,12 @@ void TestOpusDtx::Perform() {
   // Register stereo Opus as send codec
   out_filename = webrtc::test::OutputPath() + "testOpusDtx_outFile_stereo.pcm";
   RegisterCodec({"opus", 48000, 2, {{"stereo", "1"}}}, absl::nullopt);
+
   acm_send_->ModifyEncoder([](std::unique_ptr<AudioEncoder>* encoder_ptr) {
     (*encoder_ptr)->SetDtx(false);
   });
   expects[static_cast<int>(AudioFrameType::kEmptyFrame)] = 0;
+  expects[static_cast<int>(AudioFrameType::kAudioFrameSpeech)] = 1;
   expects[static_cast<int>(AudioFrameType::kAudioFrameCN)] = 0;
   Run(webrtc::test::ResourcePath("audio_coding/teststereo32kHz", "pcm"), 32000,
       2, out_filename, false, expects);
@@ -257,7 +270,13 @@ void TestOpusDtx::Perform() {
   });
 
   expects[static_cast<int>(AudioFrameType::kEmptyFrame)] = 1;
+  expects[static_cast<int>(AudioFrameType::kAudioFrameSpeech)] = 1;
+  // Android and iOS behave differently with respect to the number of CN frames.
+#if defined(WEBRTC_IOS) || defined(WEBRTC_ANDROID)
   expects[static_cast<int>(AudioFrameType::kAudioFrameCN)] = 1;
+#else
+  expects[static_cast<int>(AudioFrameType::kAudioFrameCN)] = 0;
+#endif
   Run(webrtc::test::ResourcePath("audio_coding/teststereo32kHz", "pcm"), 32000,
       2, out_filename, true, expects);
 }
author	Philipp Hancke <fippo@sip-communicator.org>	2020-07-16 09:47:24 +0200
committer	Commit Bot <commit-bot@chromium.org>	2020-07-16 11:37:35 +0000
commit	686a3709acfedcf0a4c798dd1c5902787c4a266b (patch)
tree	41c3108d761aa966afa2472ff04872b3f404c6dc /modules/audio_coding
parent	3592839896082c72c8e92f9ebde788ef77b88043 (diff)
download	webrtc-686a3709acfedcf0a4c798dd1c5902787c4a266b.tar.gz