Add delayed decision (doesn't quite work yet)

author: Jean-Marc Valin <jmvalin@jmvalin.ca> 2017-07-13 10:03:48 +0300
committer: Jean-Marc Valin <jmvalin@jmvalin.ca> 2017-07-13 10:03:48 +0300
commit: d9a92a78f84d6e4f903062f430325b77a72afc03 (patch)
tree: 56e2ab36b746997449c0dcc64c17d1821c714e9c
parent: b90e3239b538bdcffa7243f6d5a4835b364a125b (diff)
3 files changed, 52 insertions, 1 deletions
diff --git a/celt/celt.h b/celt/celt.h
index 02f58445..f73f29dd 100644
--- a/celt/celt.h
+++ b/celt/celt.h
@@ -59,6 +59,8 @@ typedef struct {
    float noisiness;
    float activity;
    float music_prob;
+   float music_prob_min;
+   float music_prob_max;
    int   bandwidth;
    float activity_probability;
    /* Store as Q6 char to save space. */
diff --git a/src/analysis.c b/src/analysis.c
index 178d018c..f501826f 100644
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -50,6 +50,8 @@
 
 #ifndef DISABLE_FLOAT_API
 
+#define TRANSITION_PENALTY 5
+
 static const float dct_table[128] = {
         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
@@ -234,6 +236,11 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
    float tonality_avg;
    int tonality_count;
    int i;
+   int pos0;
+   float prob_avg;
+   float prob_count;
+   float prob_min, prob_max;
+   float vad_prob;
 
    pos = tonal->read_pos;
    curr_lookahead = tonal->write_pos-tonal->read_pos;
@@ -251,6 +258,7 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
       pos--;
    if (pos<0)
       pos = DETECT_SIZE-1;
+   pos0 = pos;
    OPUS_COPY(info_out, &tonal->info[pos], 1);
    tonality_max = tonality_avg = info_out->tonality;
    tonality_count = 1;
@@ -267,6 +275,41 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
       tonality_count++;
    }
    info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
+
+   pos = pos0;
+   /* If we have enough look-ahead (curr_lookahead > 15), skip ahead by 5 frames to
+      compensate for the delay in the features. */
+   if (curr_lookahead > 15)
+   {
+      pos += 5;
+      if (pos>=DETECT_SIZE)
+         pos -= DETECT_SIZE;
+   }
+
+   info_out->music_prob = tonal->info[pos].music_prob;
+   prob_min = prob_max = prob_avg = tonal->info[pos].music_prob;
+   vad_prob = tonal->info[pos].activity_probability;
+   prob_count = MAX16(.1, vad_prob);
+   while (1)
+   {
+      float pos_vad;
+      pos++;
+      if (pos==DETECT_SIZE)
+         pos = 0;
+      if (pos == tonal->write_pos)
+         break;
+      pos_vad = tonal->info[pos].activity_probability;
+      prob_count += MAX16(.1, pos_vad);
+      prob_avg += (tonal->info[pos].music_prob-prob_avg)/prob_count;
+      prob_min = MIN16(prob_avg - TRANSITION_PENALTY*(vad_prob - pos_vad)/prob_count, prob_min);
+      prob_max = MAX16(prob_avg + TRANSITION_PENALTY*(vad_prob - pos_vad)/prob_count, prob_max);
+   }
+   prob_min = MAX16(prob_min, 0);
+   prob_max = MIN16(prob_max, 1);
+   info_out->music_prob_min = prob_min;
+   info_out->music_prob_max = prob_max;
+
+   /*printf("%f %f %f\n", prob_min, prob_max, prob_count);*/
    tonal->read_subframe += len/(tonal->Fs/400);
    while (tonal->read_subframe>=8)
    {
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index cd37fcdf..c2866d38 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -1366,7 +1366,12 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
        mode_music = (opus_int32)(MULT16_32_Q15(Q15ONE-stereo_width,mode_thresholds[1][1])
              + MULT16_32_Q15(stereo_width,mode_thresholds[1][1]));
        /* Interpolate based on speech/music probability */
-       threshold = mode_music + ((voice_est*voice_est*(mode_voice-mode_music))>>14);
+       if (analysis_info.valid)
+       {
+          float prob = (st->prev_mode == MODE_CELT_ONLY) ? analysis_info.music_prob_max : analysis_info.music_prob_min;
+          threshold = prob*mode_music + (1-prob)*mode_voice;
+       } else
+          threshold = mode_music + ((voice_est*voice_est*(mode_voice-mode_music))>>14);
        /* Bias towards SILK for VoIP because of some useful features */
        if (st->application == OPUS_APPLICATION_VOIP)
           threshold += 8000;
@@ -1378,6 +1383,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
        else if (st->prev_mode>0)
            threshold += 4000;
 
+       /*printf("%d\n", (equiv_rate >= threshold));*/
        st->mode = (equiv_rate >= threshold) ? MODE_CELT_ONLY: MODE_SILK_ONLY;
 
        /* When FEC is enabled and there's enough packet loss, use SILK */
author	Jean-Marc Valin <jmvalin@jmvalin.ca>	2017-07-13 10:03:48 +0300
committer	Jean-Marc Valin <jmvalin@jmvalin.ca>	2017-07-13 10:03:48 +0300
commit	d9a92a78f84d6e4f903062f430325b77a72afc03 (patch)
tree	56e2ab36b746997449c0dcc64c17d1821c714e9c
parent	b90e3239b538bdcffa7243f6d5a4835b364a125b (diff)