diff options
author | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2017-07-13 01:21:35 +0300 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2017-07-13 01:21:35 +0300 |
commit | b90e3239b538bdcffa7243f6d5a4835b364a125b (patch) | |
tree | 1f2c6c01d65d24a7b5a370221e1fd34167be0d61 | |
parent | 80492b00c21a419f5fc4f01be0ceada05b5f24c3 (diff) |
Remove HMM
-rw-r--r-- | celt/celt.h | 1 | ||||
-rw-r--r-- | src/analysis.c | 141 | ||||
-rw-r--r-- | src/analysis.h | 13 |
3 files changed, 1 insertions, 154 deletions
diff --git a/celt/celt.h b/celt/celt.h index 70175301..02f58445 100644 --- a/celt/celt.h +++ b/celt/celt.h @@ -59,7 +59,6 @@ typedef struct { float noisiness; float activity; float music_prob; - float vad_prob; int bandwidth; float activity_probability; /* Store as Q6 char to save space. */ diff --git a/src/analysis.c b/src/analysis.c index d0c9e021..178d018c 100644 --- a/src/analysis.c +++ b/src/analysis.c @@ -224,15 +224,12 @@ void tonality_analysis_reset(TonalityAnalysisState *tonal) /* Clear non-reusable fields. */ char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START; OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal)); - tonal->music_confidence = .9f; - tonal->speech_confidence = .1f; } void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len) { int pos; int curr_lookahead; - float psum; float tonality_max; float tonality_avg; int tonality_count; @@ -282,17 +279,7 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int /* The -1 is to compensate for the delay in the features themselves. */ curr_lookahead = IMAX(curr_lookahead-1, 0); - psum=0; - /* Summing the probability of transition patterns that involve music at - time (DETECT_SIZE-curr_lookahead-1) */ - for (i=0;i<DETECT_SIZE-curr_lookahead;i++) - psum += tonal->pmusic[i]; - for (;i<DETECT_SIZE;i++) - psum += tonal->pspeech[i]; - psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/ - - info_out->music_prob = psum; } static const float std_feature_bias[9] = { @@ -369,12 +356,6 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt offset = 3*offset/2; } - if (tonal->count<4) { - if (tonal->application == OPUS_APPLICATION_VOIP) - tonal->music_prob = .1f; - else - tonal->music_prob = .625f; - } kfft = celt_mode->mdct.kfft[0]; if (tonal->count==0) tonal->mem_fill = 240; @@ -768,129 +749,9 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt /* Probability of speech or music vs noise */ info->activity_probability = frame_probs[1]; + info->music_prob = frame_probs[0]; /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/ - { - /* Probability of state transition */ - float tau; - /* Represents independence of the MLP probabilities, where - beta=1 means fully independent. */ - float beta; - /* Denormalized probability of speech (p0) and music (p1) after update */ - float p0, p1; - /* Probabilities for "all speech" and "all music" */ - float s0, m0; - /* Probability sum for renormalisation */ - float psum; - /* Instantaneous probability of speech and music, with beta pre-applied. */ - float speech0; - float music0; - float p, q; - - /* More silence transitions for speech than for music. */ - tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob); - p = MAX16(.05f,MIN16(.95f,frame_probs[1])); - q = MAX16(.05f,MIN16(.95f,tonal->vad_prob)); - beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); - /* p0 and p1 are the probabilities of speech and music at this frame - using only information from previous frame and applying the - state transition model */ - p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau; - p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau; - /* We apply the current probability with exponent beta to work around - the fact that the probability estimates aren't independent. */ - p0 *= (float)pow(1-frame_probs[1], beta); - p1 *= (float)pow(frame_probs[1], beta); - /* Normalise the probabilities to get the Marokv probability of music. */ - tonal->vad_prob = p1/(p0+p1); - info->vad_prob = tonal->vad_prob; - /* Consider that silence has a 50-50 probability of being speech or music. */ - frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f; - - /* One transition every 3 minutes of active audio */ - tau = .0001f; - /* Adapt beta based on how "unexpected" the new prob is */ - p = MAX16(.05f,MIN16(.95f,frame_probs[0])); - q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); - beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); - /* p0 and p1 are the probabilities of speech and music at this frame - using only information from previous frame and applying the - state transition model */ - p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; - p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; - /* We apply the current probability with exponent beta to work around - the fact that the probability estimates aren't independent. */ - p0 *= (float)pow(1-frame_probs[0], beta); - p1 *= (float)pow(frame_probs[0], beta); - /* Normalise the probabilities to get the Marokv probability of music. */ - tonal->music_prob = p1/(p0+p1); - info->music_prob = tonal->music_prob; - - /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/ - /* This chunk of code deals with delayed decision. */ - psum=1e-20f; - /* Instantaneous probability of speech and music, with beta pre-applied. */ - speech0 = (float)pow(1-frame_probs[0], beta); - music0 = (float)pow(frame_probs[0], beta); - if (tonal->count==1) - { - if (tonal->application == OPUS_APPLICATION_VOIP) - tonal->pmusic[0] = .1f; - else - tonal->pmusic[0] = .625f; - tonal->pspeech[0] = 1-tonal->pmusic[0]; - } - /* Updated probability of having only speech (s0) or only music (m0), - before considering the new observation. */ - s0 = tonal->pspeech[0] + tonal->pspeech[1]; - m0 = tonal->pmusic [0] + tonal->pmusic [1]; - /* Updates s0 and m0 with instantaneous probability. */ - tonal->pspeech[0] = s0*(1-tau)*speech0; - tonal->pmusic [0] = m0*(1-tau)*music0; - /* Propagate the transition probabilities */ - for (i=1;i<DETECT_SIZE-1;i++) - { - tonal->pspeech[i] = tonal->pspeech[i+1]*speech0; - tonal->pmusic [i] = tonal->pmusic [i+1]*music0; - } - /* Probability that the latest frame is speech, when all the previous ones were music. */ - tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0; - /* Probability that the latest frame is music, when all the previous ones were speech. */ - tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0; - - /* Renormalise probabilities to 1 */ - for (i=0;i<DETECT_SIZE;i++) - psum += tonal->pspeech[i] + tonal->pmusic[i]; - psum = 1.f/psum; - for (i=0;i<DETECT_SIZE;i++) - { - tonal->pspeech[i] *= psum; - tonal->pmusic [i] *= psum; - } - psum = tonal->pmusic[0]; - for (i=1;i<DETECT_SIZE;i++) - psum += tonal->pspeech[i]; - - /* Estimate our confidence in the speech/music decisions */ - if (frame_probs[1]>.75) - { - if (tonal->music_prob>.9) - { - float adapt; - adapt = 1.f/(++tonal->music_confidence_count); - tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500); - tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence); - } - if (tonal->music_prob<.1) - { - float adapt; - adapt = 1.f/(++tonal->speech_confidence_count); - tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500); - tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence); - } - } - } - tonal->last_music = tonal->music_prob>.5f; #ifdef MLP_TRAINING for (i=0;i<25;i++) printf("%f ", features[i]); diff --git a/src/analysis.h b/src/analysis.h index 629a32fb..289c845e 100644 --- a/src/analysis.h +++ b/src/analysis.h @@ -65,24 +65,11 @@ typedef struct { float mem[32]; float cmean[8]; float std[9]; - float music_prob; - float vad_prob; float Etracker; float lowECount; int E_count; - int last_music; int count; int analysis_offset; - /** Probability of having speech for time i to DETECT_SIZE-1 (and music before). - pspeech[0] is the probability that all frames in the window are speech. */ - float pspeech[DETECT_SIZE]; - /** Probability of having music for time i to DETECT_SIZE-1 (and speech before). - pmusic[0] is the probability that all frames in the window are music. */ - float pmusic[DETECT_SIZE]; - float speech_confidence; - float music_confidence; - int speech_confidence_count; - int music_confidence_count; int write_pos; int read_pos; int read_subframe; |