Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.xiph.org/xiph/opus.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJean-Marc Valin <jmvalin@jmvalin.ca>2017-07-13 01:21:35 +0300
committerJean-Marc Valin <jmvalin@jmvalin.ca>2017-07-13 01:21:35 +0300
commitb90e3239b538bdcffa7243f6d5a4835b364a125b (patch)
tree1f2c6c01d65d24a7b5a370221e1fd34167be0d61
parent80492b00c21a419f5fc4f01be0ceada05b5f24c3 (diff)
Remove HMM
-rw-r--r--celt/celt.h1
-rw-r--r--src/analysis.c141
-rw-r--r--src/analysis.h13
3 files changed, 1 insertions, 154 deletions
diff --git a/celt/celt.h b/celt/celt.h
index 70175301..02f58445 100644
--- a/celt/celt.h
+++ b/celt/celt.h
@@ -59,7 +59,6 @@ typedef struct {
float noisiness;
float activity;
float music_prob;
- float vad_prob;
int bandwidth;
float activity_probability;
/* Store as Q6 char to save space. */
diff --git a/src/analysis.c b/src/analysis.c
index d0c9e021..178d018c 100644
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -224,15 +224,12 @@ void tonality_analysis_reset(TonalityAnalysisState *tonal)
/* Clear non-reusable fields. */
char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
- tonal->music_confidence = .9f;
- tonal->speech_confidence = .1f;
}
void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
{
int pos;
int curr_lookahead;
- float psum;
float tonality_max;
float tonality_avg;
int tonality_count;
@@ -282,17 +279,7 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
/* The -1 is to compensate for the delay in the features themselves. */
curr_lookahead = IMAX(curr_lookahead-1, 0);
- psum=0;
- /* Summing the probability of transition patterns that involve music at
- time (DETECT_SIZE-curr_lookahead-1) */
- for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
- psum += tonal->pmusic[i];
- for (;i<DETECT_SIZE;i++)
- psum += tonal->pspeech[i];
- psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
/*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/
-
- info_out->music_prob = psum;
}
static const float std_feature_bias[9] = {
@@ -369,12 +356,6 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
offset = 3*offset/2;
}
- if (tonal->count<4) {
- if (tonal->application == OPUS_APPLICATION_VOIP)
- tonal->music_prob = .1f;
- else
- tonal->music_prob = .625f;
- }
kfft = celt_mode->mdct.kfft[0];
if (tonal->count==0)
tonal->mem_fill = 240;
@@ -768,129 +749,9 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
/* Probability of speech or music vs noise */
info->activity_probability = frame_probs[1];
+ info->music_prob = frame_probs[0];
/*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/
- {
- /* Probability of state transition */
- float tau;
- /* Represents independence of the MLP probabilities, where
- beta=1 means fully independent. */
- float beta;
- /* Denormalized probability of speech (p0) and music (p1) after update */
- float p0, p1;
- /* Probabilities for "all speech" and "all music" */
- float s0, m0;
- /* Probability sum for renormalisation */
- float psum;
- /* Instantaneous probability of speech and music, with beta pre-applied. */
- float speech0;
- float music0;
- float p, q;
-
- /* More silence transitions for speech than for music. */
- tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob);
- p = MAX16(.05f,MIN16(.95f,frame_probs[1]));
- q = MAX16(.05f,MIN16(.95f,tonal->vad_prob));
- beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
- /* p0 and p1 are the probabilities of speech and music at this frame
- using only information from previous frame and applying the
- state transition model */
- p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau;
- p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau;
- /* We apply the current probability with exponent beta to work around
- the fact that the probability estimates aren't independent. */
- p0 *= (float)pow(1-frame_probs[1], beta);
- p1 *= (float)pow(frame_probs[1], beta);
- /* Normalise the probabilities to get the Marokv probability of music. */
- tonal->vad_prob = p1/(p0+p1);
- info->vad_prob = tonal->vad_prob;
- /* Consider that silence has a 50-50 probability of being speech or music. */
- frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f;
-
- /* One transition every 3 minutes of active audio */
- tau = .0001f;
- /* Adapt beta based on how "unexpected" the new prob is */
- p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
- q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
- beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
- /* p0 and p1 are the probabilities of speech and music at this frame
- using only information from previous frame and applying the
- state transition model */
- p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau;
- p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
- /* We apply the current probability with exponent beta to work around
- the fact that the probability estimates aren't independent. */
- p0 *= (float)pow(1-frame_probs[0], beta);
- p1 *= (float)pow(frame_probs[0], beta);
- /* Normalise the probabilities to get the Marokv probability of music. */
- tonal->music_prob = p1/(p0+p1);
- info->music_prob = tonal->music_prob;
-
- /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/
- /* This chunk of code deals with delayed decision. */
- psum=1e-20f;
- /* Instantaneous probability of speech and music, with beta pre-applied. */
- speech0 = (float)pow(1-frame_probs[0], beta);
- music0 = (float)pow(frame_probs[0], beta);
- if (tonal->count==1)
- {
- if (tonal->application == OPUS_APPLICATION_VOIP)
- tonal->pmusic[0] = .1f;
- else
- tonal->pmusic[0] = .625f;
- tonal->pspeech[0] = 1-tonal->pmusic[0];
- }
- /* Updated probability of having only speech (s0) or only music (m0),
- before considering the new observation. */
- s0 = tonal->pspeech[0] + tonal->pspeech[1];
- m0 = tonal->pmusic [0] + tonal->pmusic [1];
- /* Updates s0 and m0 with instantaneous probability. */
- tonal->pspeech[0] = s0*(1-tau)*speech0;
- tonal->pmusic [0] = m0*(1-tau)*music0;
- /* Propagate the transition probabilities */
- for (i=1;i<DETECT_SIZE-1;i++)
- {
- tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
- tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
- }
- /* Probability that the latest frame is speech, when all the previous ones were music. */
- tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
- /* Probability that the latest frame is music, when all the previous ones were speech. */
- tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
-
- /* Renormalise probabilities to 1 */
- for (i=0;i<DETECT_SIZE;i++)
- psum += tonal->pspeech[i] + tonal->pmusic[i];
- psum = 1.f/psum;
- for (i=0;i<DETECT_SIZE;i++)
- {
- tonal->pspeech[i] *= psum;
- tonal->pmusic [i] *= psum;
- }
- psum = tonal->pmusic[0];
- for (i=1;i<DETECT_SIZE;i++)
- psum += tonal->pspeech[i];
-
- /* Estimate our confidence in the speech/music decisions */
- if (frame_probs[1]>.75)
- {
- if (tonal->music_prob>.9)
- {
- float adapt;
- adapt = 1.f/(++tonal->music_confidence_count);
- tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
- tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
- }
- if (tonal->music_prob<.1)
- {
- float adapt;
- adapt = 1.f/(++tonal->speech_confidence_count);
- tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
- tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
- }
- }
- }
- tonal->last_music = tonal->music_prob>.5f;
#ifdef MLP_TRAINING
for (i=0;i<25;i++)
printf("%f ", features[i]);
diff --git a/src/analysis.h b/src/analysis.h
index 629a32fb..289c845e 100644
--- a/src/analysis.h
+++ b/src/analysis.h
@@ -65,24 +65,11 @@ typedef struct {
float mem[32];
float cmean[8];
float std[9];
- float music_prob;
- float vad_prob;
float Etracker;
float lowECount;
int E_count;
- int last_music;
int count;
int analysis_offset;
- /** Probability of having speech for time i to DETECT_SIZE-1 (and music before).
- pspeech[0] is the probability that all frames in the window are speech. */
- float pspeech[DETECT_SIZE];
- /** Probability of having music for time i to DETECT_SIZE-1 (and speech before).
- pmusic[0] is the probability that all frames in the window are music. */
- float pmusic[DETECT_SIZE];
- float speech_confidence;
- float music_confidence;
- int speech_confidence_count;
- int music_confidence_count;
int write_pos;
int read_pos;
int read_subframe;