diff options
author | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2016-11-22 08:23:43 +0300 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2016-11-22 08:23:43 +0300 |
commit | aaafab328d3bf94dcfb901be6a35143475699fd5 (patch) | |
tree | f318045122e81d31097c8cb0279a3eaa1349d9b0 | |
parent | d50e80d525e8a6d81c7949fde237722f02582f04 (diff) |
Improved features for speech/music detection
-rw-r--r-- | src/analysis.c | 60 | ||||
-rw-r--r-- | src/analysis.h | 1 | ||||
-rw-r--r-- | src/mlp_data.c | 186 | ||||
-rw-r--r-- | src/mlp_train.c | 2 |
4 files changed, 147 insertions, 102 deletions
diff --git a/src/analysis.c b/src/analysis.c index a5b480ad..e4714817 100644 --- a/src/analysis.c +++ b/src/analysis.c @@ -227,6 +227,8 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt AnalysisInfo *info; float hp_ener; float tonality2[240]; + float midE[8]; + float spec_variability=0; SAVE_STACK; tonal->last_transition++; @@ -373,12 +375,24 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt frame_loudness += (float)sqrt(E+1e-10f); logE[b] = (float)log(E+1e-10f); - tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f); - tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f); - if (tonal->highE[b] < tonal->lowE[b]+1.f) + tonal->logE[tonal->E_count][b] = logE[b]; + if (tonal->count==0) + tonal->highE[b] = tonal->lowE[b] = logE[b]; + if (tonal->highE[b] > tonal->lowE[b] + 7.5) { - tonal->highE[b]+=.5f; - tonal->lowE[b]-=.5f; + if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b]) + tonal->highE[b] -= .01; + else + tonal->lowE[b] += .01; + } + if (logE[b] > tonal->highE[b]) + { + tonal->highE[b] = logE[b]; + tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]); + } else if (logE[b] < tonal->lowE[b]) + { + tonal->lowE[b] = logE[b]; + tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]); } relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]); @@ -412,6 +426,26 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt tonal->prev_band_tonality[b] = band_tonality[b]; } + for (i=0;i<NB_FRAMES;i++) + { + int j; + float mindist = 1e15; + for (j=0;j<NB_FRAMES;j++) + { + int k; + float dist=0; + for (k=0;k<NB_TBANDS;k++) + { + float tmp; + tmp = tonal->logE[i][k] - tonal->logE[j][k]; + dist += tmp*tmp; + } + if (j!=i) + mindist = MIN32(mindist, dist); + } + spec_variability += mindist; + } + spec_variability = sqrt(spec_variability/NB_FRAMES/NB_TBANDS); bandwidth_mask = 0; bandwidth = 0; maxE = 0; @@ -462,7 +496,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt if (tonal->count<=2) bandwidth = 20; frame_loudness = 20*(float)log10(frame_loudness); - tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness); + tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness); tonal->lowECount *= (1-alphaE); if (frame_loudness < tonal->Etracker-30) tonal->lowECount += alphaE; @@ -474,6 +508,13 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt sum += dct_table[i*16+b]*logE[b]; BFCC[i] = sum; } + for (i=0;i<8;i++) + { + float sum=0; + for (b=0;b<16;b++) + sum += dct_table[i*16+b]*.5*(tonal->highE[b]+tonal->lowE[b]); + midE[i] = sum; + } frame_stationarity /= NB_TBANDS; relativeE /= NB_TBANDS; @@ -512,6 +553,8 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt for (i=0;i<9;i++) tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i]; } + for (i=0;i<4;i++) + features[i] = BFCC[i]-midE[i]; for (i=0;i<8;i++) { @@ -522,6 +565,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt } for (i=0;i<9;i++) features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i]; + features[18] = spec_variability-.78;; features[20] = info->tonality - 0.154723; features[21] = info->activity - 0.724643; features[22] = frame_stationarity - 0.743717; @@ -563,11 +607,11 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt float p, q; /* One transition every 3 minutes of active audio */ - tau = .00005f*frame_probs[1]; + tau = .0001f*frame_probs[1]; /* Adapt beta based on how "unexpected" the new prob is */ p = MAX16(.05f,MIN16(.95f,frame_probs[0])); q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); - beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); + beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); /* p0 and p1 are the probabilities of speech and music at this frame using only information from previous frame and applying the state transition model */ diff --git a/src/analysis.h b/src/analysis.h index 5ed791b1..971c8e0c 100644 --- a/src/analysis.h +++ b/src/analysis.h @@ -49,6 +49,7 @@ typedef struct { float prev_band_tonality[NB_TBANDS]; float prev_tonality; float E[NB_FRAMES][NB_TBANDS]; + float logE[NB_FRAMES][NB_TBANDS]; float lowE[NB_TBANDS]; float highE[NB_TBANDS]; float meanE[NB_TOT_BANDS+1]; diff --git a/src/mlp_data.c b/src/mlp_data.c index c217f9ec..b63d583c 100644 --- a/src/mlp_data.c +++ b/src/mlp_data.c @@ -4,104 +4,104 @@ #include "mlp.h" -/* RMS error was 0.218632, seed was 1479354367 */ -/* 0.009567 0.056814 (0.218632 0.218632) 3.0118e-07 7479 */ +/* RMS error was 0.315104, seed was 1479763182 */ +/* 0.006838 0.032708 (0.315104 0.315104) 7.21128e-09 8044 */ static const float weights[450] = { /* hidden layer */ -1.85706f, -0.127792f, 0.486934f, -0.252073f, 0.218278f, -0.182399f, -0.249612f, 0.312895f, -0.118228f, -0.123521f, -0.45384f, -0.302397f, -0.216463f, 0.0613465f, 0.193533f, -0.066651f, 0.17067f, -0.0278193f, 0.0602761f, -0.087421f, --0.0604716f, 1.27567f, -2.28812f, -0.262244f, -0.186225f, -7.69143f, 1.88698f, -0.0178079f, 0.11667f, -0.0820569f, -0.102391f, 0.0553723f, 0.00548539f, -0.0361446f, 0.0638111f, -0.116011f, 0.0785724f, 0.0892341f, 0.169258f, -0.0953813f, -0.0189002f, -0.14414f, -0.0855396f, -0.183561f, -0.261543f, --0.0122127f, 0.060642f, 0.139359f, -5.77982f, 0.218735f, -0.328613f, 1.12266f, -0.26173f, 0.0141489f, -0.0579786f, --0.00821196f, -0.0340188f, -0.00382787f, 0.00463267f, -0.0216233f, --0.0316711f, 0.0182308f, -0.044488f, -0.0283259f, 0.00538179f, -0.0673935f, -0.0699712f, 0.0938827f, 0.0688342f, 0.0472305f, -0.0358129f, -0.0300705f, -0.127208f, -1.46059f, -0.664145f, -0.584721f, -1.59867f, 0.428714f, -0.109957f, 0.0160402f, --0.165646f, -0.0639903f, -0.0223546f, -0.168096f, 0.192809f, --0.0945295f, 0.0966078f, -0.20344f, 0.21911f, -0.239588f, -0.16812f, -0.624156f, -0.327362f, -0.492687f, -0.146139f, -0.0489192f, 0.0918105f, -0.259506f, -0.255826f, -1.02741f, -1.66675f, -1.96975f, 0.5926f, -3.2493f, -0.385736f, -0.035917f, -0.0455865f, -0.00635188f, -0.0262669f, 0.0311736f, -0.0376469f, 0.106273f, 0.0272225f, 0.000841985f, -0.000870045f, -0.0936811f, -0.0535512f, -0.00199683f, -0.175249f, -0.188772f, -0.116839f, 0.114066f, 0.241866f, 0.318693f, -0.00831369f, --0.459869f, -1.67089f, -0.0348707f, 4.08708f, 0.185344f, -1.65596f, -0.0856278f, -0.00970444f, -0.0751112f, -0.0465534f, --0.083325f, -0.0274241f, -0.106578f, -0.062097f, -0.132031f, --0.0280223f, -0.162105f, 0.0568372f, 0.0238911f, 0.130229f, -0.193513f, -0.00905597f, -0.114584f, -0.290716f, -0.270654f, -0.0116111f, -0.471526f, 0.873425f, 0.256522f, 0.236046f, --0.337619f, -0.43148f, -0.00185362f, 0.0879064f, 0.213956f, -0.109458f, 0.0266622f, -0.0153787f, 0.121687f, -0.0428293f, --0.0179952f, 0.0256446f, 0.244823f, -0.0183688f, 0.0673453f, --0.261959f, -0.0195782f, -0.141111f, 0.0401895f, 0.425363f, -0.201468f, -0.167032f, -0.135661f, 9.12063f, -0.175319f, -4.47634f, 0.303674f, -4.15526f, -0.0474631f, 0.00855777f, -0.0944216f, 0.0691601f, -0.0445577f, -0.091551f, -0.0982992f, --0.131612f, -0.0969343f, -0.124359f, -0.0123872f, 0.0174908f, -0.465926f, -0.083936f, 0.824247f, 0.0917478f, -0.0434931f, --0.125792f, -0.0484324f, 0.0246675f, -0.516806f, 3.3132f, -3.73923f, 2.62617f, -93.2532f, -0.605171f, 0.0422526f, -0.00270449f, 0.0662168f, 0.0259782f, 0.0140055f, 0.0174977f, --0.0478789f, -0.0162726f, 0.0174899f, -0.0384183f, 0.0458608f, -0.0954459f, 0.242452f, 0.21106f, 0.280241f, -0.088756f, --0.188288f, -0.572681f, -0.561314f, -0.0891808f, 0.401866f, --0.63524f, 0.902111f, 0.68859f, 0.856107f, -1.42781f, -0.0738099f, 0.0721273f, 0.102551f, 0.0874915f, -0.0598193f, --0.172467f, -0.0121237f, -0.0276566f, -0.146163f, -0.264742f, -0.0154283f, -0.0601237f, 0.129874f, 0.0970165f, 0.150932f, -0.0938906f, -0.149264f, 0.014297f, -0.177554f, -0.126669f, -0.717065f, -1.862f, 2.25998f, -1.75112f, -1.15153f, --0.171174f, 0.0613752f, -0.0453666f, 0.0906195f, 0.0235879f, -0.0266649f, -0.00258569f, 0.0013994f, -0.00458529f, 0.0285861f, -0.070281f, 0.0474546f, -0.0312625f, -0.044836f, -0.0677274f, --0.0571132f, 0.106835f, 0.145102f, 0.063436f, 0.174177f, -0.0400255f, -0.960015f, 0.666379f, -1.81109f, -1.12481f, --3.22379f, -0.617688f, -0.0196257f, 0.312598f, 0.151316f, -0.105433f, 0.00285736f, 0.244391f, 0.188321f, 0.146927f, --0.0991143f, 0.331209f, 0.174657f, 0.0566923f, 0.218108f, -0.278288f, 0.169498f, 0.088236f, -0.318505f, 0.0934496f, --0.130456f, 0.033616f, -0.137256f, 5.0233f, 0.807433f, --1.80039f, -1.20691f, 0.954173f, -0.0258414f, 0.0275836f, -0.0420577f, 0.0260727f, -0.0386572f, -0.0126257f, -0.0199216f, --0.00275359f, -0.0279507f, 0.0336754f, 0.0200924f, -0.0275905f, --0.134484f, -0.0178773f, -0.048596f, 0.140668f, -0.0240282f, -0.133156f, -0.0154105f, -0.0307081f, -1.1112f, 1.25178f, --0.400513f, -4.99124f, 0.133491f, 1.39644f, -0.208202f, --0.115869f, -0.184513f, -0.065846f, -0.0823625f, -0.0714934f, --0.0366108f, -0.0259024f, -0.199396f, -0.0876733f, -0.0860402f, -0.111153f, 0.0771431f, 0.230413f, 0.478924f, 0.0588229f, --0.0106344f, -0.0371012f, 0.0713401f, 0.194239f, 9.53558f, -1.23366f, 0.155603f, 4.49677f, 0.743929f, -1.01867f, -0.045674f, 0.00230101f, -0.0111697f, -0.00849812f, 0.0344646f, --0.0262655f, 0.0344082f, 0.00702128f, 0.0498685f, 0.017916f, --0.0422025f, -0.029116f, -0.186595f, -0.0139807f, -0.352806f, --0.070128f, 0.079064f, -0.00257802f, -0.325687f, 0.087853f, -0.536298f, -1.39807f, -0.76403f, -0.378262f, -0.0528369f, --0.0874338f, -0.00700915f, 0.103583f, 0.0333754f, 0.0188563f, --0.00467275f, 0.0450362f, 0.0263766f, -0.0648204f, -0.0490503f, -0.0580142f, -0.0242903f, -0.0187162f, -0.0705934f, 0.274156f, -0.150911f, 0.0853199f, -0.00532834f, -0.126533f, -0.0996278f, -0.0570326f, 1.70936f, 3.01576f, -1.36414f, -4.93744f, --2.24259f, +1.16791f, 0.0117457f, -0.173725f, 0.00088526f, -0.182404f, +-0.0160565f, -0.0683622f, -0.101919f, -0.0460863f, 0.00236859f, +0.0214204f, -0.0522124f, -0.00439659f, 0.126548f, -0.0747379f, +0.00759737f, -0.0465382f, 0.0282859f, -0.00823783f, 0.49046f, +-0.0038811f, 4.67584f, -1.75408f, 0.189231f, 6.88353f, +-0.169735f, -0.302665f, 0.0344662f, 0.0376251f, 0.0824736f, +-0.00808218f, -0.00840575f, -0.0134915f, -0.0852928f, -0.0557674f, +-0.0524697f, -0.0267727f, -0.0711738f, 0.168746f, 0.345237f, +0.114282f, 0.0130365f, 0.0224121f, -0.124843f, -0.119323f, +-0.147682f, -0.149418f, 1.44711f, -0.0385809f, 1.64565f, +1.702f, 4.94977f, 1.1675f, -0.00709793f, -0.028046f, +0.0130692f, 0.0110228f, 0.00442845f, 0.0270771f, -0.012915f, +-0.0205496f, 0.00731574f, 0.0290182f, -0.0137988f, -0.0496688f, +0.0497328f, -0.0500723f, 0.0693743f, 0.125171f, 0.002266f, +0.0820088f, 0.0801653f, -0.0815278f, -0.312179f, 0.499351f, +0.0146673f, -0.0729864f, -3.07368f, -1.12587f, 0.0807415f, +0.0317455f, 0.0629169f, 0.0489931f, -0.0143552f, -0.0121456f, +-0.000922163f, -0.0195092f, -0.0354053f, -0.0316398f, -0.0409961f, +-0.0762715f, -0.119086f, -0.0515177f, -0.286433f, -0.0256642f, +0.00490787f, 0.089922f, 0.272454f, -0.00799747f, -3.92076f, +-0.923539f, -0.344524f, -1.1584f, -0.232077f, -2.54335f, +-0.0289305f, 0.180725f, -0.0541124f, 0.113015f, 0.0614053f, +-0.194218f, 0.126639f, -0.0941479f, -0.0399991f, 0.0308558f, +-0.097045f, -0.00974284f, -0.234078f, -0.117714f, -0.742824f, +-0.0562217f, 0.225729f, -0.0762788f, -1.06489f, 0.124149f, +0.361664f, -0.174717f, -0.413253f, 1.55378f, 0.635495f, +0.831817f, 0.00214014f, 0.0301272f, 0.0165128f, -0.00468816f, +-0.00518814f, -0.00861749f, -0.00566215f, -0.00880481f, -0.00152252f, +-0.0150631f, -0.0182496f, -0.0161498f, -0.0868953f, 0.0632226f, +-0.0385318f, 0.0151289f, 0.0268427f, -0.00495099f, 0.16565f, +0.0273735f, 0.716388f, 1.38317f, -0.0482318f, -0.817091f, +-0.823276f, 0.126554f, 0.008395f, -0.0788533f, -0.0305483f, +-0.0141837f, 0.0287401f, 0.0220461f, 0.0798372f, -0.0162728f, +-0.034366f, 0.0578138f, -0.027475f, -0.369605f, -0.690334f, +-0.287187f, -0.174793f, 0.719982f, 0.792722f, 1.08376f, +3.42282f, -0.0205459f, -1.11499f, -0.23692f, 5.31937f, +0.338354f, 2.88862f, -0.0488146f, -0.0693933f, -0.0525298f, +-0.0691915f, -0.0748021f, -0.0479683f, 0.0557816f, -0.0234204f, +0.0711225f, -0.0284554f, 0.0748894f, 0.0312238f, 0.0430777f, +-0.149758f, -0.0643999f, -0.328943f, 0.0220431f, -0.00670375f, +-0.150891f, 0.0826483f, -0.0416984f, 0.91942f, 0.288807f, +-0.784006f, -0.316274f, -0.914043f, 0.847168f, -0.0511541f, +0.0591144f, 0.0444577f, -0.0523042f, 0.0435139f, -0.0368748f, +-0.0238474f, 0.0578224f, 0.033423f, -0.00959278f, 0.000368111f, +-0.033063f, 0.0498318f, -0.193449f, 0.342364f, 0.0248039f, +0.106036f, -0.105387f, 0.660923f, -0.0940084f, 0.649895f, +-0.879327f, 0.567143f, 1.64079f, 4.26012f, -0.891701f, +0.0566126f, 0.0565989f, 0.126332f, 0.0622828f, 0.00303243f, +-0.0209919f, -0.0316721f, -0.00332618f, -0.0104709f, -0.0439127f, +-0.052425f, -0.0328074f, -0.000365187f, -0.151337f, -0.0136578f, +0.00736587f, -0.0245835f, 0.104102f, 0.246421f, -0.0320614f, +-2.13688f, -0.644779f, -0.587536f, -0.0876224f, -0.845826f, +-0.675112f, 0.00497933f, -0.0138469f, 0.0478949f, -0.0193546f, +0.0743611f, 0.0919097f, -0.021396f, -0.00987072f, 0.0906004f, +0.15022f, -0.0719682f, -0.123023f, -0.032162f, -0.226081f, +0.0112323f, 0.143681f, 0.0963748f, 0.388256f, 0.499536f, +-0.0717579f, 0.245617f, -0.541487f, 0.495578f, 2.30802f, +0.995929f, -0.829615f, -0.00364842f, 0.0295008f, -0.00799017f, +0.0072746f, -0.063231f, -0.129824f, -0.020175f, -0.0221331f, +-0.121943f, -0.138491f, -0.0277175f, 0.0259129f, -0.113353f, +0.0901211f, -0.0515522f, 0.00156965f, 0.0223951f, -0.0542028f, +-0.198752f, -0.00318196f, -0.711666f, 1.09381f, 0.000768032f, +-2.31925f, -0.019048f, -1.17297f, -0.0540454f, -0.050889f, +-0.00426479f, 0.185899f, 0.0558439f, 0.123861f, -0.00353509f, +-0.178127f, 0.0683067f, -0.00891811f, 0.0577757f, 0.12994f, +0.298556f, -0.0244329f, -0.455871f, 0.106323f, -0.268203f, +0.034243f, -0.42948f, -0.228146f, -0.421731f, -1.2974f, +2.90981f, 1.83682f, 1.22827f, 0.646216f, 0.00618927f, +-0.163023f, -0.0588448f, -0.0680873f, 0.00265286f, 0.15079f, +0.0180199f, 0.0521558f, -0.0251215f, 0.211758f, 0.000908394f, +-0.0665474f, -0.179624f, 0.0223256f, -0.258492f, 0.0329357f, +0.0776958f, -0.0431089f, 0.627821f, 0.0300259f, -1.08477f, +1.59281f, 0.523867f, -3.2745f, -1.30564f, -0.228395f, +-0.0872362f, 0.0218742f, -0.0650792f, -0.0307158f, 0.0112539f, +-0.0289815f, 0.00459511f, 0.00851279f, -0.0055035f, 0.032939f, +0.00853459f, -0.0193472f, 0.0590096f, 0.323122f, 0.0439625f, +0.0102308f, 0.103701f, -0.389139f, 0.629254f, 0.0838598f, +1.43368f, -0.658163f, 0.0295287f, -0.46143f, 3.01521f, +-0.749628f, -0.0302256f, -0.106793f, 0.0680327f, -0.0972569f, +0.0024169f, 0.0499889f, -0.0418845f, 0.0831352f, -0.0491967f, +0.0701002f, -0.0329629f, -0.0539425f, 0.151929f, 0.187274f, +0.29397f, 0.132495f, -0.0677414f, -0.0609771f, -0.475259f, +0.0238476f, -1.7368f, -0.280829f, -0.0644994f, 0.342657f, +2.05706f, /* output layer */ -0.281193, -1.0361, -0.554712, -3.32969, 0.497616, -1.1356, 0.911487, 0.591915, 0.785052, 4.22215, --1.53397, -1.78146, -0.726456, 4.06597, 0.914172, -2.84081, 1.2306, 5.29403, -0.126691, -1.22636, --0.71336, 0.295882, -1.30839, -3.25103, -0.680867, -0.0129163, 0.0272871, 1.15775, 0.932324, 1.22855, --0.210346, 1.74899, -0.945376, 0.626078, }; +1.33932, 1.70279, -1.27695, -4.52246, 0.0740156, +1.71598, 8.63902, -1.26394, 1.60628, -1.2561, +1.62678, -1.8133, -3.03604, 1.71492, 0.531547, +-1.01656, 1.71594, 6.91163, -0.566851, 0.438947, +0.0745278, -7.43604, -0.0317548, 0.585059, -0.646684, +-0.632673, -5.15384, 8.68134, -0.0264247, 1.16378, +-0.232851, 1.29058, 2.1686, -0.757127, }; static const int topo[3] = {25, 16, 2}; diff --git a/src/mlp_train.c b/src/mlp_train.c index 0bf22206..cc76a009 100644 --- a/src/mlp_train.c +++ b/src/mlp_train.c @@ -159,7 +159,7 @@ double compute_gradient(MLPTrain *net, float *inputs, float *outputs, int nbSamp error[i] = out[i] - netOut[i]; if (out[i] == 0) error[i] *= .0; error_rate[i] += fabs(error[i])>1; - if (i==0) error[i] *= 3; + if (i==0) error[i] *= 5; //if (i==1 && out[i] < 0) error[i] *= 2.25; rms += error[i]*error[i]; /*error[i] = error[i]/(1+fabs(error[i]));*/ |