From 42f43db7e470dec8c7a40c86180d6a07241c3577 Mon Sep 17 00:00:00 2001 From: Gustaf Ullberg Date: Tue, 10 Apr 2018 13:37:49 +0200 Subject: Silk makes use of Opus VAD Signed-off-by: Jean-Marc Valin --- silk/API.h | 3 ++- silk/define.h | 5 +++++ silk/enc_API.c | 7 ++++--- silk/fixed/encode_frame_FIX.c | 11 +++++++++-- silk/fixed/main_FIX.h | 3 ++- silk/float/encode_frame_FLP.c | 11 +++++++++-- silk/float/main_FLP.h | 3 ++- 7 files changed, 33 insertions(+), 10 deletions(-) (limited to 'silk') diff --git a/silk/API.h b/silk/API.h index 0131acbb..4d90ff9a 100644 --- a/silk/API.h +++ b/silk/API.h @@ -80,7 +80,8 @@ opus_int silk_Encode( /* O Returns error co opus_int nSamplesIn, /* I Number of samples in input vector */ ec_enc *psRangeEnc, /* I/O Compressor data structure */ opus_int32 *nBytesOut, /* I/O Number of bytes in payload (input: Max bytes) */ - const opus_int prefillFlag /* I Flag to indicate prefilling buffers no coding */ + const opus_int prefillFlag, /* I Flag to indicate prefilling buffers no coding */ + int activity /* I Decision of Opus voice activity detector */ ); /****************************************/ diff --git a/silk/define.h b/silk/define.h index 1286048e..22fd720b 100644 --- a/silk/define.h +++ b/silk/define.h @@ -58,6 +58,11 @@ extern "C" #define MAX_CONSECUTIVE_DTX 20 /* eq 400 ms */ #define DTX_ACTIVITY_THRESHOLD 0.1f +/* VAD decision */ +#define VAD_NO_DECISION -1 +#define VAD_NO_ACTIVITY 0 +#define VAD_ACTIVITY 1 + /* Maximum sampling frequency */ #define MAX_FS_KHZ 16 #define MAX_API_FS_KHZ 48 diff --git a/silk/enc_API.c b/silk/enc_API.c index 10adf2c3..7ae31a9e 100644 --- a/silk/enc_API.c +++ b/silk/enc_API.c @@ -144,7 +144,8 @@ opus_int silk_Encode( /* O Returns error co opus_int nSamplesIn, /* I Number of samples in input vector */ ec_enc *psRangeEnc, /* I/O Compressor data structure */ opus_int32 *nBytesOut, /* I/O Number of bytes in payload (input: Max bytes) */ - const opus_int prefillFlag /* I Flag to indicate prefilling buffers no coding */ + const opus_int prefillFlag, /* I Flag to indicate prefilling buffers no coding */ + opus_int activity /* I Decision of Opus voice activity detector */ ) { opus_int n, i, nBits, flags, tmp_payloadSize_ms = 0, tmp_complexity = 0, ret = 0; @@ -425,7 +426,7 @@ opus_int silk_Encode( /* O Returns error co psEnc->state_Fxx[ 1 ].sCmn.sNSQ.prev_gain_Q16 = 65536; psEnc->state_Fxx[ 1 ].sCmn.first_frame_after_reset = 1; } - silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 1 ] ); + silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 1 ], activity ); } else { psEnc->state_Fxx[ 1 ].sCmn.VAD_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ] = 0; } @@ -440,7 +441,7 @@ opus_int silk_Encode( /* O Returns error co silk_memcpy( psEnc->state_Fxx[ 0 ].sCmn.inputBuf, psEnc->sStereo.sMid, 2 * sizeof( opus_int16 ) ); silk_memcpy( psEnc->sStereo.sMid, &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.frame_length ], 2 * sizeof( opus_int16 ) ); } - silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 0 ] ); + silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 0 ], activity ); /* Encode */ for( n = 0; n < encControl->nChannelsInternal; n++ ) { diff --git a/silk/fixed/encode_frame_FIX.c b/silk/fixed/encode_frame_FIX.c index d49a0fe3..a02bf87d 100644 --- a/silk/fixed/encode_frame_FIX.c +++ b/silk/fixed/encode_frame_FIX.c @@ -43,18 +43,25 @@ static OPUS_INLINE void silk_LBRR_encode_FIX( ); void silk_encode_do_VAD_FIX( - silk_encoder_state_FIX *psEnc /* I/O Pointer to Silk FIX encoder state */ + silk_encoder_state_FIX *psEnc, /* I/O Pointer to Silk FIX encoder state */ + opus_int activity /* I Decision of Opus voice activity detector */ ) { + const opus_int activity_threshold = SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 ); + /****************************/ /* Voice Activity Detection */ /****************************/ silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.arch ); + /* If Opus VAD is inactive and Silk VAD is active: lower Silk VAD to just under the threshold */ + if( activity == VAD_NO_ACTIVITY && psEnc->sCmn.speech_activity_Q8 >= activity_threshold ) { + psEnc->sCmn.speech_activity_Q8 = activity_threshold - 1; + } /**************************************************/ /* Convert speech activity into VAD and DTX flags */ /**************************************************/ - if( psEnc->sCmn.speech_activity_Q8 < SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 ) ) { + if( psEnc->sCmn.speech_activity_Q8 < activity_threshold ) { psEnc->sCmn.indices.signalType = TYPE_NO_VOICE_ACTIVITY; psEnc->sCmn.noSpeechCounter++; if( psEnc->sCmn.noSpeechCounter <= NB_SPEECH_FRAMES_BEFORE_DTX ) { diff --git a/silk/fixed/main_FIX.h b/silk/fixed/main_FIX.h index 780afa39..6d2112e5 100644 --- a/silk/fixed/main_FIX.h +++ b/silk/fixed/main_FIX.h @@ -66,7 +66,8 @@ void silk_HP_variable_cutoff( /* Encoder main function */ void silk_encode_do_VAD_FIX( - silk_encoder_state_FIX *psEnc /* I/O Pointer to Silk FIX encoder state */ + silk_encoder_state_FIX *psEnc, /* I/O Pointer to Silk FIX encoder state */ + opus_int activity /* I Decision of Opus voice activity detector */ ); /* Encoder main function */ diff --git a/silk/float/encode_frame_FLP.c b/silk/float/encode_frame_FLP.c index 1d4d8a77..b029c3f5 100644 --- a/silk/float/encode_frame_FLP.c +++ b/silk/float/encode_frame_FLP.c @@ -42,18 +42,25 @@ static OPUS_INLINE void silk_LBRR_encode_FLP( ); void silk_encode_do_VAD_FLP( - silk_encoder_state_FLP *psEnc /* I/O Encoder state FLP */ + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + opus_int activity /* I Decision of Opus voice activity detector */ ) { + const opus_int activity_threshold = SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 ); + /****************************/ /* Voice Activity Detection */ /****************************/ silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.arch ); + /* If Opus VAD is inactive and Silk VAD is active: lower Silk VAD to just under the threshold */ + if( activity == VAD_NO_ACTIVITY && psEnc->sCmn.speech_activity_Q8 >= activity_threshold ) { + psEnc->sCmn.speech_activity_Q8 = activity_threshold - 1; + } /**************************************************/ /* Convert speech activity into VAD and DTX flags */ /**************************************************/ - if( psEnc->sCmn.speech_activity_Q8 < SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 ) ) { + if( psEnc->sCmn.speech_activity_Q8 < activity_threshold ) { psEnc->sCmn.indices.signalType = TYPE_NO_VOICE_ACTIVITY; psEnc->sCmn.noSpeechCounter++; if( psEnc->sCmn.noSpeechCounter <= NB_SPEECH_FRAMES_BEFORE_DTX ) { diff --git a/silk/float/main_FLP.h b/silk/float/main_FLP.h index f47fc93b..5dc0ccf4 100644 --- a/silk/float/main_FLP.h +++ b/silk/float/main_FLP.h @@ -56,7 +56,8 @@ void silk_HP_variable_cutoff( /* Encoder main function */ void silk_encode_do_VAD_FLP( - silk_encoder_state_FLP *psEnc /* I/O Encoder state FLP */ + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + opus_int activity /* I Decision of Opus voice activity detector */ ); /* Encoder main function */ -- cgit v1.2.3