avfilter/af_speechnorm: implement rms option

author: Paul B Mahol <onemda@gmail.com> 2022-11-06 16:03:27 +0300
committer: Paul B Mahol <onemda@gmail.com> 2022-11-07 10:32:48 +0300
commit: 7027101904a483e1c6a6db89b47b3362c37155a0 (patch)
tree: 5ab15a46a604629cf2417fb638cc773d1b2fb357
parent: b48d2320f118f8a687a7e65f57adecbe3479e7a0 (diff)
2 files changed, 31 insertions, 6 deletions
diff --git a/doc/filters.texi b/doc/filters.texi
index b50582eb2a..18b14fc376 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -6338,6 +6338,10 @@ option. When enabled any half-cycle of samples with their local peak value below
 Link channels when calculating gain applied to each filtered channel sample, by default is disabled.
 When disabled each filtered channel gain calculation is independent, otherwise when this option
 is enabled the minimum of all possible gains for each filtered channel is used.
+
+@item rms, m
+Set the expansion target RMS value. This specifies the highest allowed RMS level for the normalized
+audio input. Default value is 0.0, which disables RMS limiting. Allowed range is from 0.0 to 1.0.
 @end table
 
 @subsection Commands
diff --git a/libavfilter/af_speechnorm.c b/libavfilter/af_speechnorm.c
index c9bd8d5cac..fd6b7d9a32 100644
--- a/libavfilter/af_speechnorm.c
+++ b/libavfilter/af_speechnorm.c
@@ -46,6 +46,7 @@ typedef struct PeriodItem {
     int size;
     int type;
     double max_peak;
+    double rms_sum;
 } PeriodItem;
 
 typedef struct ChannelContext {
@@ -54,6 +55,7 @@ typedef struct ChannelContext {
     PeriodItem pi[MAX_ITEMS];
     double gain_state;
     double pi_max_peak;
+    double pi_rms_sum;
     int pi_start;
     int pi_end;
     int pi_size;
@@ -62,6 +64,7 @@ typedef struct ChannelContext {
 typedef struct SpeechNormalizerContext {
     const AVClass *class;
 
+    double rms_value;
     double peak_value;
     double max_expansion;
     double max_compression;
@@ -110,6 +113,8 @@ static const AVOption speechnorm_options[] = {
     { "i",      "set inverted filtering", OFFSET(invert), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
     { "link", "set linked channels filtering", OFFSET(link), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
     { "l",    "set linked channels filtering", OFFSET(link), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
+    { "rms", "set the RMS value", OFFSET(rms_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.0}, 0.0, 1.0, FLAGS },
+    { "m",   "set the RMS value", OFFSET(rms_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.0}, 0.0, 1.0, FLAGS },
     { NULL }
 };
 
@@ -161,12 +166,16 @@ static void consume_pi(ChannelContext *cc, int nb_samples)
     }
 }
 
-static double next_gain(AVFilterContext *ctx, double pi_max_peak, int bypass, double state)
+static double next_gain(AVFilterContext *ctx, double pi_max_peak, int bypass, double state,
+                        double pi_rms_sum, int pi_size)
 {
     SpeechNormalizerContext *s = ctx->priv;
-    const double expansion = FFMIN(s->max_expansion, s->peak_value / pi_max_peak);
     const double compression = 1. / s->max_compression;
     const int type = s->invert ? pi_max_peak <= s->threshold_value : pi_max_peak >= s->threshold_value;
+    double expansion = FFMIN(s->max_expansion, s->peak_value / pi_max_peak);
+
+    if (s->rms_value > DBL_EPSILON)
+        expansion = FFMIN(expansion, s->rms_value / sqrt(pi_rms_sum / pi_size));
 
     if (bypass) {
         return 1.;
@@ -187,13 +196,15 @@ static void next_pi(AVFilterContext *ctx, ChannelContext *cc, int bypass)
         av_assert1(cc->pi[start].size > 0);
         av_assert0(cc->pi[start].type > 0 || s->eof);
         cc->pi_size = cc->pi[start].size;
+        cc->pi_rms_sum = cc->pi[start].rms_sum;
         cc->pi_max_peak = cc->pi[start].max_peak;
         av_assert1(cc->pi_start != cc->pi_end || s->eof);
         start++;
         if (start >= MAX_ITEMS)
             start = 0;
         cc->pi_start = start;
-        cc->gain_state = next_gain(ctx, cc->pi_max_peak, bypass, cc->gain_state);
+        cc->gain_state = next_gain(ctx, cc->pi_max_peak, bypass, cc->gain_state,
+                                   cc->pi_rms_sum, cc->pi_size);
     }
 }
 
@@ -209,7 +220,8 @@ static double min_gain(AVFilterContext *ctx, ChannelContext *cc, int max_size)
     while (size <= max_size) {
         if (idx == cc->pi_end)
             break;
-        gain_state = next_gain(ctx, cc->pi[idx].max_peak, 0, gain_state);
+        gain_state = next_gain(ctx, cc->pi[idx].max_peak, 0, gain_state,
+                               cc->pi[idx].rms_sum, cc->pi[idx].size);
         min_gain = FFMIN(min_gain, gain_state);
         size += cc->pi[idx].size;
         idx++;
@@ -236,11 +248,13 @@ static void analyze_channel_## name (AVFilterContext *ctx, ChannelContext *cc,
                                                                                 \
     while (n < nb_samples) {                                                    \
         ptype new_max_peak;                                                     \
+        ptype new_rms_sum;                                                      \
         int new_size;                                                           \
                                                                                 \
         if ((cc->state != (src[n] >= zero)) ||                                  \
             (pi[pi_end].size > max_period)) {                                   \
             ptype max_peak = pi[pi_end].max_peak;                               \
+            ptype rms_sum = pi[pi_end].rms_sum;                                 \
             int state = cc->state;                                              \
                                                                                 \
             cc->state = src[n] >= zero;                                         \
@@ -251,10 +265,13 @@ static void analyze_channel_## name (AVFilterContext *ctx, ChannelContext *cc,
                 pi_end++;                                                       \
                 if (pi_end >= MAX_ITEMS)                                        \
                     pi_end = 0;                                                 \
-                if (cc->state != state)                                         \
+                if (cc->state != state) {                                       \
                     pi[pi_end].max_peak = DBL_MIN;                              \
-                else                                                            \
+                    pi[pi_end].rms_sum = 0.0;                                   \
+                } else {                                                        \
                     pi[pi_end].max_peak = max_peak;                             \
+                    pi[pi_end].rms_sum = rms_sum;                               \
+                }                                                               \
                 pi[pi_end].type = 0;                                            \
                 pi[pi_end].size = 0;                                            \
                 av_assert1(pi_end != cc->pi_start);                             \
@@ -262,10 +279,12 @@ static void analyze_channel_## name (AVFilterContext *ctx, ChannelContext *cc,
         }                                                                       \
                                                                                 \
         new_max_peak = pi[pi_end].max_peak;                                     \
+        new_rms_sum = pi[pi_end].rms_sum;                                       \
         new_size = pi[pi_end].size;                                             \
         if (cc->state) {                                                        \
             while (src[n] >= zero) {                                            \
                 new_max_peak = FFMAX(new_max_peak,  src[n]);                    \
+                new_rms_sum += src[n] * src[n];                                 \
                 new_size++;                                                     \
                 n++;                                                            \
                 if (n >= nb_samples)                                            \
@@ -274,6 +293,7 @@ static void analyze_channel_## name (AVFilterContext *ctx, ChannelContext *cc,
         } else {                                                                \
             while (src[n] < zero) {                                             \
                 new_max_peak = FFMAX(new_max_peak, -src[n]);                    \
+                new_rms_sum += src[n] * src[n];                                 \
                 new_size++;                                                     \
                 n++;                                                            \
                 if (n >= nb_samples)                                            \
@@ -282,6 +302,7 @@ static void analyze_channel_## name (AVFilterContext *ctx, ChannelContext *cc,
         }                                                                       \
                                                                                 \
         pi[pi_end].max_peak = new_max_peak;                                     \
+        pi[pi_end].rms_sum = new_rms_sum;                                       \
         pi[pi_end].size = new_size;                                             \
     }                                                                           \
     cc->pi_end = pi_end;                                                        \
author	Paul B Mahol <onemda@gmail.com>	2022-11-06 16:03:27 +0300
committer	Paul B Mahol <onemda@gmail.com>	2022-11-07 10:32:48 +0300
commit	7027101904a483e1c6a6db89b47b3362c37155a0 (patch)
tree	5ab15a46a604629cf2417fb638cc773d1b2fb357
parent	b48d2320f118f8a687a7e65f57adecbe3479e7a0 (diff)