Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.xiph.org/xiph/opus.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
authorJean-Marc Valin <jmvalin@amazon.com>2023-07-30 22:31:59 +0300
committerJean-Marc Valin <jmvalin@amazon.com>2023-08-02 00:57:04 +0300
commit904f1cc7fc1d31f038fb8f4c30bd099ebda48e75 (patch)
treea45f75a73e880e312d0e4ebfda7485a7dcfc40ac
parent3816b0e001f34250e78927735d58d1a39768e108 (diff)
C implementation of FWGAN
-rwxr-xr-xautogen.sh2
-rw-r--r--dnn/fwgan.c196
-rw-r--r--dnn/fwgan.h30
-rw-r--r--dnn/nnet.c8
-rw-r--r--dnn/nnet.h2
-rw-r--r--lpcnet_headers.mk1
-rw-r--r--lpcnet_sources.mk1
7 files changed, 230 insertions, 10 deletions
diff --git a/autogen.sh b/autogen.sh
index f87f4122..6131b9ab 100755
--- a/autogen.sh
+++ b/autogen.sh
@@ -9,7 +9,7 @@ set -e
srcdir=`dirname $0`
test -n "$srcdir" && cd "$srcdir"
-dnn/download_model.sh ad05730
+dnn/download_model.sh 3816b0e
echo "Updating build configuration files, please wait...."
diff --git a/dnn/fwgan.c b/dnn/fwgan.c
index bce3670a..ea4f0e07 100644
--- a/dnn/fwgan.c
+++ b/dnn/fwgan.c
@@ -31,21 +31,207 @@
#include "fwgan.h"
#include "os_support.h"
#include "freq.h"
+#include "fwgan_data.h"
+#include "lpcnet.h"
+#include "pitch.h"
+#include "nnet.h"
-void fwgan_init(FWGANState *st, const float *pcm)
+#define NB_SUBFRAMES 4
+#define SUBFRAME_SIZE 40
+#define FWGAN_FRAME_SIZE (NB_SUBFRAMES*SUBFRAME_SIZE)
+#define CONT_PCM_INPUTS 320
+#define MAX_CONT_SIZE CONT_NET_0_OUT_SIZE
+#define FWGAN_GAMMA 0.92f
+#define FWGAN_DEEMPHASIS 0.85f
+
+#define FEAT_IN_SIZE (BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4 + FWGAN_FRAME_SIZE/2)
+
+#define FWGAN_FEATURES (NB_FEATURES-1)
+
+/* Fill pembed with per-sample pitch embeddings for one subframe:
+   pembed[0..SUBFRAME_SIZE) = sin(phase), pembed[SUBFRAME_SIZE..2*SUBFRAME_SIZE) = cos(phase),
+   advancing the running *phase by the pitch frequency w0 (radians/sample) each sample. */
+static void pitch_embeddings(float *pembed, double *phase, double w0) {
+ int i;
+ /* FIXME: This could be sped up by making phase a unit-norm complex value, rotating it
+ by exp(-i*w0) each sample, and renormalizing once in a while. */
+ for (i=0;i<SUBFRAME_SIZE;i++) {
+ *phase += w0;
+ pembed[i] = sin(*phase);
+ pembed[SUBFRAME_SIZE+i] = cos(*phase);
+ }
+}
+
+/* Upsample one frame's FWGAN_FEATURES-dimensional feature vector into the full
+   conditioning vector (BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE wide, consumed as
+   NB_SUBFRAMES slices by the caller) with a single tanh dense layer. */
+static void run_fwgan_upsampler(FWGANState *st, float *cond, const float *features)
{
- OPUS_CLEAR(st, 1);
+ FWGAN *model;
+ model = &st->model;
+ celt_assert(FWGAN_FEATURES == model->bfcc_with_corr_upsampler_fc.nb_inputs);
+ celt_assert(BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE == model->bfcc_with_corr_upsampler_fc.nb_outputs);
+ compute_generic_dense(&model->bfcc_with_corr_upsampler_fc, cond, features, ACTIVATION_TANH);
+}
+
+/* Prime the FWGAN synthesis state from CONT_PCM_INPUTS samples of past
+   ("continuation") PCM so synthesis can pick up seamlessly after existing audio.
+   NOTE(review): features0 is currently unused -- see the FIXME at the end. */
+void fwgan_cont(FWGANState *st, const float *pcm0, const float *features0)
+{
+ int i;
+ float norm2, norm_1;
+ float cont_inputs[CONT_PCM_INPUTS+1];
+ float tmp1[MAX_CONT_SIZE];
+ float tmp2[MAX_CONT_SIZE];
+ FWGAN *model;
+ model = &st->model;
+ /* Energy-normalize the continuation PCM and prepend its log magnitude as input 0
+    (the 1e-8f/1e-7f terms guard against log/divide on silent input). */
+ norm2 = celt_inner_prod(pcm0, pcm0, CONT_PCM_INPUTS, st->arch);
+ norm_1 = 1.f/sqrt(1e-8f + norm2);
+ for (i=0;i<CONT_PCM_INPUTS;i++) cont_inputs[i+1] = norm_1*pcm0[i];
+ cont_inputs[0] = log(sqrt(norm2) + 1e-7f);
+
+ /* Run the continuation MLP, ping-ponging between tmp1 and tmp2. The final
+    layer writes back into cont_inputs, which assumes
+    CONT_NET_10_OUT_SIZE <= CONT_PCM_INPUTS+1 -- TODO confirm. */
+ compute_generic_dense(&model->cont_net_0, tmp1, cont_inputs, ACTIVATION_TANH);
+ compute_generic_dense(&model->cont_net_2, tmp2, tmp1, ACTIVATION_TANH);
+ compute_generic_dense(&model->cont_net_4, tmp1, tmp2, ACTIVATION_TANH);
+ compute_generic_dense(&model->cont_net_6, tmp2, tmp1, ACTIVATION_TANH);
+ compute_generic_dense(&model->cont_net_8, tmp1, tmp2, ACTIVATION_TANH);
+ celt_assert(CONT_NET_10_OUT_SIZE == model->cont_net_10.nb_outputs);
+ compute_generic_dense(&model->cont_net_10, cont_inputs, tmp1, ACTIVATION_TANH);
+
+ /* Derive the initial GRU state and each FWC conv stage's initial memory
+    from the continuation embedding. */
+ celt_assert(RNN_GRU_STATE_SIZE == model->rnn_cont_fc_0.nb_outputs);
+ compute_generic_dense(&model->rnn_cont_fc_0, st->rnn_state, cont_inputs, ACTIVATION_TANH);
+
+ celt_assert(FWC1_STATE_SIZE == model->fwc1_cont_fc_0.nb_outputs);
+ compute_generic_dense(&model->fwc1_cont_fc_0, st->fwc1_state, cont_inputs, ACTIVATION_TANH);
+ celt_assert(FWC2_STATE_SIZE == model->fwc2_cont_fc_0.nb_outputs);
+ compute_generic_dense(&model->fwc2_cont_fc_0, st->fwc2_state, cont_inputs, ACTIVATION_TANH);
+ celt_assert(FWC3_STATE_SIZE == model->fwc3_cont_fc_0.nb_outputs);
+ compute_generic_dense(&model->fwc3_cont_fc_0, st->fwc3_state, cont_inputs, ACTIVATION_TANH);
+ celt_assert(FWC4_STATE_SIZE == model->fwc4_cont_fc_0.nb_outputs);
+ compute_generic_dense(&model->fwc4_cont_fc_0, st->fwc4_state, cont_inputs, ACTIVATION_TANH);
+ celt_assert(FWC5_STATE_SIZE == model->fwc5_cont_fc_0.nb_outputs);
+ compute_generic_dense(&model->fwc5_cont_fc_0, st->fwc5_state, cont_inputs, ACTIVATION_TANH);
+ celt_assert(FWC6_STATE_SIZE == model->fwc6_cont_fc_0.nb_outputs);
+ compute_generic_dense(&model->fwc6_cont_fc_0, st->fwc6_state, cont_inputs, ACTIVATION_TANH);
+ celt_assert(FWC7_STATE_SIZE == model->fwc7_cont_fc_0.nb_outputs);
+ compute_generic_dense(&model->fwc7_cont_fc_0, st->fwc7_state, cont_inputs, ACTIVATION_TANH);
+
+ /* FIXME: Do we need to handle initial features? How? */
+}
+
+/* Scale one subframe by the energy gain derived from cepstral coefficient c0.
+   Note: the gain actually applied is *last_gain, i.e. the one computed on the
+   PREVIOUS call; the gain from the current c0 is only stored for the next
+   subframe, so gain takes effect with a one-subframe delay. */
+static void apply_gain(float *pcm, float c0, float *last_gain) {
+ int i;
+ float gain = pow(10.f, (0.5f*c0/sqrt(18.f)));
+ for (i=0;i<SUBFRAME_SIZE;i++) pcm[i] *= *last_gain;
+ *last_gain = gain;
+}
+
+/* In-place all-pole LPC synthesis filter over one subframe. Filters with the
+   coefficients from the previous call (last_lpc); mem holds the LPC_ORDER most
+   recent output samples (mem[0] newest). On return last_lpc is updated to lpc,
+   so new coefficients take effect one subframe late. */
+static void fwgan_lpc_syn(float *pcm, float *mem, const float *lpc, float last_lpc[LPC_ORDER]) {
+ int i;
+ for (i=0;i<SUBFRAME_SIZE;i++) {
+ int j;
+ for (j=0;j<LPC_ORDER;j++) pcm[i] -= mem[j]*last_lpc[j];
+ OPUS_MOVE(&mem[1], &mem[0], LPC_ORDER-1);
+ mem[0] = pcm[i];
+ }
+ OPUS_COPY(last_lpc, lpc, LPC_ORDER);
+}
+
+/* In-place first-order pre-emphasis over one subframe:
+   y[n] = x[n] - FWGAN_DEEMPHASIS*x[n-1]. *preemph_mem carries the last raw
+   input sample across calls. Inverse of fwgan_deemphasis(). */
+static void fwgan_preemphasis(float *pcm, float *preemph_mem) {
+ int i;
+ for (i=0;i<SUBFRAME_SIZE;i++) {
+ float tmp = pcm[i];
+ pcm[i] -= FWGAN_DEEMPHASIS * *preemph_mem;
+ *preemph_mem = tmp;
+ }
}
-static void run_fwgan(FWGANState *st, float *pcm, const float *input)
+/* In-place first-order de-emphasis over a whole frame:
+   y[n] = x[n] + FWGAN_DEEMPHASIS*y[n-1]. *deemph_mem carries the last output
+   sample across frames. Inverse of fwgan_preemphasis(). */
+static void fwgan_deemphasis(float *pcm, float *deemph_mem) {
+ int i;
+ for (i=0;i<FWGAN_FRAME_SIZE;i++) {
+ pcm[i] += FWGAN_DEEMPHASIS * *deemph_mem;
+ *deemph_mem = pcm[i];
+ }
+}
+
+/* Run the FWGAN network for one SUBFRAME_SIZE subframe: feature conv1d plus
+   gated non-linearity, GRU, then a chain of seven gated conv stages whose
+   final gate writes the subframe's samples directly into pcm. */
+static void run_fwgan_subframe(FWGANState *st, float *pcm, const float *cond, double w0)
{
+ float tmp1[FWC1_FC_0_OUT_SIZE];
+ float tmp2[IMAX(RNN_GRU_STATE_SIZE, FWC2_FC_0_OUT_SIZE)];
+ float feat_in[FEAT_IN_SIZE];
+ float rnn_in[FEAT_IN_CONV1_CONV_OUT_SIZE];
+ float pembed[FWGAN_FRAME_SIZE/2];
+ FWGAN *model;
+ model = &st->model;
+
+ pitch_embeddings(pembed, &st->embed_phase, w0);
+ /* Build feat_in: pitch embeddings in the first FWGAN_FRAME_SIZE/2 entries,
+    this subframe's slice of the upsampled conditioning after them. The two
+    copies tile feat_in exactly only when
+    BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4 == FWGAN_FRAME_SIZE/2 -- TODO confirm. */
+ OPUS_COPY(&feat_in[BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4], &cond[0], BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4);
+ OPUS_COPY(&feat_in[0], &pembed[0], FWGAN_FRAME_SIZE/2);
+
+ compute_generic_conv1d(&model->feat_in_conv1_conv, rnn_in, st->cont_conv1_mem, feat_in, FEAT_IN_CONV1_CONV_IN_SIZE, ACTIVATION_LINEAR);
+ celt_assert(FEAT_IN_NL1_GATE_OUT_SIZE == model->feat_in_nl1_gate.nb_outputs);
+ /* The in-place calls below (output == input) rely on
+    compute_gated_activation() being aliasing-safe. */
+ compute_gated_activation(&model->feat_in_nl1_gate, rnn_in, rnn_in, ACTIVATION_TANH);
+
+
+ compute_generic_gru(&model->rnn_gru_input, &model->rnn_gru_recurrent, st->rnn_state, rnn_in);
+ celt_assert(IMAX(RNN_GRU_STATE_SIZE, FWC2_FC_0_OUT_SIZE) >= model->rnn_nl_gate.nb_outputs);
+ compute_gated_activation(&model->rnn_nl_gate, tmp2, st->rnn_state, ACTIVATION_TANH);
+
+ /* FWC stack: each stage is a linear conv1d followed by a gated tanh,
+    alternating between tmp1 and tmp2 as scratch buffers. */
+ compute_generic_conv1d(&model->fwc1_fc_0, tmp1, st->fwc1_state, tmp2, RNN_GRU_STATE_SIZE, ACTIVATION_LINEAR);
+ compute_gated_activation(&model->fwc1_fc_1_gate, tmp1, tmp1, ACTIVATION_TANH);
+
+ compute_generic_conv1d(&model->fwc2_fc_0, tmp2, st->fwc2_state, tmp1, FWC1_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
+ compute_gated_activation(&model->fwc2_fc_1_gate, tmp2, tmp2, ACTIVATION_TANH);
+
+ compute_generic_conv1d(&model->fwc3_fc_0, tmp1, st->fwc3_state, tmp2, FWC2_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
+ compute_gated_activation(&model->fwc3_fc_1_gate, tmp1, tmp1, ACTIVATION_TANH);
+
+ compute_generic_conv1d(&model->fwc4_fc_0, tmp2, st->fwc4_state, tmp1, FWC3_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
+ compute_gated_activation(&model->fwc4_fc_1_gate, tmp2, tmp2, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->fwc5_fc_0, tmp1, st->fwc5_state, tmp2, FWC4_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
+ compute_gated_activation(&model->fwc5_fc_1_gate, tmp1, tmp1, ACTIVATION_TANH);
+
+ compute_generic_conv1d(&model->fwc6_fc_0, tmp2, st->fwc6_state, tmp1, FWC5_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
+ compute_gated_activation(&model->fwc6_fc_1_gate, tmp2, tmp2, ACTIVATION_TANH);
+
+ compute_generic_conv1d(&model->fwc7_fc_0, tmp1, st->fwc7_state, tmp2, FWC6_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
+ compute_gated_activation(&model->fwc7_fc_1_gate, pcm, tmp1, ACTIVATION_TANH);
+}
+
+
+
+/* Initialize the FWGAN state: zero everything, then load the built-in model
+   weights (fwgan_arrays); init failure is a programming error (assert). */
+void fwgan_init(FWGANState *st)
+{
+ int ret;
+ OPUS_CLEAR(st, 1);
+ ret = init_fwgan(&st->model, fwgan_arrays);
+ celt_assert(ret == 0);
+ /* FIXME: perform arch detection. */
}
void fwgan_synthesize(FWGANState *st, float *pcm, const float *features)
{
+ /* Synthesize FWGAN_FRAME_SIZE samples into pcm from one NB_FEATURES-wide
+    feature vector (cepstrum, then pitch at index NB_BANDS, then one more
+    feature -- presumably pitch correlation, confirm against the encoder). */
+ int subframe;
float lpc[LPC_ORDER];
+ float cond[BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE];
+ double w0;
+ int period;
+ float lpc_weight;
+ float fwgan_features[NB_FEATURES-1];
+ int i;
+ /* Network features: keep the first NB_FEATURES-2 (cepstral) features, drop
+    the pitch feature, and append the last feature offset by +0.5. */
+ OPUS_COPY(fwgan_features, features, NB_FEATURES-2);
+ fwgan_features[NB_FEATURES-2] = features[NB_FEATURES-1]+.5;
+
+ /* Decode the pitch period (in samples) from the normalized pitch feature;
+    w0 is the corresponding frequency in radians/sample. */
+ period = (int)floor(.1 + 50*features[NB_BANDS]+100);
+ w0 = 2*M_PI/period;
lpc_from_cepstrum(lpc, features);
- run_fwgan(st, pcm, features);
- /* Run LPC filter. */
+ /* Bandwidth expansion: lpc[i] *= FWGAN_GAMMA^(i+1). */
+ lpc_weight = 1.f;
+ for (i=0;i<LPC_ORDER;i++) {
+ lpc_weight *= FWGAN_GAMMA;
+ lpc[i] *= lpc_weight;
+ }
+ run_fwgan_upsampler(st, cond, fwgan_features);
+ /* Per subframe: run the network on this subframe's conditioning slice, then
+    gain, pre-emphasis and LPC synthesis; de-emphasis runs over the whole frame. */
+ for (subframe=0;subframe<NB_SUBFRAMES;subframe++) {
+ float *sub_cond;
+ sub_cond = &cond[subframe*BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4];
+ run_fwgan_subframe(st, &pcm[subframe*SUBFRAME_SIZE], sub_cond, w0);
+ apply_gain(&pcm[subframe*SUBFRAME_SIZE], features[0], &st->last_gain);
+ fwgan_preemphasis(&pcm[subframe*SUBFRAME_SIZE], &st->preemph_mem);
+ fwgan_lpc_syn(&pcm[subframe*SUBFRAME_SIZE], st->syn_mem, lpc, st->last_lpc);
+ }
+ fwgan_deemphasis(pcm, &st->deemph_mem);
}
diff --git a/dnn/fwgan.h b/dnn/fwgan.h
index 84749176..7da11b8f 100644
--- a/dnn/fwgan.h
+++ b/dnn/fwgan.h
@@ -28,15 +28,43 @@
#define FWGAN_H
#include "freq.h"
+#include "fwgan_data.h"
#define FWGAN_CONT_SAMPLES 320
+/* FIXME: Derive those from the model rather than hardcoding. */
+#define FWC1_STATE_SIZE 512
+#define FWC2_STATE_SIZE 512
+#define FWC3_STATE_SIZE 256
+#define FWC4_STATE_SIZE 256
+#define FWC5_STATE_SIZE 128
+#define FWC6_STATE_SIZE 128
+#define FWC7_STATE_SIZE 80
+
typedef struct {
+ FWGAN model; /* network weights */
+ int arch; /* SIMD arch id (NOTE(review): not yet auto-detected -- see FIXME in fwgan_init) */
+ double embed_phase; /* running phase for the sin/cos pitch embeddings */
+ float last_gain; /* gain computed from the previous subframe's c0 (applied one subframe late) */
+ float last_lpc[LPC_ORDER]; /* LPC coefficients used for the previous subframe */
float syn_mem[LPC_ORDER]; /* LPC synthesis filter memory (most recent outputs) */
+ float preemph_mem; /* pre-emphasis state: last input sample */
+ float deemph_mem; /* de-emphasis state: last output sample */
+ float cont_conv1_mem[FEAT_IN_CONV1_CONV_STATE_SIZE]; /* feat_in conv1d history */
+ float cont[FEAT_IN_NL1_GATE_OUT_SIZE]; /* NOTE(review): not referenced in visible code -- confirm use */
+ float rnn_state[RNN_GRU_STATE_SIZE]; /* GRU hidden state */
+ float fwc1_state[FWC1_STATE_SIZE]; /* conv1d histories for the seven FWC stages */
+ float fwc2_state[FWC2_STATE_SIZE];
+ float fwc3_state[FWC3_STATE_SIZE];
+ float fwc4_state[FWC4_STATE_SIZE];
+ float fwc5_state[FWC5_STATE_SIZE];
+ float fwc6_state[FWC6_STATE_SIZE];
+ float fwc7_state[FWC7_STATE_SIZE];
} FWGANState;
-void fwgan_init(FWGANState *st, const float *pcm);
+void fwgan_init(FWGANState *st);
+void fwgan_cont(FWGANState *st, const float *pcm0, const float *features0);
void fwgan_synthesize(FWGANState *st, float *pcm, const float *features);
diff --git a/dnn/nnet.c b/dnn/nnet.c
index 05b0ea90..0bb228fe 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -73,6 +73,7 @@ void compute_linear(const LinearLayer *linear, float *out, const float *in)
{
int i, M, N;
const float *bias;
+ celt_assert(in != out);
bias = linear->bias;
M = linear->nb_inputs;
N = linear->nb_outputs;
@@ -146,11 +147,12 @@ void compute_gated_activation(const LinearLayer *layer, float *output, const flo
{
int i;
float act1[MAX_INPUTS];
+ float act2[MAX_INPUTS];
celt_assert(layer->nb_inputs == layer->nb_outputs);
- compute_linear(layer, output, input);
- compute_activation(output, output, layer->nb_outputs, ACTIVATION_SIGMOID);
compute_activation(act1, input, layer->nb_outputs, activation);
- for (i=0;i<layer->nb_outputs;i++) output[i] *= act1[i];
+ /* Compute the sigmoid gate into scratch (act2) instead of output so the
+    function is safe when output aliases input (fwgan.c calls it in-place;
+    the old code clobbered input before reading it in that case). */
+ compute_linear(layer, act2, input);
+ compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID);
+ for (i=0;i<layer->nb_outputs;i++) output[i] = act1[i]*act2[i];
}
void compute_activation(float *output, const float *input, int N, int activation)
diff --git a/dnn/nnet.h b/dnn/nnet.h
index 2916b33f..2b43308a 100644
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -135,6 +135,7 @@ void compute_linear(const LinearLayer *linear, float *out, const float *in);
void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation);
void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in);
void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation);
+void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation);
void compute_activation(float *output, const float *input, int N, int activation);
@@ -163,6 +164,7 @@ extern const WeightArray lpcnet_arrays[];
extern const WeightArray lpcnet_plc_arrays[];
extern const WeightArray rdovaeenc_arrays[];
extern const WeightArray rdovaedec_arrays[];
+extern const WeightArray fwgan_arrays[];
int linear_init(LinearLayer *layer, const WeightArray *arrays,
const char *bias,
diff --git a/lpcnet_headers.mk b/lpcnet_headers.mk
index 93ca74fb..fc3fc84c 100644
--- a/lpcnet_headers.mk
+++ b/lpcnet_headers.mk
@@ -9,6 +9,7 @@ dnn/burg.h \
dnn/common.h \
dnn/freq.h \
dnn/fwgan.h \
+dnn/fwgan_data.h \
dnn/kiss99.h \
dnn/lpcnet_private.h \
dnn/nnet_data.h \
diff --git a/lpcnet_sources.mk b/lpcnet_sources.mk
index 4c6e73f3..61cbb1f1 100644
--- a/lpcnet_sources.mk
+++ b/lpcnet_sources.mk
@@ -2,6 +2,7 @@ LPCNET_SOURCES = \
dnn/burg.c \
dnn/freq.c \
dnn/fwgan.c \
+dnn/fwgan_data.c \
dnn/kiss99.c \
dnn/lpcnet.c \
dnn/lpcnet_enc.c \