Complete code -- totally untested

not even compiled yet
author: Jean-Marc Valin <jmvalin@amazon.com> 2023-10-11 09:13:10 +0300
committer: Jean-Marc Valin <jmvalin@amazon.com> 2023-10-11 09:13:10 +0300
commit: 366b42fed2f686915164decad4b09b5398dd053a (patch)
tree: 4cd515c6840f88f1530da4216fe2ddafcf035514
parent: 3547754b6f3b25480ad47f6b04ca576f2de5db71 (diff)
3 files changed, 96 insertions, 75 deletions
diff --git a/dnn/fargan.c b/dnn/fargan.c
index 34ab8d8f..4a325dac 100644
--- a/dnn/fargan.c
+++ b/dnn/fargan.c
@@ -39,17 +39,20 @@
 
 #define FARGAN_FEATURES (NB_FEATURES)
 
-static void compute_fargan_cond(FARGANState *st, float *cond, const float *features)
+static void compute_fargan_cond(FARGANState *st, float *cond, const float *features, int period)
 {
   FARGAN *model;
+  float dense_in[NB_FEATURES+COND_NET_PEMBED_OUT_SIZE];
   float conv1_in[COND_NET_FCONV1_IN_SIZE];
   float conv2_in[COND_NET_FCONV2_IN_SIZE];
   model = &st->model;
-  celt_assert(FARGAN_FEATURES == model->cond_net_fdense1.nb_inputs);
+  celt_assert(FARGAN_FEATURES+COND_NET_PEMBED_OUT_SIZE == model->cond_net_fdense1.nb_inputs);
   celt_assert(COND_NET_FCONV1_IN_SIZE == model->cond_net_fdense1.nb_outputs);
   celt_assert(COND_NET_FCONV2_IN_SIZE == model->cond_net_fconv1.nb_outputs);
+  OPUS_COPY(&dense_in[NB_FEATURES], &model->cond_net_pembed.float_weights[IMIN(period, 224)*COND_NET_PEMBED_OUT_SIZE], COND_NET_PEMBED_OUT_SIZE);
+  OPUS_COPY(dense_in, features, NB_FEATURES);
 
-  compute_generic_dense(&model->cond_net_fdense1, conv1_in, features, ACTIVATION_TANH);
+  compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH);
   compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH);
   compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH);
 }
@@ -74,55 +77,72 @@ static void fargan_deemphasis(float *pcm, float *deemph_mem) {
 static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond, int period)
 {
   int i, pos;
-  float tmp1[FWC1_FC_0_OUT_SIZE];
-  float tmp2[IMAX(RNN_GRU_STATE_SIZE, FWC2_FC_0_OUT_SIZE)];
-  float fwc0_in[FARGAN_COND_SIZE+2*FARGAN_SUBFRAME_SIZE];
-  float rnn_in[FEAT_IN_CONV1_CONV_OUT_SIZE];
+  float fwc0_in[SIG_NET_INPUT_SIZE];
+  float gru1_in[SIG_NET_FWC0_CONV_OUT_SIZE+2*FARGAN_SUBFRAME_SIZE];
+  float gru2_in[SIG_NET_GRU1_OUT_SIZE+2*FARGAN_SUBFRAME_SIZE];
+  float gru3_in[SIG_NET_GRU2_OUT_SIZE+2*FARGAN_SUBFRAME_SIZE];
   float pembed[FARGAN_FRAME_SIZE/2];
+  float pred[FARGAN_SUBFRAME_SIZE+4];
+  float prev[FARGAN_SUBFRAME_SIZE];
+  float pitch_gate[4];
+  float gain;
+  float gain_1;
+  float skip_cat[10000];
+  float skip_out[SIG_NET_SKIP_DENSE_OUT_SIZE];
+
   FARGAN *model;
   model = &st->model;
 
-  /* Interleave bfcc_cond and pembed for each subframe in feat_in. */
-  OPUS_COPY(&fwc0_in[0], &cond[0], FARGAN_COND_SIZE);
-  pos = PITCH_MAX_PERIOD-period;
-  for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) {
-    fwc0_in[FARGAN_COND_SIZE+i] = st->pitch_buf[pos++];
+  compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR);
+  gain = exp(gain);
+  gain_1 = 1.f/(1e-5 + gain);
+
+  pos = PITCH_MAX_PERIOD-period-2;
+  for (i=0;i<FARGAN_SUBFRAME_SIZE+4;i++) {
+    pred[i] = MIN32(1.f, MAX32(-1.f, gain_1*st->pitch_buf[IMAX(0, pos)]));
+    pos++;
     if (pos == PITCH_MAX_PERIOD) pos -= period;
   }
-  OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE], st->pitch_buf[PITCH_MAX_PERIOD-FARGAN_SUBFRAME_SIZE], FARGAN_SUBFRAME_SIZE);
+  for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) prev[i] = gain_1*st->pitch_buf[PITCH_MAX_PERIOD-FARGAN_SUBFRAME_SIZE+i];
 
-  compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, FARGAN_COND_SIZE+2*FARGAN_SUBFRAME_SIZE, ACTIVATION_TANH);
-  celt_assert(FEAT_IN_NL1_GATE_OUT_SIZE == model->feat_in_nl1_gate.nb_outputs);
-  compute_glu(&model->sig_net_fwc0_glu_gate, rnn_in, rnn_in);
+  OPUS_COPY(&fwc0_in[0], &cond[0], FARGAN_COND_SIZE);
+  OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE], pred, FARGAN_SUBFRAME_SIZE+4);
+  OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE+FARGAN_SUBFRAME_SIZE+4], prev, FARGAN_SUBFRAME_SIZE);
 
-  compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, rnn_in);
-  celt_assert(IMAX(RNN_GRU_STATE_SIZE, FWC2_FC_0_OUT_SIZE) >= model->rnn_nl_gate.nb_outputs);
-  compute_glu(&model->rnn_nl_gate, tmp2, st->gru1_state);
+  compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH);
+  celt_assert(SIG_NET_FWC0_GLU_GATE_OUT_SIZE == model->sig_net_fwc0_glu_gate.nb_outputs);
+  compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in);
 
-  compute_generic_conv1d(&model->fwc1_fc_0, tmp1, st->fwc1_state, tmp2, RNN_GRU_STATE_SIZE, ACTIVATION_LINEAR);
-  compute_gated_activation(&model->fwc1_fc_1_gate, tmp1, tmp1, ACTIVATION_TANH);
+  compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID);
 
-  compute_generic_conv1d(&model->fwc2_fc_0, tmp2, st->fwc2_state, tmp1, FWC1_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
-  compute_gated_activation(&model->fwc2_fc_1_gate, tmp2, tmp2, ACTIVATION_TANH);
+  for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+i] = pitch_gate[0]*pred[i+2];
+  OPUS_COPY(&gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
+  compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in);
+  compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state);
 
-  compute_generic_conv1d(&model->fwc3_fc_0, tmp1, st->fwc3_state, tmp2, FWC2_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
-  compute_gated_activation(&model->fwc3_fc_1_gate, tmp1, tmp1, ACTIVATION_TANH);
+  for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru2_in[SIG_NET_GRU1_OUT_SIZE+i] = pitch_gate[1]*pred[i+2];
+  OPUS_COPY(&gru2_in[SIG_NET_GRU1_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
+  compute_generic_gru(&model->sig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in);
+  compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state);
 
-  compute_generic_conv1d(&model->fwc4_fc_0, tmp2, st->fwc4_state, tmp1, FWC3_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
-  compute_gated_activation(&model->fwc4_fc_1_gate, tmp2, tmp2, ACTIVATION_TANH);
+  for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru3_in[SIG_NET_GRU1_OUT_SIZE+i] = pitch_gate[2]*pred[i+2];
+  OPUS_COPY(&gru3_in[SIG_NET_GRU2_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
+  compute_generic_gru(&model->sig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in);
+  compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state);
 
-  compute_generic_conv1d(&model->fwc5_fc_0, tmp1, st->fwc5_state, tmp2, FWC4_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
-  compute_gated_activation(&model->fwc5_fc_1_gate, tmp1, tmp1, ACTIVATION_TANH);
+  OPUS_COPY(skip_cat, gru2_in, SIG_NET_GRU1_OUT_SIZE);
+  OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE], gru3_in, SIG_NET_GRU2_OUT_SIZE);
+  for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+i] = pitch_gate[3]*pred[i+2];
+  OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
 
-  compute_generic_conv1d(&model->fwc6_fc_0, tmp2, st->fwc6_state, tmp1, FWC5_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
-  compute_gated_activation(&model->fwc6_fc_1_gate, tmp2, tmp2, ACTIVATION_TANH);
+  compute_generic_dense(&model->sig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH);
+  compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out);
 
-  compute_generic_conv1d(&model->fwc7_fc_0, tmp1, st->fwc7_state, tmp2, FWC6_FC_0_OUT_SIZE, ACTIVATION_LINEAR);
-  compute_gated_activation(&model->fwc7_fc_1_gate, pcm, tmp1, ACTIVATION_TANH);
+  compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH);
+  for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) pcm[i] *= gain;
 
-  apply_gain(pcm, c0, &st->last_gain);
-  fargan_preemphasis(pcm, &st->preemph_mem);
-  fargan_lpc_syn(pcm, st->syn_mem, lpc, st->last_lpc);
+  OPUS_MOVE(st->pitch_buf, &st->pitch_buf[FARGAN_SUBFRAME_SIZE], FARGAN_SUBFRAME_SIZE);
+  OPUS_COPY(&st->pitch_buf[PITCH_MAX_PERIOD-FARGAN_SUBFRAME_SIZE], pcm, FARGAN_SUBFRAME_SIZE);
   fargan_deemphasis(pcm, &st->deemph_mem);
 }
 
@@ -130,47 +150,34 @@ void fargan_cont(FARGANState *st, const float *pcm0, const float *features0)
 {
   int i;
   float norm2, norm_1;
-  float wpcm0[CONT_PCM_INPUTS];
-  float cont_inputs[CONT_PCM_INPUTS+1];
-  float tmp1[MAX_CONT_SIZE];
-  float tmp2[MAX_CONT_SIZE];
-  float lpc[LPC_ORDER];
   float new_pcm[FARGAN_FRAME_SIZE];
   FARGAN *model;
-  st->embed_phase[0] = 1;
+  float cond[COND_NET_FCONV2_OUT_SIZE];
+  float x0[FARGAN_CONT_SAMPLES];
+  float dummy[FARGAN_SUBFRAME_SIZE];
+  int period;
   model = &st->model;
-  compute_wlpc(lpc, features0);
-  /* Deemphasis memory is just the last continuation sample. */
-  st->deemph_mem = pcm0[CONT_PCM_INPUTS-1];
-
-  /* Apply analysis filter, considering that the preemphasis and deemphasis filter
-     cancel each other in this case since the LPC filter is constant across that boundary.
-     */
-  for (i=LPC_ORDER;i<CONT_PCM_INPUTS;i++) {
-    int j;
-    wpcm0[i] = pcm0[i];
-    for (j=0;j<LPC_ORDER;j++) wpcm0[i] += lpc[j]*pcm0[i-j-1];
-  }
-  /* FIXME: Make this less stupid. */
-  for (i=0;i<LPC_ORDER;i++) wpcm0[i] = wpcm0[LPC_ORDER];
 
-  /* The memory of the pre-empahsis is the last sample of the weighted signal
-     (ignoring preemphasis+deemphasis combination). */
-  st->preemph_mem = wpcm0[CONT_PCM_INPUTS-1];
-  /* The memory of the synthesis filter is the pre-emphasized continuation. */
-  for (i=0;i<LPC_ORDER;i++) st->syn_mem[i] = pcm0[CONT_PCM_INPUTS-1-i] - FARGAN_DEEMPHASIS*pcm0[CONT_PCM_INPUTS-2-i];
+  /* Pre-load features. */
+  for (i=0;i<5;i++) {
+    float *features = &features0[i*NB_FEATURES];
+    st->last_period = period;
+    period = (int)floor(.5+256./pow(2.f,((1./60.)*((features[NB_BANDS]+1.5)*60))));
+    compute_fargan_cond(st, cond, features, period);
+  }
 
-  norm2 = celt_inner_prod(wpcm0, wpcm0, CONT_PCM_INPUTS, st->arch);
-  norm_1 = 1.f/sqrt(1e-8f + norm2);
-  for (i=0;i<CONT_PCM_INPUTS;i++) cont_inputs[i+1] = norm_1*wpcm0[i];
-  cont_inputs[0] = log(sqrt(norm2) + 1e-7f);
+  x0[0] = 0;
+  for (i=1;i<FARGAN_CONT_SAMPLES;i++) {
+    x0[i] = pcm0[i] - FARGAN_DEEMPHASIS*pcm0[i-1];
+  }
 
+  OPUS_COPY(st->pitch_buf[PITCH_MAX_PERIOD-FARGAN_FRAME_SIZE], x0, FARGAN_FRAME_SIZE);
 
+  for (i=0;i<FARGAN_NB_SUBFRAMES;i++) {
+    run_fargan_subframe(st, dummy, &cond[i*FARGAN_COND_SIZE], st->last_period);
+    OPUS_COPY(&st->pitch_buf[PITCH_MAX_PERIOD-FARGAN_SUBFRAME_SIZE], x0[FARGAN_FRAME_SIZE+i*FARGAN_SUBFRAME_SIZE], FARGAN_SUBFRAME_SIZE);
+  }
   st->cont_initialized = 1;
-  /* Process the first frame, discard the first subframe, and keep the rest for the first
-     synthesis call. */
-  fargan_synthesize_impl(st, new_pcm, lpc, features0);
-  OPUS_COPY(st->pcm_buf, &new_pcm[SUBFRAME_SIZE], FARGAN_FRAME_SIZE-SUBFRAME_SIZE);
 }
 
 
@@ -201,12 +208,13 @@ static void fargan_synthesize_impl(FARGANState *st, float *pcm, const float *fea
   celt_assert(st->cont_initialized);
 
   period = (int)floor(.5+256./pow(2.f,((1./60.)*((features[NB_BANDS]+1.5)*60))));
-  compute_fargan_cond(st, cond, features);
+  compute_fargan_cond(st, cond, features, period);
   for (subframe=0;subframe<FARGAN_NB_SUBFRAMES;subframe++) {
     float *sub_cond;
     sub_cond = &cond[subframe*FARGAN_COND_SIZE];
-    run_fargan_subframe(st, &pcm[subframe*FARGAN_SUBFRAME_SIZE], sub_cond, period);
+    run_fargan_subframe(st, &pcm[subframe*FARGAN_SUBFRAME_SIZE], sub_cond, st->last_period);
   }
+  st->last_period = period;
 }
 
 void fargan_synthesize(FARGANState *st, float *pcm, const float *features)
diff --git a/dnn/fargan.h b/dnn/fargan.h
index 71e70069..e3fc3b8b 100644
--- a/dnn/fargan.h
+++ b/dnn/fargan.h
@@ -38,6 +38,8 @@
 #define FARGAN_COND_SIZE (COND_NET_FCONV2_OUT_SIZE/FARGAN_SUBFRAME_SIZE)
 #define FARGAN_DEEMPHASIS 0.85f
 
+#define SIG_NET_INPUT_SIZE (FARGAN_COND_SIZE+2*FARGAN_SUBFRAME_SIZE+4)
+#define SIG_NET_FWC0_STATE_SIZE (2*SIG_NET_INPUT_SIZE)
 
 typedef struct {
   FARGAN model;
@@ -48,9 +50,11 @@ typedef struct {
   float pitch_buf[PITCH_MAX_PERIOD];
   float cond_conv1_state[COND_NET_FCONV1_STATE_SIZE];
   float cond_conv2_state[COND_NET_FCONV2_STATE_SIZE];
+  float fwc0_mem[SIG_NET_FWC0_STATE_SIZE];
   float gru1_state[SIG_NET_GRU1_STATE_SIZE];
   float gru2_state[SIG_NET_GRU2_STATE_SIZE];
   float gru3_state[SIG_NET_GRU3_STATE_SIZE];
+  int last_period;
 } FARGANState;
 
 void fargan_init(FARGANState *st);
diff --git a/dnn/torch/weight-exchange/wexchange/torch/torch.py b/dnn/torch/weight-exchange/wexchange/torch/torch.py
index e9edb1fd..a1e68fa4 100644
--- a/dnn/torch/weight-exchange/wexchange/torch/torch.py
+++ b/dnn/torch/weight-exchange/wexchange/torch/torch.py
@@ -189,11 +189,20 @@ def load_torch_conv2d_weights(where, conv):
                 conv.bias.set_(torch.from_numpy(b))
 
 
-def dump_torch_embedding_weights(where, emb):
-    os.makedirs(where, exist_ok=True)
+def dump_torch_embedding_weights(where, embed, name='embed', scale=1/128, sparse=False, diagonal=False, quantize=False):
 
-    w = emb.weight.detach().cpu().numpy().copy()
-    np.save(os.path.join(where, 'weight.npy'), w)
+    print("quantize = ", quantize)
+    w = embed.weight.detach().cpu().numpy().copy().transpose()
+    b = np.zeros(1, dtype=w.dtype)
+
+    if isinstance(where, CWriter):
+        return print_dense_layer(where, name, w, b, scale=scale, format='torch', sparse=sparse, diagonal=diagonal, quantize=quantize)
+
+    else:
+        os.makedirs(where, exist_ok=True)
+
+        np.save(os.path.join(where, 'weight.npy'), w)
+        np.save(os.path.join(where, 'bias.npy'), b)
 
 
 def load_torch_embedding_weights(where, emb):
author	Jean-Marc Valin <jmvalin@amazon.com>	2023-10-11 09:13:10 +0300
committer	Jean-Marc Valin <jmvalin@amazon.com>	2023-10-11 09:13:10 +0300
commit	366b42fed2f686915164decad4b09b5398dd053a (patch)
tree	4cd515c6840f88f1530da4216fe2ddafcf035514
parent	3547754b6f3b25480ad47f6b04ca576f2de5db71 (diff)