Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.xiph.org/xiph/opus.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJean-Marc Valin <jmvalin@amazon.com>2023-11-14 02:26:31 +0300
committerJean-Marc Valin <jmvalin@amazon.com>2023-11-16 07:45:32 +0300
commit2e034f6f312d752440b9e26afa82b0752c34d97b (patch)
treeb3d5e0785b3a538517f234c94ba1c0b4fdcecc47
parentb0620c0bf9864d9b18ead6b4bb6e0800542a931d (diff)
Adding RTCD for DNN code
Starting with compute_linear()
-rw-r--r--Makefile.am19
-rw-r--r--celt/x86/x86cpu.h2
-rw-r--r--dnn/dred_rdovae_dec.c51
-rw-r--r--dnn/dred_rdovae_dec.h6
-rw-r--r--dnn/dred_rdovae_enc.c31
-rw-r--r--dnn/dred_rdovae_enc.h2
-rw-r--r--dnn/dump_data.c5
-rw-r--r--dnn/fargan.c35
-rw-r--r--dnn/lpcnet.h4
-rw-r--r--dnn/lpcnet_demo.c5
-rw-r--r--dnn/lpcnet_enc.c20
-rw-r--r--dnn/lpcnet_plc.c12
-rw-r--r--dnn/lpcnet_private.h4
-rw-r--r--dnn/nnet.c64
-rw-r--r--dnn/nnet.h37
-rw-r--r--dnn/nnet_arch.h76
-rw-r--r--dnn/nnet_default.c35
-rw-r--r--dnn/pitchdnn.c14
-rw-r--r--dnn/pitchdnn.h3
-rw-r--r--dnn/vec_avx.h5
-rw-r--r--dnn/x86/dnn_x86.h78
-rw-r--r--dnn/x86/nnet_avx2.c38
-rw-r--r--dnn/x86/nnet_sse2.c38
-rw-r--r--dnn/x86/nnet_sse4_1.c38
-rw-r--r--dnn/x86/x86_dnn_map.c54
-rw-r--r--lpcnet_headers.mk4
-rw-r--r--lpcnet_sources.mk6
-rw-r--r--silk/dred_encoder.c12
-rw-r--r--silk/dred_encoder.h2
-rw-r--r--src/opus_decoder.c2
-rw-r--r--src/opus_encoder.c2
31 files changed, 539 insertions(+), 165 deletions(-)
diff --git a/Makefile.am b/Makefile.am
index 452f6d22..046d3069 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -50,18 +50,30 @@ if CPU_X86
if HAVE_RTCD
CELT_SOURCES += $(CELT_SOURCES_X86_RTCD)
SILK_SOURCES += $(SILK_SOURCES_X86_RTCD)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_X86_RTCD)
+endif
endif
if HAVE_SSE
CELT_SOURCES += $(CELT_SOURCES_SSE)
endif
if HAVE_SSE2
CELT_SOURCES += $(CELT_SOURCES_SSE2)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_SSE2)
+endif
endif
if HAVE_SSE4_1
CELT_SOURCES += $(CELT_SOURCES_SSE4_1)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_SSE4_1)
+endif
endif
if HAVE_AVX2
CELT_SOURCES += $(CELT_SOURCES_AVX2)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_AVX2)
+endif
endif
endif
@@ -398,19 +410,22 @@ $(SSE_OBJ): CFLAGS += $(OPUS_X86_SSE_CFLAGS)
endif
if HAVE_SSE2
-SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo)
+SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo) \
+ $(DNN_SOURCES_SSE2:.c=.lo)
$(SSE2_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS)
endif
if HAVE_SSE4_1
SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \
+ $(DNN_SOURCES_SSE4_1:.c=.lo) \
$(SILK_SOURCES_SSE4_1:.c=.lo) \
$(SILK_SOURCES_FIXED_SSE4_1:.c=.lo)
$(SSE4_1_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS)
endif
if HAVE_AVX2
-AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo)
+AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo) \
+ $(DNN_SOURCES_AVX2:.c=.lo)
$(AVX2_OBJ): CFLAGS += $(OPUS_X86_AVX2_CFLAGS)
endif
diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h
index fe46d1d9..6ce10e60 100644
--- a/celt/x86/x86cpu.h
+++ b/celt/x86/x86cpu.h
@@ -47,7 +47,7 @@
# endif
# if defined(OPUS_X86_MAY_HAVE_AVX2)
-# define MAY_HAVE_AVX2(name) name ## _avx
+# define MAY_HAVE_AVX2(name) name ## _avx2
# else
# define MAY_HAVE_AVX2(name) name ## _c
# endif
diff --git a/dnn/dred_rdovae_dec.c b/dnn/dred_rdovae_dec.c
index e2b19b14..7797ee77 100644
--- a/dnn/dred_rdovae_dec.c
+++ b/dnn/dred_rdovae_dec.c
@@ -42,33 +42,35 @@ static void conv1_cond_init(float *mem, int len, int dilation, int *init)
*init = 1;
}
-void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents)
+void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents, int arch)
{
int i;
RDOVAEDecState dec;
memset(&dec, 0, sizeof(dec));
- dred_rdovae_dec_init_states(&dec, model, state);
+ dred_rdovae_dec_init_states(&dec, model, state, arch);
for (i = 0; i < 2*nb_latents; i += 2)
{
dred_rdovae_decode_qframe(
&dec,
model,
&features[2*i*DRED_NUM_FEATURES],
- &latents[(i/2)*DRED_LATENT_DIM]);
+ &latents[(i/2)*DRED_LATENT_DIM],
+ arch);
}
}
void dred_rdovae_dec_init_states(
RDOVAEDecState *h, /* io: state buffer handle */
const RDOVAEDec *model,
- const float *initial_state /* i: initial state */
+ const float *initial_state, /* i: initial state */
+ int arch
)
{
float hidden[DEC_HIDDEN_INIT_OUT_SIZE];
float state_init[DEC_GRU1_STATE_SIZE+DEC_GRU2_STATE_SIZE+DEC_GRU3_STATE_SIZE+DEC_GRU4_STATE_SIZE+DEC_GRU5_STATE_SIZE];
int counter=0;
- compute_generic_dense(&model->dec_hidden_init, hidden, initial_state, ACTIVATION_TANH);
- compute_generic_dense(&model->dec_gru_init, state_init, hidden, ACTIVATION_TANH);
+ compute_generic_dense(&model->dec_hidden_init, hidden, initial_state, ACTIVATION_TANH, arch);
+ compute_generic_dense(&model->dec_gru_init, state_init, hidden, ACTIVATION_TANH, arch);
OPUS_COPY(h->gru1_state, state_init, DEC_GRU1_STATE_SIZE);
counter += DEC_GRU1_STATE_SIZE;
OPUS_COPY(h->gru2_state, &state_init[counter], DEC_GRU2_STATE_SIZE);
@@ -86,7 +88,8 @@ void dred_rdovae_decode_qframe(
RDOVAEDecState *dec_state, /* io: state buffer handle */
const RDOVAEDec *model,
float *qframe, /* o: quadruple feature frame (four concatenated frames in reverse order) */
- const float *input /* i: latent vector */
+ const float *input, /* i: latent vector */
+ int arch
)
{
float buffer[DEC_DENSE1_OUT_SIZE + DEC_GRU1_OUT_SIZE + DEC_GRU2_OUT_SIZE + DEC_GRU3_OUT_SIZE + DEC_GRU4_OUT_SIZE + DEC_GRU5_OUT_SIZE
@@ -94,43 +97,43 @@ void dred_rdovae_decode_qframe(
int output_index = 0;
/* run encoder stack and concatenate output in buffer*/
- compute_generic_dense(&model->dec_dense1, &buffer[output_index], input, ACTIVATION_TANH);
+ compute_generic_dense(&model->dec_dense1, &buffer[output_index], input, ACTIVATION_TANH, arch);
output_index += DEC_DENSE1_OUT_SIZE;
- compute_generic_gru(&model->dec_gru1_input, &model->dec_gru1_recurrent, dec_state->gru1_state, buffer);
- compute_glu(&model->dec_glu1, &buffer[output_index], dec_state->gru1_state);
+ compute_generic_gru(&model->dec_gru1_input, &model->dec_gru1_recurrent, dec_state->gru1_state, buffer, arch);
+ compute_glu(&model->dec_glu1, &buffer[output_index], dec_state->gru1_state, arch);
output_index += DEC_GRU1_OUT_SIZE;
conv1_cond_init(dec_state->conv1_state, output_index, 1, &dec_state->initialized);
- compute_generic_conv1d(&model->dec_conv1, &buffer[output_index], dec_state->conv1_state, buffer, output_index, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->dec_conv1, &buffer[output_index], dec_state->conv1_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV1_OUT_SIZE;
- compute_generic_gru(&model->dec_gru2_input, &model->dec_gru2_recurrent, dec_state->gru2_state, buffer);
- compute_glu(&model->dec_glu2, &buffer[output_index], dec_state->gru2_state);
+ compute_generic_gru(&model->dec_gru2_input, &model->dec_gru2_recurrent, dec_state->gru2_state, buffer, arch);
+ compute_glu(&model->dec_glu2, &buffer[output_index], dec_state->gru2_state, arch);
output_index += DEC_GRU2_OUT_SIZE;
conv1_cond_init(dec_state->conv2_state, output_index, 1, &dec_state->initialized);
- compute_generic_conv1d(&model->dec_conv2, &buffer[output_index], dec_state->conv2_state, buffer, output_index, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->dec_conv2, &buffer[output_index], dec_state->conv2_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV2_OUT_SIZE;
- compute_generic_gru(&model->dec_gru3_input, &model->dec_gru3_recurrent, dec_state->gru3_state, buffer);
- compute_glu(&model->dec_glu3, &buffer[output_index], dec_state->gru3_state);
+ compute_generic_gru(&model->dec_gru3_input, &model->dec_gru3_recurrent, dec_state->gru3_state, buffer, arch);
+ compute_glu(&model->dec_glu3, &buffer[output_index], dec_state->gru3_state, arch);
output_index += DEC_GRU3_OUT_SIZE;
conv1_cond_init(dec_state->conv3_state, output_index, 1, &dec_state->initialized);
- compute_generic_conv1d(&model->dec_conv3, &buffer[output_index], dec_state->conv3_state, buffer, output_index, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->dec_conv3, &buffer[output_index], dec_state->conv3_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV3_OUT_SIZE;
- compute_generic_gru(&model->dec_gru4_input, &model->dec_gru4_recurrent, dec_state->gru4_state, buffer);
- compute_glu(&model->dec_glu4, &buffer[output_index], dec_state->gru4_state);
+ compute_generic_gru(&model->dec_gru4_input, &model->dec_gru4_recurrent, dec_state->gru4_state, buffer, arch);
+ compute_glu(&model->dec_glu4, &buffer[output_index], dec_state->gru4_state, arch);
output_index += DEC_GRU4_OUT_SIZE;
conv1_cond_init(dec_state->conv4_state, output_index, 1, &dec_state->initialized);
- compute_generic_conv1d(&model->dec_conv4, &buffer[output_index], dec_state->conv4_state, buffer, output_index, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->dec_conv4, &buffer[output_index], dec_state->conv4_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV4_OUT_SIZE;
- compute_generic_gru(&model->dec_gru5_input, &model->dec_gru5_recurrent, dec_state->gru5_state, buffer);
- compute_glu(&model->dec_glu5, &buffer[output_index], dec_state->gru5_state);
+ compute_generic_gru(&model->dec_gru5_input, &model->dec_gru5_recurrent, dec_state->gru5_state, buffer, arch);
+ compute_glu(&model->dec_glu5, &buffer[output_index], dec_state->gru5_state, arch);
output_index += DEC_GRU5_OUT_SIZE;
conv1_cond_init(dec_state->conv5_state, output_index, 1, &dec_state->initialized);
- compute_generic_conv1d(&model->dec_conv5, &buffer[output_index], dec_state->conv5_state, buffer, output_index, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->dec_conv5, &buffer[output_index], dec_state->conv5_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV5_OUT_SIZE;
- compute_generic_dense(&model->dec_output, qframe, buffer, ACTIVATION_LINEAR);
+ compute_generic_dense(&model->dec_output, qframe, buffer, ACTIVATION_LINEAR, arch);
}
diff --git a/dnn/dred_rdovae_dec.h b/dnn/dred_rdovae_dec.h
index 636f0ee0..4e66911c 100644
--- a/dnn/dred_rdovae_dec.h
+++ b/dnn/dred_rdovae_dec.h
@@ -46,8 +46,8 @@ struct RDOVAEDecStruct {
float conv5_state[DEC_CONV5_STATE_SIZE];
};
-void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state);
-void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z);
-void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents);
+void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state, int arch);
+void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z, int arch);
+void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents, int arch);
#endif
diff --git a/dnn/dred_rdovae_enc.c b/dnn/dred_rdovae_enc.c
index e159e632..4f13ae21 100644
--- a/dnn/dred_rdovae_enc.c
+++ b/dnn/dred_rdovae_enc.c
@@ -50,7 +50,8 @@ void dred_rdovae_encode_dframe(
const RDOVAEEnc *model,
float *latents, /* o: latent vector */
float *initial_state, /* o: initial state */
- const float *input /* i: double feature frame (concatenated) */
+ const float *input, /* i: double feature frame (concatenated) */
+ int arch
)
{
float padded_latents[DRED_PADDED_LATENT_DIM];
@@ -61,49 +62,49 @@ void dred_rdovae_encode_dframe(
int output_index = 0;
/* run encoder stack and concatenate output in buffer*/
- compute_generic_dense(&model->enc_dense1, &buffer[output_index], input, ACTIVATION_TANH);
+ compute_generic_dense(&model->enc_dense1, &buffer[output_index], input, ACTIVATION_TANH, arch);
output_index += ENC_DENSE1_OUT_SIZE;
- compute_generic_gru(&model->enc_gru1_input, &model->enc_gru1_recurrent, enc_state->gru1_state, buffer);
+ compute_generic_gru(&model->enc_gru1_input, &model->enc_gru1_recurrent, enc_state->gru1_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru1_state, ENC_GRU1_OUT_SIZE);
output_index += ENC_GRU1_OUT_SIZE;
conv1_cond_init(enc_state->conv1_state, output_index, 1, &enc_state->initialized);
- compute_generic_conv1d(&model->enc_conv1, &buffer[output_index], enc_state->conv1_state, buffer, output_index, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->enc_conv1, &buffer[output_index], enc_state->conv1_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += ENC_CONV1_OUT_SIZE;
- compute_generic_gru(&model->enc_gru2_input, &model->enc_gru2_recurrent, enc_state->gru2_state, buffer);
+ compute_generic_gru(&model->enc_gru2_input, &model->enc_gru2_recurrent, enc_state->gru2_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru2_state, ENC_GRU2_OUT_SIZE);
output_index += ENC_GRU2_OUT_SIZE;
conv1_cond_init(enc_state->conv2_state, output_index, 2, &enc_state->initialized);
- compute_generic_conv1d_dilation(&model->enc_conv2, &buffer[output_index], enc_state->conv2_state, buffer, output_index, 2, ACTIVATION_TANH);
+ compute_generic_conv1d_dilation(&model->enc_conv2, &buffer[output_index], enc_state->conv2_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV2_OUT_SIZE;
- compute_generic_gru(&model->enc_gru3_input, &model->enc_gru3_recurrent, enc_state->gru3_state, buffer);
+ compute_generic_gru(&model->enc_gru3_input, &model->enc_gru3_recurrent, enc_state->gru3_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru3_state, ENC_GRU3_OUT_SIZE);
output_index += ENC_GRU3_OUT_SIZE;
conv1_cond_init(enc_state->conv3_state, output_index, 2, &enc_state->initialized);
- compute_generic_conv1d_dilation(&model->enc_conv3, &buffer[output_index], enc_state->conv3_state, buffer, output_index, 2, ACTIVATION_TANH);
+ compute_generic_conv1d_dilation(&model->enc_conv3, &buffer[output_index], enc_state->conv3_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV3_OUT_SIZE;
- compute_generic_gru(&model->enc_gru4_input, &model->enc_gru4_recurrent, enc_state->gru4_state, buffer);
+ compute_generic_gru(&model->enc_gru4_input, &model->enc_gru4_recurrent, enc_state->gru4_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru4_state, ENC_GRU4_OUT_SIZE);
output_index += ENC_GRU4_OUT_SIZE;
conv1_cond_init(enc_state->conv4_state, output_index, 2, &enc_state->initialized);
- compute_generic_conv1d_dilation(&model->enc_conv4, &buffer[output_index], enc_state->conv4_state, buffer, output_index, 2, ACTIVATION_TANH);
+ compute_generic_conv1d_dilation(&model->enc_conv4, &buffer[output_index], enc_state->conv4_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV4_OUT_SIZE;
- compute_generic_gru(&model->enc_gru5_input, &model->enc_gru5_recurrent, enc_state->gru5_state, buffer);
+ compute_generic_gru(&model->enc_gru5_input, &model->enc_gru5_recurrent, enc_state->gru5_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru5_state, ENC_GRU5_OUT_SIZE);
output_index += ENC_GRU5_OUT_SIZE;
conv1_cond_init(enc_state->conv5_state, output_index, 2, &enc_state->initialized);
- compute_generic_conv1d_dilation(&model->enc_conv5, &buffer[output_index], enc_state->conv5_state, buffer, output_index, 2, ACTIVATION_TANH);
+ compute_generic_conv1d_dilation(&model->enc_conv5, &buffer[output_index], enc_state->conv5_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV5_OUT_SIZE;
- compute_generic_dense(&model->enc_zdense, padded_latents, buffer, ACTIVATION_LINEAR);
+ compute_generic_dense(&model->enc_zdense, padded_latents, buffer, ACTIVATION_LINEAR, arch);
OPUS_COPY(latents, padded_latents, DRED_LATENT_DIM);
/* next, calculate initial state */
- compute_generic_dense(&model->gdense1, state_hidden, buffer, ACTIVATION_TANH);
- compute_generic_dense(&model->gdense2, padded_state, state_hidden, ACTIVATION_LINEAR);
+ compute_generic_dense(&model->gdense1, state_hidden, buffer, ACTIVATION_TANH, arch);
+ compute_generic_dense(&model->gdense2, padded_state, state_hidden, ACTIVATION_LINEAR, arch);
OPUS_COPY(initial_state, padded_state, DRED_STATE_DIM);
}
diff --git a/dnn/dred_rdovae_enc.h b/dnn/dred_rdovae_enc.h
index 43a4e8b2..6fe537ee 100644
--- a/dnn/dred_rdovae_enc.h
+++ b/dnn/dred_rdovae_enc.h
@@ -46,7 +46,7 @@ struct RDOVAEEncStruct {
float conv5_state[2*ENC_CONV5_STATE_SIZE];
};
-void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input);
+void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input, int arch);
#endif
diff --git a/dnn/dump_data.c b/dnn/dump_data.c
index be1ff16e..e7acfb11 100644
--- a/dnn/dump_data.c
+++ b/dnn/dump_data.c
@@ -42,6 +42,7 @@
#include "lpcnet.h"
#include "lpcnet_private.h"
#include "os_support.h"
+#include "cpu_support.h"
static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
@@ -135,7 +136,9 @@ int main(int argc, char **argv) {
FILE *fnoise = NULL;
float noise_gain = 0;
long noise_size=0;
+ int arch;
srand(getpid());
+ arch = opus_select_arch();
st = lpcnet_encoder_create();
argv0=argv[0];
if (argc == 5 && strcmp(argv[1], "-btrain")==0) {
@@ -244,7 +247,7 @@ int main(int argc, char **argv) {
for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
/* PCM is delayed by 1/2 frame to make the features centered on the frames. */
for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);
- compute_frame_features(st, x);
+ compute_frame_features(st, x, arch);
if (fpcm) {
compute_noise(noisebuf, noise_std);
diff --git a/dnn/fargan.c b/dnn/fargan.c
index e0fa304c..5e01ebd0 100644
--- a/dnn/fargan.c
+++ b/dnn/fargan.c
@@ -36,6 +36,7 @@
#include "pitch.h"
#include "nnet.h"
#include "lpcnet_private.h"
+#include "cpu_support.h"
#define FARGAN_FEATURES (NB_FEATURES)
@@ -52,9 +53,9 @@ static void compute_fargan_cond(FARGANState *st, float *cond, const float *featu
OPUS_COPY(&dense_in[NB_FEATURES], &model->cond_net_pembed.float_weights[IMAX(0,IMIN(period-32, 224))*COND_NET_PEMBED_OUT_SIZE], COND_NET_PEMBED_OUT_SIZE);
OPUS_COPY(dense_in, features, NB_FEATURES);
- compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH);
- compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH);
- compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH);
+ compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH, st->arch);
+ compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH, st->arch);
+ compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH, st->arch);
}
static void fargan_deemphasis(float *pcm, float *deemph_mem) {
@@ -84,7 +85,7 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
celt_assert(st->cont_initialized);
model = &st->model;
- compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR);
+ compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR, st->arch);
gain = exp(gain);
gain_1 = 1.f/(1e-5f + gain);
@@ -100,26 +101,26 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE], pred, FARGAN_SUBFRAME_SIZE+4);
OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE+FARGAN_SUBFRAME_SIZE+4], prev, FARGAN_SUBFRAME_SIZE);
- compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH, st->arch);
celt_assert(SIG_NET_FWC0_GLU_GATE_OUT_SIZE == model->sig_net_fwc0_glu_gate.nb_outputs);
- compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in);
+ compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in, st->arch);
- compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID);
+ compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+i] = pitch_gate[0]*pred[i+2];
OPUS_COPY(&gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
- compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in);
- compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state);
+ compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in, st->arch);
+ compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru2_in[SIG_NET_GRU1_OUT_SIZE+i] = pitch_gate[1]*pred[i+2];
OPUS_COPY(&gru2_in[SIG_NET_GRU1_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
- compute_generic_gru(&model->sig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in);
- compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state);
+ compute_generic_gru(&model->sig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in, st->arch);
+ compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru3_in[SIG_NET_GRU2_OUT_SIZE+i] = pitch_gate[2]*pred[i+2];
OPUS_COPY(&gru3_in[SIG_NET_GRU2_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
- compute_generic_gru(&model->sig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in);
- compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state);
+ compute_generic_gru(&model->sig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in, st->arch);
+ compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state, st->arch);
OPUS_COPY(skip_cat, gru2_in, SIG_NET_GRU1_OUT_SIZE);
OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE], gru3_in, SIG_NET_GRU2_OUT_SIZE);
@@ -127,10 +128,10 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+i] = pitch_gate[3]*pred[i+2];
OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
- compute_generic_dense(&model->sig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH);
- compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out);
+ compute_generic_dense(&model->sig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH, st->arch);
+ compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out, st->arch);
- compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH);
+ compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) pcm[i] *= gain;
OPUS_MOVE(st->pitch_buf, &st->pitch_buf[FARGAN_SUBFRAME_SIZE], PITCH_MAX_PERIOD-FARGAN_SUBFRAME_SIZE);
@@ -174,13 +175,13 @@ void fargan_init(FARGANState *st)
{
int ret;
OPUS_CLEAR(st, 1);
+ st->arch = opus_select_arch();
#ifndef USE_WEIGHTS_FILE
ret = init_fargan(&st->model, fargan_arrays);
#else
ret = 0;
#endif
celt_assert(ret == 0);
- /* FIXME: perform arch detection. */
}
int fargan_load_model(FARGANState *st, const unsigned char *data, int len) {
diff --git a/dnn/lpcnet.h b/dnn/lpcnet.h
index adcba515..ec39dc24 100644
--- a/dnn/lpcnet.h
+++ b/dnn/lpcnet.h
@@ -120,7 +120,7 @@ int lpcnet_encode(LPCNetEncState *st, const opus_int16 *pcm, unsigned char *buf)
* @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors
* @retval 0 Success
*/
-int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]);
+int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES], int arch);
/** Compute features on LPCNET_FRAME_SIZE speech samples (currently 160) and output features for one 10-ms frame.
@@ -129,7 +129,7 @@ int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *p
* @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors
* @retval 0 Success
*/
-int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]);
+int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES], int arch);
/** Gets the size of an <code>LPCNetState</code> structure.
* @returns The size in bytes.
diff --git a/dnn/lpcnet_demo.c b/dnn/lpcnet_demo.c
index cfa9f6fd..aad31190 100644
--- a/dnn/lpcnet_demo.c
+++ b/dnn/lpcnet_demo.c
@@ -37,6 +37,7 @@
#include "freq.h"
#include "os_support.h"
#include "fargan.h"
+#include "cpu_support.h"
#ifdef USE_WEIGHTS_FILE
# if __unix__
@@ -99,12 +100,14 @@ void usage(void) {
int main(int argc, char **argv) {
int mode=0;
+ int arch;
FILE *fin, *fout;
#ifdef USE_WEIGHTS_FILE
int len;
unsigned char *data;
const char *filename = "weights_blob.bin";
#endif
+ arch = opus_select_arch();
if (argc < 4) usage();
if (strcmp(argv[1], "-features") == 0) mode=MODE_FEATURES;
else if (strcmp(argv[1], "-fargan-synthesis") == 0) mode=MODE_FARGAN_SYNTHESIS;
@@ -137,7 +140,7 @@ int main(int argc, char **argv) {
size_t ret;
ret = fread(pcm, sizeof(pcm[0]), LPCNET_FRAME_SIZE, fin);
if (feof(fin) || ret != LPCNET_FRAME_SIZE) break;
- lpcnet_compute_single_frame_features(net, pcm, features);
+ lpcnet_compute_single_frame_features(net, pcm, features, arch);
fwrite(features, sizeof(float), NB_TOTAL_FEATURES, fout);
}
lpcnet_encoder_destroy(net);
diff --git a/dnn/lpcnet_enc.c b/dnn/lpcnet_enc.c
index c2c5578b..8e3164df 100644
--- a/dnn/lpcnet_enc.c
+++ b/dnn/lpcnet_enc.c
@@ -95,7 +95,7 @@ static void biquad(float *y, float mem[2], const float *x, const float *b, const
#define celt_log10(x) (0.3010299957f*celt_log2(x))
-void compute_frame_features(LPCNetEncState *st, const float *in) {
+void compute_frame_features(LPCNetEncState *st, const float *in, int arch) {
float aligned_in[FRAME_SIZE];
int i;
float Ly[NB_BANDS];
@@ -142,7 +142,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
OPUS_COPY(&x[0], st->pitch_mem, LPC_ORDER);
OPUS_COPY(&x[LPC_ORDER], aligned_in, FRAME_SIZE);
OPUS_COPY(st->pitch_mem, &aligned_in[FRAME_SIZE-LPC_ORDER], LPC_ORDER);
- celt_fir(&x[LPC_ORDER], st->lpc, &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE, LPC_ORDER, st->arch);
+ celt_fir(&x[LPC_ORDER], st->lpc, &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE, LPC_ORDER, arch);
for (i=0;i<FRAME_SIZE;i++) {
st->exc_buf[PITCH_MAX_PERIOD+i] = st->lp_buf[PITCH_MAX_PERIOD+i] + .7f*st->pitch_filt;
st->pitch_filt = st->lp_buf[PITCH_MAX_PERIOD+i];
@@ -152,7 +152,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
{
double ener1;
float *buf = st->exc_buf;
- celt_pitch_xcorr(&buf[PITCH_MAX_PERIOD], buf, xcorr, FRAME_SIZE, PITCH_MAX_PERIOD-PITCH_MIN_PERIOD, st->arch);
+ celt_pitch_xcorr(&buf[PITCH_MAX_PERIOD], buf, xcorr, FRAME_SIZE, PITCH_MAX_PERIOD-PITCH_MIN_PERIOD, arch);
ener0 = celt_inner_prod_c(&buf[PITCH_MAX_PERIOD], &buf[PITCH_MAX_PERIOD], FRAME_SIZE);
ener1 = celt_inner_prod_c(&buf[0], &buf[0], FRAME_SIZE-1);
/*printf("%f\n", st->frame_weight[sub]);*/
@@ -165,7 +165,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
}
/*printf("\n");*/
}
- st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features);
+ st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features, arch);
}
void process_single_frame(LPCNetEncState *st, FILE *ffeat) {
@@ -196,26 +196,26 @@ void preemphasis(float *y, float *mem, const float *x, float coef, int N) {
}
}
-static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES]) {
+static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES], int arch) {
preemphasis(x, &st->mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
- compute_frame_features(st, x);
+ compute_frame_features(st, x, arch);
process_single_frame(st, NULL);
OPUS_COPY(features, &st->features[0], NB_TOTAL_FEATURES);
return 0;
}
-int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]) {
+int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES], int arch) {
int i;
float x[FRAME_SIZE];
for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
- lpcnet_compute_single_frame_features_impl(st, x, features);
+ lpcnet_compute_single_frame_features_impl(st, x, features, arch);
return 0;
}
-int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]) {
+int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES], int arch) {
int i;
float x[FRAME_SIZE];
for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
- lpcnet_compute_single_frame_features_impl(st, x, features);
+ lpcnet_compute_single_frame_features_impl(st, x, features, arch);
return 0;
}
diff --git a/dnn/lpcnet_plc.c b/dnn/lpcnet_plc.c
index de3ab1a7..b713110f 100644
--- a/dnn/lpcnet_plc.c
+++ b/dnn/lpcnet_plc.c
@@ -33,6 +33,7 @@
#include "plc_data.h"
#include "os_support.h"
#include "common.h"
+#include "cpu_support.h"
#ifndef M_PI
#define M_PI 3.141592653
@@ -54,6 +55,7 @@ void lpcnet_plc_reset(LPCNetPLCState *st) {
int lpcnet_plc_init(LPCNetPLCState *st) {
int ret;
+ st->arch = opus_select_arch();
fargan_init(&st->fargan);
lpcnet_encoder_init(&st->enc);
st->analysis_pos = PLC_BUF_SIZE;
@@ -109,10 +111,10 @@ static void compute_plc_pred(LPCNetPLCState *st, float *out, const float *in) {
float dense_out[PLC_DENSE1_OUT_SIZE];
PLCNetState *net = &st->plc_net;
celt_assert(st->loaded);
- _lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in);
- compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out);
- compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state);
- _lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state);
+ _lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in, st->arch);
+ compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out, st->arch);
+ compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state, st->arch);
+ _lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state, st->arch);
}
static int get_fec_or_pred(LPCNetPLCState *st, float *out) {
@@ -164,7 +166,7 @@ int lpcnet_plc_conceal(LPCNetPLCState *st, opus_int16 *pcm) {
float plc_features[2*NB_BANDS+NB_FEATURES+1];
for (i=0;i<FRAME_SIZE;i++) x[i] = 32768.f*st->pcm[st->analysis_pos+i];
burg_cepstral_analysis(plc_features, x);
- lpcnet_compute_single_frame_features_float(&st->enc, x, st->features);
+ lpcnet_compute_single_frame_features_float(&st->enc, x, st->features, st->arch);
if ((st->analysis_gap && count > 0) || count > 1) {
queue_features(st, st->features);
OPUS_COPY(&plc_features[2*NB_BANDS], st->features, NB_FEATURES);
diff --git a/dnn/lpcnet_private.h b/dnn/lpcnet_private.h
index 30931b1d..9a68c718 100644
--- a/dnn/lpcnet_private.h
+++ b/dnn/lpcnet_private.h
@@ -24,7 +24,6 @@
struct LPCNetEncState{
PitchDNNState pitchdnn;
- int arch;
float analysis_mem[OVERLAP_SIZE];
float mem_preemph;
kiss_fft_cpx prev_if[PITCH_IF_MAX_FREQ];
@@ -67,7 +66,7 @@ struct LPCNetPLCState {
void preemphasis(float *y, float *mem, const float *x, float coef, int N);
-void compute_frame_features(LPCNetEncState *st, const float *in);
+void compute_frame_features(LPCNetEncState *st, const float *in, int arch);
void lpcnet_reset_signal(LPCNetState *lpcnet);
void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b_condition, float *lpc, const float *features);
@@ -79,7 +78,6 @@ void lpcnet_synthesize_tail_impl(LPCNetState *lpcnet, opus_int16 *output, int N,
void lpcnet_synthesize_impl(LPCNetState *lpcnet, const float *features, opus_int16 *output, int N, int preload);
void lpcnet_synthesize_blend_impl(LPCNetState *lpcnet, const opus_int16 *pcm_in, opus_int16 *output, int N);
void process_single_frame(LPCNetEncState *st, FILE *ffeat);
-int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]);
void process_single_frame(LPCNetEncState *st, FILE *ffeat);
diff --git a/dnn/nnet.c b/dnn/nnet.c
index c76e9f28..22fda89b 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -69,50 +69,16 @@ static OPUS_INLINE float relu(float x)
return x < 0 ? 0 : x;
}
-static void compute_linear(const LinearLayer *linear, float *out, const float *in)
+void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch)
{
- int i, M, N;
- const float *bias;
- celt_assert(in != out);
- bias = linear->bias;
- M = linear->nb_inputs;
- N = linear->nb_outputs;
- if (linear->float_weights != NULL) {
- if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in);
- else sgemv(out, linear->float_weights, N, M, N, in);
- } else if (linear->weights != NULL) {
- if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in);
- else cgemv8x4(out, linear->weights, linear->scale, N, M, in);
- /* Only use SU biases on for integer matrices on SU archs. */
-#ifdef USE_SU_BIAS
- bias = linear->subias;
-#endif
- }
- else OPUS_CLEAR(out, N);
- if (bias != NULL) {
- for (i=0;i<N;i++) out[i] += bias[i];
- }
- if (linear->diag) {
- /* Diag is only used for GRU recurrent weights. */
- celt_assert(3*M == N);
- for (i=0;i<M;i++) {
- out[i] += linear->diag[i]*in[i];
- out[i+M] += linear->diag[i+M]*in[i];
- out[i+2*M] += linear->diag[i+2*M]*in[i];
- }
- }
-}
-
-void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation)
-{
- compute_linear(layer, output, input);
+ compute_linear(layer, output, input, arch);
compute_activation(output, output, layer->nb_outputs, activation);
}
#define MAX_RNN_NEURONS_ALL IMAX(IMAX(FARGAN_MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS), DRED_MAX_RNN_NEURONS)
-void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in)
+void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in, int arch)
{
int i;
int N;
@@ -129,8 +95,8 @@ void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *re
h = &zrh[2*N];
celt_assert(recurrent_weights->nb_outputs <= 3*MAX_RNN_NEURONS_ALL);
celt_assert(in != state);
- compute_linear(input_weights, zrh, in);
- compute_linear(recurrent_weights, recur, state);
+ compute_linear(input_weights, zrh, in, arch);
+ compute_linear(recurrent_weights, recur, state, arch);
for (i=0;i<2*N;i++)
zrh[i] += recur[i];
compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
@@ -143,12 +109,12 @@ void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *re
state[i] = h[i];
}
-void compute_glu(const LinearLayer *layer, float *output, const float *input)
+void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch)
{
int i;
float act2[MAX_INPUTS];
celt_assert(layer->nb_inputs == layer->nb_outputs);
- compute_linear(layer, act2, input);
+ compute_linear(layer, act2, input, arch);
compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID);
if (input == output) {
/* Give a vectorization hint to the compiler for the in-place case. */
@@ -194,7 +160,7 @@ void compute_activation(float *output, const float *input, int N, int activation
}
}
-void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input)
+void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch)
{
LinearLayer matrix;
celt_assert(input != output);
@@ -207,7 +173,7 @@ void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *
matrix.nb_inputs = layer->nb_inputs;
matrix.nb_outputs = layer->nb_neurons;
matrix.scale = NULL;
- compute_linear(&matrix, output, input);
+ compute_linear(&matrix, output, input, arch);
compute_activation(output, output, layer->nb_neurons, layer->activation);
}
@@ -218,7 +184,7 @@ void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *
#endif
#define MAX_IDX_SIZE 8192
-void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input)
+void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input, int arch)
{
LinearLayer in_matrix, rec_matrix;
int i, M, N;
@@ -262,25 +228,25 @@ void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *stat
rec_matrix.float_weights = NULL;
#endif
rec_matrix.weights_idx = NULL;
- compute_generic_gru(&in_matrix, &rec_matrix, state, input);
+ compute_generic_gru(&in_matrix, &rec_matrix, state, input, arch);
}
#define MAX_CONV_INPUTS_ALL DRED_MAX_CONV_INPUTS
-void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation)
+void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation, int arch)
{
float tmp[MAX_CONV_INPUTS_ALL];
celt_assert(input != output);
celt_assert(layer->nb_inputs <= MAX_CONV_INPUTS_ALL);
OPUS_COPY(tmp, mem, layer->nb_inputs-input_size);
OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
- compute_linear(layer, output, tmp);
+ compute_linear(layer, output, tmp, arch);
compute_activation(output, output, layer->nb_outputs, activation);
OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
}
-void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation)
+void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation, int arch)
{
float tmp[MAX_CONV_INPUTS_ALL];
int ksize = layer->nb_inputs/input_size;
@@ -290,7 +256,7 @@ void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, fl
if (dilation==1) OPUS_COPY(tmp, mem, layer->nb_inputs-input_size);
else for (i=0;i<ksize-1;i++) OPUS_COPY(&tmp[i*input_size], &mem[i*input_size*dilation], input_size);
OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
- compute_linear(layer, output, tmp);
+ compute_linear(layer, output, tmp, arch);
compute_activation(output, output, layer->nb_outputs, activation);
if (dilation==1) OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
else {
diff --git a/dnn/nnet.h b/dnn/nnet.h
index 64b59d66..c8240ffc 100644
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -126,18 +126,18 @@ typedef struct {
int dim;
} EmbeddingLayer;
-void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation);
-void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in);
-void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation);
-void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation);
-void compute_glu(const LinearLayer *layer, float *output, const float *input);
-void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation);
+void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch);
+void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in, int arch);
+void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation, int arch);
+void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation, int arch);
+void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch);
+void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation, int arch);
void compute_activation(float *output, const float *input, int N, int activation);
-void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input);
+void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch);
-void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input);
+void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input, int arch);
@@ -189,4 +189,25 @@ int gru_init(GRULayer *layer, const WeightArray *arrays,
void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);
+
+void compute_linear_c(const LinearLayer *linear, float *out, const float *in);
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2)
+#include "x86/dnn_x86.h"
+#endif
+
+#ifndef OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_c(linear, out, in))
+#endif
+
+#if defined(__x86_64__) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
+#if defined(_MSC_VER)
+#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
+#else
+#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
+#endif
+#endif
+
+
+
#endif /* NNET_H_ */
diff --git a/dnn/nnet_arch.h b/dnn/nnet_arch.h
new file mode 100644
index 00000000..00198579
--- /dev/null
+++ b/dnn/nnet_arch.h
@@ -0,0 +1,76 @@
+/* Copyright (c) 2018-2019 Mozilla
+ 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef NNET_ARCH_H
+#define NNET_ARCH_H
+
+#include "nnet.h"
+#include "arch.h"
+#include "os_support.h"
+#include "vec.h"
+
+#define CAT_SUFFIX2(a,b) a ## b
+#define CAT_SUFFIX(a,b) CAT_SUFFIX2(a, b)
+
+#define RTCD_SUF(name) CAT_SUFFIX(name, RTCD_ARCH)
+
+void RTCD_SUF(compute_linear_) (const LinearLayer *linear, float *out, const float *in)
+{
+ int i, M, N;
+ const float *bias;
+ celt_assert(in != out);
+ bias = linear->bias;
+ M = linear->nb_inputs;
+ N = linear->nb_outputs;
+ if (linear->float_weights != NULL) {
+ if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in);
+ else sgemv(out, linear->float_weights, N, M, N, in);
+ } else if (linear->weights != NULL) {
+ if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in);
+ else cgemv8x4(out, linear->weights, linear->scale, N, M, in);
+ /* Only use SU biases on for integer matrices on SU archs. */
+#ifdef USE_SU_BIAS
+ bias = linear->subias;
+#endif
+ }
+ else OPUS_CLEAR(out, N);
+ if (bias != NULL) {
+ for (i=0;i<N;i++) out[i] += bias[i];
+ }
+ if (linear->diag) {
+ /* Diag is only used for GRU recurrent weights. */
+ celt_assert(3*M == N);
+ for (i=0;i<M;i++) {
+ out[i] += linear->diag[i]*in[i];
+ out[i+M] += linear->diag[i+M]*in[i];
+ out[i+2*M] += linear->diag[i+2*M]*in[i];
+ }
+ }
+}
+
+
+#endif
diff --git a/dnn/nnet_default.c b/dnn/nnet_default.c
new file mode 100644
index 00000000..4316f0fb
--- /dev/null
+++ b/dnn/nnet_default.c
@@ -0,0 +1,35 @@
+/* Copyright (c) 2018-2019 Mozilla
+ 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#define RTCD_ARCH c
+
+#include "nnet_arch.h"
diff --git a/dnn/pitchdnn.c b/dnn/pitchdnn.c
index 5bb3a57c..1ca15dc6 100644
--- a/dnn/pitchdnn.c
+++ b/dnn/pitchdnn.c
@@ -12,7 +12,8 @@
float compute_pitchdnn(
PitchDNNState *st,
const float *if_features,
- const float *xcorr_features
+ const float *xcorr_features,
+ int arch
)
{
float if1_out[DENSE_IF_UPSAMPLER_1_OUT_SIZE];
@@ -28,16 +29,16 @@ float compute_pitchdnn(
float count=0;
PitchDNN *model = &st->model;
/* IF */
- compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH);
- compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH);
+ compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH, arch);
+ compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH, arch);
/* xcorr*/
OPUS_COPY(&conv1_tmp1[1], xcorr_features, NB_XCORR_FEATURES);
compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH);
compute_conv2d(&model->conv2d_2, downsampler_in, st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, NB_XCORR_FEATURES, ACTIVATION_TANH);
- compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH);
- compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out);
- compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR);
+ compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH, arch);
+ compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out, arch);
+ compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR, arch);
for (i=0;i<180;i++) {
if (output[i] > maxval) {
pos = i;
@@ -65,7 +66,6 @@ void pitchdnn_init(PitchDNNState *st)
ret = 0;
#endif
celt_assert(ret == 0);
- /* FIXME: perform arch detection. */
}
int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len) {
diff --git a/dnn/pitchdnn.h b/dnn/pitchdnn.h
index cdc4eb16..ed821412 100644
--- a/dnn/pitchdnn.h
+++ b/dnn/pitchdnn.h
@@ -27,7 +27,8 @@ int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len);
float compute_pitchdnn(
PitchDNNState *st,
const float *if_features,
- const float *xcorr_features
+ const float *xcorr_features,
+ int arch
);
#endif
diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h
index f0625158..73a55a22 100644
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -655,11 +655,6 @@ static inline mm256i_emu opus_mm256_dpbusds_epi32(mm256i_emu src, mm256i_emu a,
return res;
}
-#if defined(_MSC_VER)
-#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
-#else
-#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
-#endif
#else
diff --git a/dnn/x86/dnn_x86.h b/dnn/x86/dnn_x86.h
new file mode 100644
index 00000000..c0f2ffae
--- /dev/null
+++ b/dnn/x86/dnn_x86.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2011-2019 Mozilla
+ 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef DNN_X86_H
+#define DNN_X86_H
+
+#include "cpu_support.h"
+#include "opus_types.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2)
+void compute_linear_sse2(const LinearLayer *linear, float *out, const float *in);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+void compute_linear_sse4_1(const LinearLayer *linear, float *out, const float *in);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_AVX2)
+void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in);
+#endif
+
+
+#if defined(OPUS_X86_PRESUME_AVX2)
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_avx2(linear, out, in))
+
+#elif defined(OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse4_1(linear, out, in))
+
+#elif defined(OPUS_X86_PRESUME_SSE2) && !defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse2(linear, out, in))
+
+#elif defined(OPUS_HAVE_RTCD) && (defined(OPUS_X86_MAY_HAVE_AVX2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2))
+
+extern void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
+ const LinearLayer *linear,
+ float *out,
+ const float *in
+ );
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) \
+ ((*DNN_COMPUTE_LINEAR_IMPL[(arch) & OPUS_ARCHMASK])(linear, out, in))
+
+#endif
+
+
+
+#endif /* DNN_X86_H */
diff --git a/dnn/x86/nnet_avx2.c b/dnn/x86/nnet_avx2.c
new file mode 100644
index 00000000..f463b324
--- /dev/null
+++ b/dnn/x86/nnet_avx2.c
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018-2019 Mozilla
+ 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __AVX2__
+#error nnet_avx2.c is being compiled without AVX2 enabled
+#endif
+
+#define RTCD_ARCH avx2
+
+#include "nnet_arch.h"
diff --git a/dnn/x86/nnet_sse2.c b/dnn/x86/nnet_sse2.c
new file mode 100644
index 00000000..bcee5ccc
--- /dev/null
+++ b/dnn/x86/nnet_sse2.c
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018-2019 Mozilla
+ 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __SSE2__
+#error nnet_sse2.c is being compiled without SSE2 enabled
+#endif
+
+#define RTCD_ARCH sse2
+
+#include "nnet_arch.h"
diff --git a/dnn/x86/nnet_sse4_1.c b/dnn/x86/nnet_sse4_1.c
new file mode 100644
index 00000000..4b530b65
--- /dev/null
+++ b/dnn/x86/nnet_sse4_1.c
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018-2019 Mozilla
+ 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __SSE4_1__
+#error nnet_sse4_1.c is being compiled without SSE4.1 enabled
+#endif
+
+#define RTCD_ARCH sse4_1
+
+#include "nnet_arch.h"
diff --git a/dnn/x86/x86_dnn_map.c b/dnn/x86/x86_dnn_map.c
new file mode 100644
index 00000000..35e061ff
--- /dev/null
+++ b/dnn/x86/x86_dnn_map.c
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018-2019 Mozilla
+ 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "x86/x86cpu.h"
+#include "nnet.h"
+
+#if defined(OPUS_HAVE_RTCD)
+
+#if (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_AVX2))
+
+void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
+ const LinearLayer *linear,
+ float *out,
+ const float *in
+) = {
+ compute_linear_c, /* non-sse */
+ compute_linear_c,
+ MAY_HAVE_SSE2(compute_linear),
+ MAY_HAVE_SSE4_1(compute_linear), /* sse4.1 */
+ MAY_HAVE_AVX2(compute_linear) /* avx */
+};
+
+#endif
+
+
+#endif
diff --git a/lpcnet_headers.mk b/lpcnet_headers.mk
index be8cf301..d3aa1516 100644
--- a/lpcnet_headers.mk
+++ b/lpcnet_headers.mk
@@ -12,7 +12,9 @@ dnn/vec.h \
dnn/vec_avx.h \
dnn/vec_neon.h \
dnn/pitchdnn.h \
-dnn/pitchdnn_data.h
+dnn/pitchdnn_data.h \
+dnn/x86/dnn_x86.h \
+dnn/nnet_arch.h
DRED_HEAD = \
silk/dred_coding.h \
diff --git a/lpcnet_sources.mk b/lpcnet_sources.mk
index 09b8b462..ee3d79fd 100644
--- a/lpcnet_sources.mk
+++ b/lpcnet_sources.mk
@@ -7,6 +7,7 @@ dnn/lpcnet_enc.c \
dnn/lpcnet_plc.c \
dnn/lpcnet_tables.c \
dnn/nnet.c \
+dnn/nnet_default.c \
dnn/plc_data.c \
dnn/parse_lpcnet_weights.c \
dnn/pitchdnn.c \
@@ -21,3 +22,8 @@ dnn/dred_rdovae_stats_data.c \
silk/dred_encoder.c \
silk/dred_coding.c \
silk/dred_decoder.c
+
+DNN_SOURCES_X86_RTCD = dnn/x86/x86_dnn_map.c
+DNN_SOURCES_AVX2 = dnn/x86/nnet_avx2.c
+DNN_SOURCES_SSE4_1 = dnn/x86/nnet_sse4_1.c
+DNN_SOURCES_SSE2 = dnn/x86/nnet_sse2.c
diff --git a/silk/dred_encoder.c b/silk/dred_encoder.c
index b567a223..64ff2c7c 100644
--- a/silk/dred_encoder.c
+++ b/silk/dred_encoder.c
@@ -87,7 +87,7 @@ void dred_encoder_init(DREDEnc* enc, opus_int32 Fs, int channels)
dred_encoder_reset(enc);
}
-static void dred_process_frame(DREDEnc *enc)
+static void dred_process_frame(DREDEnc *enc, int arch)
{
float feature_buffer[2 * 36];
float input_buffer[2*DRED_NUM_FEATURES] = {0};
@@ -97,15 +97,15 @@ static void dred_process_frame(DREDEnc *enc)
OPUS_MOVE(enc->latents_buffer + DRED_LATENT_DIM, enc->latents_buffer, (DRED_MAX_FRAMES - 1) * DRED_LATENT_DIM);
/* calculate LPCNet features */
- lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer, feature_buffer);
- lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer + DRED_FRAME_SIZE, feature_buffer + 36);
+ lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer, feature_buffer, arch);
+ lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer + DRED_FRAME_SIZE, feature_buffer + 36, arch);
/* prepare input buffer (discard LPC coefficients) */
OPUS_COPY(input_buffer, feature_buffer, DRED_NUM_FEATURES);
OPUS_COPY(input_buffer + DRED_NUM_FEATURES, feature_buffer + 36, DRED_NUM_FEATURES);
/* run RDOVAE encoder */
- dred_rdovae_encode_dframe(&enc->rdovae_enc, &enc->model, enc->latents_buffer, enc->state_buffer, input_buffer);
+ dred_rdovae_encode_dframe(&enc->rdovae_enc, &enc->model, enc->latents_buffer, enc->state_buffer, input_buffer, arch);
enc->latents_buffer_fill = IMIN(enc->latents_buffer_fill+1, DRED_NUM_REDUNDANCY_FRAMES);
}
@@ -188,7 +188,7 @@ static void dred_convert_to_16k(DREDEnc *enc, const float *in, int in_len, float
}
}
-void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay)
+void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch)
{
int curr_offset16k;
int frame_size16k = frame_size * 16000 / enc->Fs;
@@ -206,7 +206,7 @@ void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int ex
if (enc->input_buffer_fill >= 2*DRED_FRAME_SIZE)
{
curr_offset16k += 320;
- dred_process_frame(enc);
+ dred_process_frame(enc, arch);
enc->input_buffer_fill -= 2*DRED_FRAME_SIZE;
OPUS_MOVE(&enc->input_buffer[0], &enc->input_buffer[2*DRED_FRAME_SIZE], enc->input_buffer_fill);
/* 15 ms (6*2.5 ms) is the ideal offset for DRED because it corresponds to our vocoder look-ahead. */
diff --git a/silk/dred_encoder.h b/silk/dred_encoder.h
index abeaac7f..d1d2376d 100644
--- a/silk/dred_encoder.h
+++ b/silk/dred_encoder.h
@@ -64,7 +64,7 @@ void dred_encoder_reset(DREDEnc* enc);
void dred_deinit_encoder(DREDEnc *enc);
-void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay);
+void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch);
int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes);
diff --git a/src/opus_decoder.c b/src/opus_decoder.c
index 73be6f3b..1e0a1da4 100644
--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@@ -1424,7 +1424,7 @@ int opus_dred_process(OpusDREDDecoder *dred_dec, const OpusDRED *src, OpusDRED *
OPUS_COPY(dst, src, 1);
if (dst->process_stage == 2)
return OPUS_OK;
- DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents);
+ DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents, dred_dec->arch);
dst->process_stage = 2;
return OPUS_OK;
#else
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 27b3196a..28da18af 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -1715,7 +1715,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
#ifdef ENABLE_DRED
if ( st->dred_duration > 0 && st->dred_encoder.loaded ) {
/* DRED Encoder */
- dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer );
+ dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer, st->arch );
} else {
st->dred_encoder.latents_buffer_fill = 0;
}