Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.xiph.org/xiph/opus.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJean-Marc Valin <jmvalin@amazon.com>2023-11-14 02:26:31 +0300
committerJean-Marc Valin <jmvalin@amazon.com>2023-11-16 07:45:32 +0300
commit2e034f6f312d752440b9e26afa82b0752c34d97b (patch)
treeb3d5e0785b3a538517f234c94ba1c0b4fdcecc47
parentb0620c0bf9864d9b18ead6b4bb6e0800542a931d (diff)
Adding RTCD for DNN code
Starting with compute_linear()
-rw-r--r--Makefile.am19
-rw-r--r--celt/x86/x86cpu.h2
-rw-r--r--dnn/dred_rdovae_dec.c51
-rw-r--r--dnn/dred_rdovae_dec.h6
-rw-r--r--dnn/dred_rdovae_enc.c31
-rw-r--r--dnn/dred_rdovae_enc.h2
-rw-r--r--dnn/dump_data.c5
-rw-r--r--dnn/fargan.c35
-rw-r--r--dnn/lpcnet.h4
-rw-r--r--dnn/lpcnet_demo.c5
-rw-r--r--dnn/lpcnet_enc.c20
-rw-r--r--dnn/lpcnet_plc.c12
-rw-r--r--dnn/lpcnet_private.h4
-rw-r--r--dnn/nnet.c64
-rw-r--r--dnn/nnet.h37
-rw-r--r--dnn/nnet_arch.h76
-rw-r--r--dnn/nnet_default.c35
-rw-r--r--dnn/pitchdnn.c14
-rw-r--r--dnn/pitchdnn.h3
-rw-r--r--dnn/vec_avx.h5
-rw-r--r--dnn/x86/dnn_x86.h78
-rw-r--r--dnn/x86/nnet_avx2.c38
-rw-r--r--dnn/x86/nnet_sse2.c38
-rw-r--r--dnn/x86/nnet_sse4_1.c38
-rw-r--r--dnn/x86/x86_dnn_map.c54
-rw-r--r--lpcnet_headers.mk4
-rw-r--r--lpcnet_sources.mk6
-rw-r--r--silk/dred_encoder.c12
-rw-r--r--silk/dred_encoder.h2
-rw-r--r--src/opus_decoder.c2
-rw-r--r--src/opus_encoder.c2
31 files changed, 539 insertions(+), 165 deletions(-)
diff --git a/Makefile.am b/Makefile.am
index 452f6d22..046d3069 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -50,18 +50,30 @@ if CPU_X86
if HAVE_RTCD
CELT_SOURCES += $(CELT_SOURCES_X86_RTCD)
SILK_SOURCES += $(SILK_SOURCES_X86_RTCD)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_X86_RTCD)
+endif
endif
if HAVE_SSE
CELT_SOURCES += $(CELT_SOURCES_SSE)
endif
if HAVE_SSE2
CELT_SOURCES += $(CELT_SOURCES_SSE2)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_SSE2)
+endif
endif
if HAVE_SSE4_1
CELT_SOURCES += $(CELT_SOURCES_SSE4_1)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_SSE4_1)
+endif
endif
if HAVE_AVX2
CELT_SOURCES += $(CELT_SOURCES_AVX2)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_AVX2)
+endif
endif
endif
@@ -398,19 +410,22 @@ $(SSE_OBJ): CFLAGS += $(OPUS_X86_SSE_CFLAGS)
endif
if HAVE_SSE2
-SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo)
+SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo) \
+ $(DNN_SOURCES_SSE2:.c=.lo)
$(SSE2_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS)
endif
if HAVE_SSE4_1
SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \
+ $(DNN_SOURCES_SSE4_1:.c=.lo) \
$(SILK_SOURCES_SSE4_1:.c=.lo) \
$(SILK_SOURCES_FIXED_SSE4_1:.c=.lo)
$(SSE4_1_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS)
endif
if HAVE_AVX2
-AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo)
+AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo) \
+ $(DNN_SOURCES_AVX2:.c=.lo)
$(AVX2_OBJ): CFLAGS += $(OPUS_X86_AVX2_CFLAGS)
endif
diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h
index fe46d1d9..6ce10e60 100644
--- a/celt/x86/x86cpu.h
+++ b/celt/x86/x86cpu.h
@@ -47,7 +47,7 @@
# endif
# if defined(OPUS_X86_MAY_HAVE_AVX2)
-# define MAY_HAVE_AVX2(name) name ## _avx
+# define MAY_HAVE_AVX2(name) name ## _avx2
# else
# define MAY_HAVE_AVX2(name) name ## _c
# endif
diff --git a/dnn/dred_rdovae_dec.c b/dnn/dred_rdovae_dec.c
index e2b19b14..7797ee77 100644
--- a/dnn/dred_rdovae_dec.c
+++ b/dnn/dred_rdovae_dec.c
@@ -42,33 +42,35 @@ static void conv1_cond_init(float *mem, int len, int dilation, int *init)
*init = 1;
}
-void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents)
+void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents, int arch)
{
int i;
RDOVAEDecState dec;
memset(&dec, 0, sizeof(dec));
- dred_rdovae_dec_init_states(&dec, model, state);
+ dred_rdovae_dec_init_states(&dec, model, state, arch);
for (i = 0; i < 2*nb_latents; i += 2)
{
dred_rdovae_decode_qframe(
&dec,
model,
&features[2*i*DRED_NUM_FEATURES],
- &latents[(i/2)*DRED_LATENT_DIM]);
+ &latents[(i/2)*DRED_LATENT_DIM],
+ arch);
}
}
void dred_rdovae_dec_init_states(
RDOVAEDecState *h, /* io: state buffer handle */
const RDOVAEDec *model,
- const float *initial_state /* i: initial state */
+ const float *initial_state, /* i: initial state */
+ int arch
)
{
float hidden[DEC_HIDDEN_INIT_OUT_SIZE];
float state_init[DEC_GRU1_STATE_SIZE+DEC_GRU2_STATE_SIZE+DEC_GRU3_STATE_SIZE+DEC_GRU4_STATE_SIZE+DEC_GRU5_STATE_SIZE];
int counter=0;
- compute_generic_dense(&model->dec_hidden_init, hidden, initial_state, ACTIVATION_TANH);
- compute_generic_dense(&model->dec_gru_init, state_init, hidden, ACTIVATION_TANH);
+ compute_generic_dense(&model->dec_hidden_init, hidden, initial_state, ACTIVATION_TANH, arch);
+ compute_generic_dense(&model->dec_gru_init, state_init, hidden, ACTIVATION_TANH, arch);
OPUS_COPY(h->gru1_state, state_init, DEC_GRU1_STATE_SIZE);
counter += DEC_GRU1_STATE_SIZE;
OPUS_COPY(h->gru2_state, &state_init[counter], DEC_GRU2_STATE_SIZE);
@@ -86,7 +88,8 @@ void dred_rdovae_decode_qframe(
RDOVAEDecState *dec_state, /* io: state buffer handle */
const RDOVAEDec *model,
float *qframe, /* o: quadruple feature frame (four concatenated frames in reverse order) */
- const float *input /* i: latent vector */
+ const float *input, /* i: latent vector */
+ int arch
)
{
float buffer[DEC_DENSE1_OUT_SIZE + DEC_GRU1_OUT_SIZE + DEC_GRU2_OUT_SIZE + DEC_GRU3_OUT_SIZE + DEC_GRU4_OUT_SIZE + DEC_GRU5_OUT_SIZE
@@ -94,43 +97,43 @@ void dred_rdovae_decode_qframe(
int output_index = 0;
/* run encoder stack and concatenate output in buffer*/
- compute_generic_dense(&model->dec_dense1, &buffer[output_index], input, ACTIVATION_TANH);
+ compute_generic_dense(&model->dec_dense1, &buffer[output_index], input, ACTIVATION_TANH, arch);
output_index += DEC_DENSE1_OUT_SIZE;
- compute_generic_gru(&model->dec_gru1_input, &model->dec_gru1_recurrent, dec_state->gru1_state, buffer);
- compute_glu(&model->dec_glu1, &buffer[output_index], dec_state->gru1_state);
+ compute_generic_gru(&model->dec_gru1_input, &model->dec_gru1_recurrent, dec_state->gru1_state, buffer, arch);
+ compute_glu(&model->dec_glu1, &buffer[output_index], dec_state->gru1_state, arch);
output_index += DEC_GRU1_OUT_SIZE;
conv1_cond_init(dec_state->conv1_state, output_index, 1, &dec_state->initialized);
- compute_generic_conv1d(&model->dec_conv1, &buffer[output_index], dec_state->conv1_state, buffer, output_index, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->dec_conv1, &buffer[output_index], dec_state->conv1_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV1_OUT_SIZE;
- compute_generic_gru(&model->dec_gru2_input, &model->dec_gru2_recurrent, dec_state->gru2_state, buffer);
- compute_glu(&model->dec_glu2, &buffer[output_index], dec_state->gru2_state);
+ compute_generic_gru(&model->dec_gru2_input, &model->dec_gru2_recurrent, dec_state->gru2_state, buffer, arch);
+ compute_glu(&model->dec_glu2, &buffer[output_index], dec_state->gru2_state, arch);
output_index += DEC_GRU2_OUT_SIZE;
conv1_cond_init(dec_state->conv2_state, output_index, 1, &dec_state->initialized);
- compute_generic_conv1d(&model->dec_conv2, &buffer[output_index], dec_state->conv2_state, buffer, output_index, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->dec_conv2, &buffer[output_index], dec_state->conv2_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV2_OUT_SIZE;
- compute_generic_gru(&model->dec_gru3_input, &model->dec_gru3_recurrent, dec_state->gru3_state, buffer);
- compute_glu(&model->dec_glu3, &buffer[output_index], dec_state->gru3_state);
+ compute_generic_gru(&model->dec_gru3_input, &model->dec_gru3_recurrent, dec_state->gru3_state, buffer, arch);
+ compute_glu(&model->dec_glu3, &buffer[output_index], dec_state->gru3_state, arch);
output_index += DEC_GRU3_OUT_SIZE;
conv1_cond_init(dec_state->conv3_state, output_index, 1, &dec_state->initialized);
- compute_generic_conv1d(&model->dec_conv3, &buffer[output_index], dec_state->conv3_state, buffer, output_index, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->dec_conv3, &buffer[output_index], dec_state->conv3_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV3_OUT_SIZE;
- compute_generic_gru(&model->dec_gru4_input, &model->dec_gru4_recurrent, dec_state->gru4_state, buffer);
- compute_glu(&model->dec_glu4, &buffer[output_index], dec_state->gru4_state);
+ compute_generic_gru(&model->dec_gru4_input, &model->dec_gru4_recurrent, dec_state->gru4_state, buffer, arch);
+ compute_glu(&model->dec_glu4, &buffer[output_index], dec_state->gru4_state, arch);
output_index += DEC_GRU4_OUT_SIZE;
conv1_cond_init(dec_state->conv4_state, output_index, 1, &dec_state->initialized);
- compute_generic_conv1d(&model->dec_conv4, &buffer[output_index], dec_state->conv4_state, buffer, output_index, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->dec_conv4, &buffer[output_index], dec_state->conv4_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV4_OUT_SIZE;
- compute_generic_gru(&model->dec_gru5_input, &model->dec_gru5_recurrent, dec_state->gru5_state, buffer);
- compute_glu(&model->dec_glu5, &buffer[output_index], dec_state->gru5_state);
+ compute_generic_gru(&model->dec_gru5_input, &model->dec_gru5_recurrent, dec_state->gru5_state, buffer, arch);
+ compute_glu(&model->dec_glu5, &buffer[output_index], dec_state->gru5_state, arch);
output_index += DEC_GRU5_OUT_SIZE;
conv1_cond_init(dec_state->conv5_state, output_index, 1, &dec_state->initialized);
- compute_generic_conv1d(&model->dec_conv5, &buffer[output_index], dec_state->conv5_state, buffer, output_index, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->dec_conv5, &buffer[output_index], dec_state->conv5_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV5_OUT_SIZE;
- compute_generic_dense(&model->dec_output, qframe, buffer, ACTIVATION_LINEAR);
+ compute_generic_dense(&model->dec_output, qframe, buffer, ACTIVATION_LINEAR, arch);
}
diff --git a/dnn/dred_rdovae_dec.h b/dnn/dred_rdovae_dec.h
index 636f0ee0..4e66911c 100644
--- a/dnn/dred_rdovae_dec.h
+++ b/dnn/dred_rdovae_dec.h
@@ -46,8 +46,8 @@ struct RDOVAEDecStruct {
float conv5_state[DEC_CONV5_STATE_SIZE];
};
-void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state);
-void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z);
-void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents);
+void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state, int arch);
+void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z, int arch);
+void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents, int arch);
#endif
diff --git a/dnn/dred_rdovae_enc.c b/dnn/dred_rdovae_enc.c
index e159e632..4f13ae21 100644
--- a/dnn/dred_rdovae_enc.c
+++ b/dnn/dred_rdovae_enc.c
@@ -50,7 +50,8 @@ void dred_rdovae_encode_dframe(
const RDOVAEEnc *model,
float *latents, /* o: latent vector */
float *initial_state, /* o: initial state */
- const float *input /* i: double feature frame (concatenated) */
+ const float *input, /* i: double feature frame (concatenated) */
+ int arch
)
{
float padded_latents[DRED_PADDED_LATENT_DIM];
@@ -61,49 +62,49 @@ void dred_rdovae_encode_dframe(
int output_index = 0;
/* run encoder stack and concatenate output in buffer*/
- compute_generic_dense(&model->enc_dense1, &buffer[output_index], input, ACTIVATION_TANH);
+ compute_generic_dense(&model->enc_dense1, &buffer[output_index], input, ACTIVATION_TANH, arch);
output_index += ENC_DENSE1_OUT_SIZE;
- compute_generic_gru(&model->enc_gru1_input, &model->enc_gru1_recurrent, enc_state->gru1_state, buffer);
+ compute_generic_gru(&model->enc_gru1_input, &model->enc_gru1_recurrent, enc_state->gru1_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru1_state, ENC_GRU1_OUT_SIZE);
output_index += ENC_GRU1_OUT_SIZE;
conv1_cond_init(enc_state->conv1_state, output_index, 1, &enc_state->initialized);
- compute_generic_conv1d(&model->enc_conv1, &buffer[output_index], enc_state->conv1_state, buffer, output_index, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->enc_conv1, &buffer[output_index], enc_state->conv1_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += ENC_CONV1_OUT_SIZE;
- compute_generic_gru(&model->enc_gru2_input, &model->enc_gru2_recurrent, enc_state->gru2_state, buffer);
+ compute_generic_gru(&model->enc_gru2_input, &model->enc_gru2_recurrent, enc_state->gru2_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru2_state, ENC_GRU2_OUT_SIZE);
output_index += ENC_GRU2_OUT_SIZE;
conv1_cond_init(enc_state->conv2_state, output_index, 2, &enc_state->initialized);
- compute_generic_conv1d_dilation(&model->enc_conv2, &buffer[output_index], enc_state->conv2_state, buffer, output_index, 2, ACTIVATION_TANH);
+ compute_generic_conv1d_dilation(&model->enc_conv2, &buffer[output_index], enc_state->conv2_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV2_OUT_SIZE;
- compute_generic_gru(&model->enc_gru3_input, &model->enc_gru3_recurrent, enc_state->gru3_state, buffer);
+ compute_generic_gru(&model->enc_gru3_input, &model->enc_gru3_recurrent, enc_state->gru3_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru3_state, ENC_GRU3_OUT_SIZE);
output_index += ENC_GRU3_OUT_SIZE;
conv1_cond_init(enc_state->conv3_state, output_index, 2, &enc_state->initialized);
- compute_generic_conv1d_dilation(&model->enc_conv3, &buffer[output_index], enc_state->conv3_state, buffer, output_index, 2, ACTIVATION_TANH);
+ compute_generic_conv1d_dilation(&model->enc_conv3, &buffer[output_index], enc_state->conv3_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV3_OUT_SIZE;
- compute_generic_gru(&model->enc_gru4_input, &model->enc_gru4_recurrent, enc_state->gru4_state, buffer);
+ compute_generic_gru(&model->enc_gru4_input, &model->enc_gru4_recurrent, enc_state->gru4_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru4_state, ENC_GRU4_OUT_SIZE);
output_index += ENC_GRU4_OUT_SIZE;
conv1_cond_init(enc_state->conv4_state, output_index, 2, &enc_state->initialized);
- compute_generic_conv1d_dilation(&model->enc_conv4, &buffer[output_index], enc_state->conv4_state, buffer, output_index, 2, ACTIVATION_TANH);
+ compute_generic_conv1d_dilation(&model->enc_conv4, &buffer[output_index], enc_state->conv4_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV4_OUT_SIZE;
- compute_generic_gru(&model->enc_gru5_input, &model->enc_gru5_recurrent, enc_state->gru5_state, buffer);
+ compute_generic_gru(&model->enc_gru5_input, &model->enc_gru5_recurrent, enc_state->gru5_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru5_state, ENC_GRU5_OUT_SIZE);
output_index += ENC_GRU5_OUT_SIZE;
conv1_cond_init(enc_state->conv5_state, output_index, 2, &enc_state->initialized);
- compute_generic_conv1d_dilation(&model->enc_conv5, &buffer[output_index], enc_state->conv5_state, buffer, output_index, 2, ACTIVATION_TANH);
+ compute_generic_conv1d_dilation(&model->enc_conv5, &buffer[output_index], enc_state->conv5_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV5_OUT_SIZE;
- compute_generic_dense(&model->enc_zdense, padded_latents, buffer, ACTIVATION_LINEAR);
+ compute_generic_dense(&model->enc_zdense, padded_latents, buffer, ACTIVATION_LINEAR, arch);
OPUS_COPY(latents, padded_latents, DRED_LATENT_DIM);
/* next, calculate initial state */
- compute_generic_dense(&model->gdense1, state_hidden, buffer, ACTIVATION_TANH);
- compute_generic_dense(&model->gdense2, padded_state, state_hidden, ACTIVATION_LINEAR);
+ compute_generic_dense(&model->gdense1, state_hidden, buffer, ACTIVATION_TANH, arch);
+ compute_generic_dense(&model->gdense2, padded_state, state_hidden, ACTIVATION_LINEAR, arch);
OPUS_COPY(initial_state, padded_state, DRED_STATE_DIM);
}
diff --git a/dnn/dred_rdovae_enc.h b/dnn/dred_rdovae_enc.h
index 43a4e8b2..6fe537ee 100644
--- a/dnn/dred_rdovae_enc.h
+++ b/dnn/dred_rdovae_enc.h
@@ -46,7 +46,7 @@ struct RDOVAEEncStruct {
float conv5_state[2*ENC_CONV5_STATE_SIZE];
};
-void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input);
+void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input, int arch);
#endif
diff --git a/dnn/dump_data.c b/dnn/dump_data.c
index be1ff16e..e7acfb11 100644
--- a/dnn/dump_data.c
+++ b/dnn/dump_data.c
@@ -42,6 +42,7 @@
#include "lpcnet.h"
#include "lpcnet_private.h"
#include "os_support.h"
+#include "cpu_support.h"
static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
@@ -135,7 +136,9 @@ int main(int argc, char **argv) {
FILE *fnoise = NULL;
float noise_gain = 0;
long noise_size=0;
+ int arch;
srand(getpid());
+ arch = opus_select_arch();
st = lpcnet_encoder_create();
argv0=argv[0];
if (argc == 5 && strcmp(argv[1], "-btrain")==0) {
@@ -244,7 +247,7 @@ int main(int argc, char **argv) {
for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
/* PCM is delayed by 1/2 frame to make the features centered on the frames. */
for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);
- compute_frame_features(st, x);
+ compute_frame_features(st, x, arch);
if (fpcm) {
compute_noise(noisebuf, noise_std);
diff --git a/dnn/fargan.c b/dnn/fargan.c
index e0fa304c..5e01ebd0 100644
--- a/dnn/fargan.c
+++ b/dnn/fargan.c
@@ -36,6 +36,7 @@
#include "pitch.h"
#include "nnet.h"
#include "lpcnet_private.h"
+#include "cpu_support.h"
#define FARGAN_FEATURES (NB_FEATURES)
@@ -52,9 +53,9 @@ static void compute_fargan_cond(FARGANState *st, float *cond, const float *featu
OPUS_COPY(&dense_in[NB_FEATURES], &model->cond_net_pembed.float_weights[IMAX(0,IMIN(period-32, 224))*COND_NET_PEMBED_OUT_SIZE], COND_NET_PEMBED_OUT_SIZE);
OPUS_COPY(dense_in, features, NB_FEATURES);
- compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH);
- compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH);
- compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH);
+ compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH, st->arch);
+ compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH, st->arch);
+ compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH, st->arch);
}
static void fargan_deemphasis(float *pcm, float *deemph_mem) {
@@ -84,7 +85,7 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
celt_assert(st->cont_initialized);
model = &st->model;
- compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR);
+ compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR, st->arch);
gain = exp(gain);
gain_1 = 1.f/(1e-5f + gain);
@@ -100,26 +101,26 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE], pred, FARGAN_SUBFRAME_SIZE+4);
OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE+FARGAN_SUBFRAME_SIZE+4], prev, FARGAN_SUBFRAME_SIZE);
- compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH);
+ compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH, st->arch);
celt_assert(SIG_NET_FWC0_GLU_GATE_OUT_SIZE == model->sig_net_fwc0_glu_gate.nb_outputs);
- compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in);
+ compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in, st->arch);
- compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID);
+ compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+i] = pitch_gate[0]*pred[i+2];
OPUS_COPY(&gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
- compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in);
- compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state);
+ compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in, st->arch);
+ compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru2_in[SIG_NET_GRU1_OUT_SIZE+i] = pitch_gate[1]*pred[i+2];
OPUS_COPY(&gru2_in[SIG_NET_GRU1_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
- compute_generic_gru(&model->sig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in);
- compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state);
+ compute_generic_gru(&model->sig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in, st->arch);
+ compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru3_in[SIG_NET_GRU2_OUT_SIZE+i] = pitch_gate[2]*pred[i+2];
OPUS_COPY(&gru3_in[SIG_NET_GRU2_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
- compute_generic_gru(&model->sig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in);
- compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state);
+ compute_generic_gru(&model->sig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in, st->arch);
+ compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state, st->arch);
OPUS_COPY(skip_cat, gru2_in, SIG_NET_GRU1_OUT_SIZE);
OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE], gru3_in, SIG_NET_GRU2_OUT_SIZE);
@@ -127,10 +128,10 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+i] = pitch_gate[3]*pred[i+2];
OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
- compute_generic_dense(&model->sig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH);
- compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out);
+ compute_generic_dense(&model->sig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH, st->arch);
+ compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out, st->arch);
- compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH);
+ compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) pcm[i] *= gain;
OPUS_MOVE(st->pitch_buf, &st->pitch_buf[FARGAN_SUBFRAME_SIZE], PITCH_MAX_PERIOD-FARGAN_SUBFRAME_SIZE);
@@ -174,13 +175,13 @@ void fargan_init(FARGANState *st)
{
int ret;
OPUS_CLEAR(st, 1);
+ st->arch = opus_select_arch();
#ifndef USE_WEIGHTS_FILE
ret = init_fargan(&st->model, fargan_arrays);
#else
ret = 0;
#endif
celt_assert(ret == 0);
- /* FIXME: perform arch detection. */
}
int fargan_load_model(FARGANState *st, const unsigned char *data, int len) {
diff --git a/dnn/lpcnet.h b/dnn/lpcnet.h
index adcba515..ec39dc24 100644
--- a/dnn/lpcnet.h
+++ b/dnn/lpcnet.h
@@ -120,7 +120,7 @@ int lpcnet_encode(LPCNetEncState *st, const opus_int16 *pcm, unsigned char *buf)
* @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors
* @retval 0 Success
*/
-int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]);
+int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES], int arch);
/** Compute features on LPCNET_FRAME_SIZE speech samples (currently 160) and output features for one 10-ms frame.
@@ -129,7 +129,7 @@ int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *p
* @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors
* @retval 0 Success
*/
-int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]);
+int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES], int arch);
/** Gets the size of an <code>LPCNetState</code> structure.
* @returns The size in bytes.
diff --git a/dnn/lpcnet_demo.c b/dnn/lpcnet_demo.c
index cfa9f6fd..aad31190 100644
--- a/dnn/lpcnet_demo.c
+++ b/dnn/lpcnet_demo.c
@@ -37,6 +37,7 @@
#include "freq.h"
#include "os_support.h"
#include "fargan.h"
+#include "cpu_support.h"
#ifdef USE_WEIGHTS_FILE
# if __unix__
@@ -99,12 +100,14 @@ void usage(void) {
int main(int argc, char **argv) {
int mode=0;
+ int arch;
FILE *fin, *fout;
#ifdef USE_WEIGHTS_FILE
int len;
unsigned char *data;
const char *filename = "weights_blob.bin";
#endif
+ arch = opus_select_arch();
if (argc < 4) usage();
if (strcmp(argv[1], "-features") == 0) mode=MODE_FEATURES;
else if (strcmp(argv[1], "-fargan-synthesis") == 0) mode=MODE_FARGAN_SYNTHESIS;
@@ -137,7 +140,7 @@ int main(int argc, char **argv) {
size_t ret;
ret = fread(pcm, sizeof(pcm[0]), LPCNET_FRAME_SIZE, fin);
if (feof(fin) || ret != LPCNET_FRAME_SIZE) break;
- lpcnet_compute_single_frame_features(net, pcm, features);
+ lpcnet_compute_single_frame_features(net, pcm, features, arch);
fwrite(features, sizeof(float), NB_TOTAL_FEATURES, fout);
}
lpcnet_encoder_destroy(net);
diff --git a/dnn/lpcnet_enc.c b/dnn/lpcnet_enc.c
index c2c5578b..8e3164df 100644
--- a/dnn/lpcnet_enc.c
+++ b/dnn/lpcnet_enc.c
@@ -95,7 +95,7 @@ static void biquad(float *y, float mem[2], const float *x, const float *b, const
#define celt_log10(x) (0.3010299957f*celt_log2(x))
-void compute_frame_features(LPCNetEncState *st, const float *in) {
+void compute_frame_features(LPCNetEncState *st, const float *in, int arch) {
float aligned_in[FRAME_SIZE];
int i;
float Ly[NB_BANDS];
@@ -142,7 +142,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
OPUS_COPY(&x[0], st->pitch_mem, LPC_ORDER);
OPUS_COPY(&x[LPC_ORDER], aligned_in, FRAME_SIZE);
OPUS_COPY(st->pitch_mem, &aligned_in[FRAME_SIZE-LPC_ORDER], LPC_ORDER);
- celt_fir(&x[LPC_ORDER], st->lpc, &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE, LPC_ORDER, st->arch);
+ celt_fir(&x[LPC_ORDER], st->lpc, &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE, LPC_ORDER, arch);
for (i=0;i<FRAME_SIZE;i++) {
st->exc_buf[PITCH_MAX_PERIOD+i] = st->lp_buf[PITCH_MAX_PERIOD+i] + .7f*st->pitch_filt;
st->pitch_filt = st->lp_buf[PITCH_MAX_PERIOD+i];
@@ -152,7 +152,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
{
double ener1;
float *buf = st->exc_buf;
- celt_pitch_xcorr(&buf[PITCH_MAX_PERIOD], buf, xcorr, FRAME_SIZE, PITCH_MAX_PERIOD-PITCH_MIN_PERIOD, st->arch);
+ celt_pitch_xcorr(&buf[PITCH_MAX_PERIOD], buf, xcorr, FRAME_SIZE, PITCH_MAX_PERIOD-PITCH_MIN_PERIOD, arch);
ener0 = celt_inner_prod_c(&buf[PITCH_MAX_PERIOD], &buf[PITCH_MAX_PERIOD], FRAME_SIZE);
ener1 = celt_inner_prod_c(&buf[0], &buf[0], FRAME_SIZE-1);
/*printf("%f\n", st->frame_weight[sub]);*/
@@ -165,7 +165,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
}
/*printf("\n");*/
}
- st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features);
+ st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features, arch);
}
void process_single_frame(LPCNetEncState *st, FILE *ffeat) {
@@ -196,26 +196,26 @@ void preemphasis(float *y, float *mem, const float *x, float coef, int N) {
}
}
-static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES]) {
+static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES], int arch) {
preemphasis(x, &st->mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
- compute_frame_features(st, x);
+ compute_frame_features(st, x, arch);
process_single_frame(st, NULL);
OPUS_COPY(features, &st->features[0], NB_TOTAL_FEATURES);
return 0;
}
-int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]) {
+int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES], int arch) {
int i;
float x[FRAME_SIZE];
for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
- lpcnet_compute_single_frame_features_impl(st, x, features);
+ lpcnet_compute_single_frame_features_impl(st, x, features, arch);
return 0;
}
-int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]) {
+int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES], int arch) {
int i;
float x[FRAME_SIZE];
for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
- lpcnet_compute_single_frame_features_impl(st, x, features);
+ lpcnet_compute_single_frame_features_impl(st, x, features, arch);
return 0;
}
diff --git a/dnn/lpcnet_plc.c b/dnn/lpcnet_plc.c
index de3ab1a7..b713110f 100644
--- a/dnn/lpcnet_plc.c
+++ b/dnn/lpcnet_plc.c
@@ -33,6 +33,7 @@
#include "plc_data.h"
#include "os_support.h"
#include "common.h"
+#include "cpu_support.h"
#ifndef M_PI
#define M_PI 3.141592653
@@ -54,6 +55,7 @@ void lpcnet_plc_reset(LPCNetPLCState *st) {
int lpcnet_plc_init(LPCNetPLCState *st) {
int ret;
+ st->arch = opus_select_arch();
fargan_init(&st->fargan);
lpcnet_encoder_init(&st->enc);
st->analysis_pos = PLC_BUF_SIZE;
@@ -109,10 +111,10 @@ static void compute_plc_pred(LPCNetPLCState *st, float *out, const float *in) {
float dense_out[PLC_DENSE1_OUT_SIZE];
PLCNetState *net = &st->plc_net;
celt_assert(st->loaded);
- _lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in);
- compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out);
- compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state);
- _lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state);
+ _lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in, st->arch);
+ compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out, st->arch);
+ compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state, st->arch);
+ _lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state, st->arch);
}
static int get_fec_or_pred(LPCNetPLCState *st, float *out) {
@@ -164,7 +166,7 @@ int lpcnet_plc_conceal(LPCNetPLCState *st, opus_int16 *pcm) {
float plc_features[2*NB_BANDS+NB_FEATURES+1];
for (i=0;i<FRAME_SIZE;i++) x[i] = 32768.f*st->pcm[st->analysis_pos+i];
burg_cepstral_analysis(plc_features, x);
- lpcnet_compute_single_frame_features_float(&st->enc, x, st->features);
+ lpcnet_compute_single_frame_features_float(&st->enc, x, st->features, st->arch);
if ((st->analysis_gap && count > 0) || count > 1) {
queue_features(st, st->features);
OPUS_COPY(&plc_features[2*NB_BANDS], st->features, NB_FEATURES);
diff --git a/dnn/lpcnet_private.h b/dnn/lpcnet_private.h
index 30931b1d..9a68c718 100644
--- a/dnn/lpcnet_private.h
+++ b/dnn/lpcnet_private.h
@@ -24,7 +24,6 @@
struct LPCNetEncState{
PitchDNNState pitchdnn;
- int arch;
float analysis_mem[OVERLAP_SIZE];
float mem_preemph;
kiss_fft_cpx prev_if[PITCH_IF_MAX_FREQ];
@@ -67,7 +66,7 @@ struct LPCNetPLCState {
void preemphasis(float *y, float *mem, const float *x, float coef, int N);
-void compute_frame_features(LPCNetEncState *st, const float *in);
+void compute_frame_features(LPCNetEncState *st, const float *in, int arch);
void lpcnet_reset_signal(LPCNetState *lpcnet);
void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b_condition, float *lpc, const float *features);
@@ -79,7 +78,6 @@ void lpcnet_synthesize_tail_impl(LPCNetState *lpcnet, opus_int16 *output, int N,
void lpcnet_synthesize_impl(LPCNetState *lpcnet, const float *features, opus_int16 *output, int N, int preload);
void lpcnet_synthesize_blend_impl(LPCNetState *lpcnet, const opus_int16 *pcm_in, opus_int16 *output, int N);
void process_single_frame(LPCNetEncState *st, FILE *ffeat);
-int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]);
void process_single_frame(LPCNetEncState *st, FILE *ffeat);
diff --git a/dnn/nnet.c b/dnn/nnet.c
index c76e9f28..22fda89b 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -69,50 +69,16 @@ static OPUS_INLINE float relu(float x)
return x < 0 ? 0 : x;
}
-static void compute_linear(const LinearLayer *linear, float *out, const float *in)
+void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch)
{
- int i, M, N;
- const float *bias;
- celt_assert(in != out);
- bias = linear->bias;
- M = linear->nb_inputs;
- N = linear->nb_outputs;
- if (linear->float_weights != NULL) {
- if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in);
- else sgemv(out, linear->float_weights, N, M, N, in);
- } else if (linear->weights != NULL) {
- if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in);
- else cgemv8x4(out, linear->weights, linear->scale, N, M, in);
- /* Only use SU biases on for integer matrices on SU archs. */
-#ifdef USE_SU_BIAS
- bias = linear->subias;
-#endif
- }
- else OPUS_CLEAR(out, N);
- if (bias != NULL) {
- for (i=0;i<N;i++) out[i] += bias[i];
- }
- if (linear->diag) {
- /* Diag is only used for GRU recurrent weights. */
- celt_assert(3*M == N);
- for (i=0;i<M;i++) {
- out[i] += linear->diag[i]*in[i];
- out[i+M] += linear->diag[i+M]*in[i];
- out[i+2*M] += linear->diag[i+2*M]*in[i];
- }
- }
-}
-
-void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation)
-{
- compute_linear(layer, output, input);
+ compute_linear(layer, output, input, arch);
compute_activation(output, output, layer->nb_outputs, activation);
}
#define MAX_RNN_NEURONS_ALL IMAX(IMAX(FARGAN_MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS), DRED_MAX_RNN_NEURONS)
-void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in)
+void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in, int arch)
{
int i;
int N;
@@ -129,8 +95,8 @@ void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *re
h = &zrh[2*N];
celt_assert(recurrent_weights->nb_outputs <= 3*MAX_RNN_NEURONS_ALL);
celt_assert(in != state);
- compute_linear(input_weights, zrh, in);
- compute_linear(recurrent_weights, recur, state);
+ compute_linear(input_weights, zrh, in, arch);
+ compute_linear(recurrent_weights, recur, state, arch);
for (i=0;i<2*N;i++)
zrh[i] += recur[i];
compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
@@ -143,12 +109,12 @@ void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *re
state[i] = h[i];
}
-void compute_glu(const LinearLayer *layer, float *output, const float *input)
+void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch)
{
int i;
float act2[MAX_INPUTS];
celt_assert(layer->nb_inputs == layer->nb_outputs);
- compute_linear(layer, act2, input);
+ compute_linear(layer, act2, input, arch);
compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID);
if (input == output) {
/* Give a vectorization hint to the compiler for the in-place case. */
@@ -194,7 +160,7 @@ void compute_activation(float *output, const float *input, int N, int activation
}
}
-void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input)
+void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch)
{
LinearLayer matrix;
celt_assert(input != output);
@@ -207,7 +173,7 @@ void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *
matrix.nb_inputs = layer->nb_inputs;
matrix.nb_outputs = layer->nb_neurons;
matrix.scale = NULL;
- compute_linear(&matrix, output, input);
+ compute_linear(&matrix, output, input, arch);
compute_activation(output, output, layer->nb_neurons, layer->activation);
}
@@ -218,7 +184,7 @@ void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *
#endif
#define MAX_IDX_SIZE 8192
-void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input)
+void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input, int arch)
{
LinearLayer in_matrix, rec_matrix;
int i, M, N;
@@ -262,25 +228,25 @@ void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *stat
rec_matrix.float_weights = NULL;
#endif
rec_matrix.weights_idx = NULL;
- compute_generic_gru(&in_matrix, &rec_matrix, state, input);
+ compute_generic_gru(&in_matrix, &rec_matrix, state, input, arch);
}
#define MAX_CONV_INPUTS_ALL DRED_MAX_CONV_INPUTS
-void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation)
+void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation, int arch)
{
float tmp[MAX_CONV_INPUTS_ALL];
celt_assert(input != output);
celt_assert(layer->nb_inputs <= MAX_CONV_INPUTS_ALL);
OPUS_COPY(tmp, mem, layer->nb_inputs-input_size);
OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
- compute_linear(layer, output, tmp);
+ compute_linear(layer, output, tmp, arch);
compute_activation(output, output, layer->nb_outputs, activation);
OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
}
-void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation)
+void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation, int arch)
{
float tmp[MAX_CONV_INPUTS_ALL];
int ksize = layer->nb_inputs/input_size;
@@ -290,7 +256,7 @@ void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, fl
if (dilation==1) OPUS_COPY(tmp, mem, layer->nb_inputs-input_size);
else for (i=0;i<ksize-1;i++) OPUS_COPY(&tmp[i*input_size], &mem[i*input_size*dilation], input_size);
OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
- compute_linear(layer, output, tmp);
+ compute_linear(layer, output, tmp, arch);
compute_activation(output, output, layer->nb_outputs, activation);
if (dilation==1) OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
else {
diff --git a/dnn/nnet.h b/dnn/nnet.h
index 64b59d66..c8240ffc 100644
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -126,18 +126,18 @@ typedef struct {
int dim;
} EmbeddingLayer;
-void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation);
-void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in);
-void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation);
-void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation);
-void compute_glu(const LinearLayer *layer, float *output, const float *input);
-void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation);
+void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch);
+void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in, int arch);
+void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation, int arch);
+void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation, int arch);
+void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch);
+void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation, int arch);
void compute_activation(float *output, const float *input, int N, int activation);
-void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input);
+void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch);
-void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input);
+void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input, int arch);
@@ -189,4 +189,25 @@ int gru_init(GRULayer *layer, const WeightArray *arrays,
void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);
+
+void compute_linear_c(const LinearLayer *linear, float *out, const float *in);
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2)
+#include "x86/dnn_x86.h"
+#endif
+
+#ifndef OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_c(linear, out, in))
+#endif
+
+#if defined(__x86_64__) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
+#if defined(_MSC_VER)
+#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
+#else
+#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
+#endif
+#endif
+
+
+
#endif /* NNET_H_ */
diff --git a/dnn/nnet_arch.h b/dnn/nnet_arch.h
new file mode 100644
index 00000000..00198579
--- /dev/null
+++ b/dnn/nnet_arch.h
@@ -0,0 +1,76 @@
+/* Copyright (c) 2018-2019 Mozilla
+ 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef NNET_ARCH_H
+#define NNET_ARCH_H
+
+#include "nnet.h"
+#include "arch.h"
+#include "os_support.h"
+#include "vec.h"
+
+#define CAT_SUFFIX2(a,b) a ## b
+#define CAT_SUFFIX(a,b) CAT_SUFFIX2(a, b)
+
+#define RTCD_SUF(name) CAT_SUFFIX(name, RTCD_ARCH)
+
+void RTCD_SUF(compute_linear_) (const LinearLayer *linear, float *out, const float *in)
+{
+ int i, M, N;
+ const float *bias;
+ celt_assert(in != out);
+ bias = linear->bias;
+ M = linear->nb_inputs;
+ N = linear->nb_outputs;
+ if (linear->float_weights != NULL) {
+ if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in);
+ else sgemv(out, linear->float_weights, N, M, N, in);
+ } else if (linear->weights != NULL) {
+ if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in);
+ else cgemv8x4(out, linear->weights, linear->scale, N, M, in);
+ /* Only use SU biases on for integer matrices on SU archs. */
+#ifdef USE_SU_BIAS
+ bias = linear->subias;
+#endif
+ }
+ else OPUS_CLEAR(out, N);
+ if (bias != NULL) {
+ for (i=0;i<N;i++) out[i] += bias[i];
+ }
+ if (linear->diag) {
+ /* Diag is only used for GRU recurrent weights. */
+ celt_assert(3*M == N);
+ for (i=0;i<M;i++) {
+ out[i] += linear->diag[i]*in[i];
+ out[i+M] += linear->diag[i+M]*in[i];
+ out[i+2*M] += linear->diag[i+2*M]*in[i];
+ }
+ }
+}
+
+
+#endif
diff --git a/dnn/nnet_default.c b/dnn/nnet_default.c
new file mode 100644
index 00000000..4316f0fb
--- /dev/null
+++ b/dnn/nnet_default.c
@@ -0,0 +1,35 @@
+/* Copyright (c) 2018-2019 Mozilla
+ 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#define RTCD_ARCH c
+
+#include "nnet_arch.h"
diff --git a/dnn/pitchdnn.c b/dnn/pitchdnn.c
index 5bb3a57c..1ca15dc6 100644
--- a/dnn/pitchdnn.c
+++ b/dnn/pitchdnn.c
@@ -12,7 +12,8 @@
float compute_pitchdnn(
PitchDNNState *st,
const float *if_features,
- const float *xcorr_features
+ const float *xcorr_features,
+ int arch
)
{
float if1_out[DENSE_IF_UPSAMPLER_1_OUT_SIZE];
@@ -28,16 +29,16 @@ float compute_pitchdnn(
float count=0;
PitchDNN *model = &st->model;
/* IF */
- compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH);
- compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH);
+ compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH, arch);
+ compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH, arch);
/* xcorr*/
OPUS_COPY(&conv1_tmp1[1], xcorr_features, NB_XCORR_FEATURES);
compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH);
compute_conv2d(&model->conv2d_2, downsampler_in, st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, NB_XCORR_FEATURES, ACTIVATION_TANH);
- compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH);
- compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out);
- compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR);
+ compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH, arch);
+ compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out, arch);
+ compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR, arch);
for (i=0;i<180;i++) {
if (output[i] > maxval) {
pos = i;
@@ -65,7 +66,6 @@ void pitchdnn_init(PitchDNNState *st)
ret = 0;
#endif
celt_assert(ret == 0);
- /* FIXME: perform arch detection. */
}
int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len) {
diff --git a/dnn/pitchdnn.h b/dnn/pitchdnn.h
index cdc4eb16..ed821412 100644
--- a/dnn/pitchdnn.h
+++ b/dnn/pitchdnn.h
@@ -27,7 +27,8 @@ int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len);
float compute_pitchdnn(
PitchDNNState *st,
const float *if_features,
- const float *xcorr_features
+ const float *xcorr_features,
+ int arch
);
#endif
diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h
index f0625158..73a55a22 100644
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -655,11 +655,6 @@ static inline mm256i_emu opus_mm256_dpbusds_epi32(mm256i_emu src, mm256i_emu a,
return res;
}
-#if defined(_MSC_VER)
-#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
-#else
-#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
-#endif
#else
diff --git a/dnn/x86/dnn_x86.h b/dnn/x86/dnn_x86.h
new file mode 100644
index 00000000..c0f2ffae
--- /dev/null
+++ b/dnn/x86/dnn_x86.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2011-2019 Mozilla
+ 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef DNN_X86_H
+#define DNN_X86_H
+
+#include "cpu_support.h"
+#include "opus_types.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2)
+void compute_linear_sse2(const LinearLayer *linear, float *out, const float *in);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+void compute_linear_sse4_1(const LinearLayer *linear, float *out, const float *in);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_AVX2)
+void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in);
+#endif
+
+
+#if defined(OPUS_X86_PRESUME_AVX2)
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_avx2(linear, out, in))
+
+#elif defined(OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse4_1(linear, out, in))
+
+#elif defined(OPUS_X86_PRESUME_SSE2) && !defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse2(linear, out, in))
+
+#elif defined(OPUS_HAVE_RTCD) && (defined(OPUS_X86_MAY_HAVE_AVX2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2))
+
+extern void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
+ const LinearLayer *linear,
+ float *out,
+ const float *in
+ );
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) \
+ ((*DNN_COMPUTE_LINEAR_IMPL[(arch) & OPUS_ARCHMASK])(linear, out, in))
+
+#endif
+
+
+
+#endif /* DNN_X86_H */
diff --git a/dnn/x86/nnet_avx2.c b/dnn/x86/nnet_avx2.c
new file mode 100644
index 00000000..f463b324
--- /dev/null
+++ b/dnn/x86/nnet_avx2.c
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018-2019 Mozilla
+ 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __AVX2__
+#error nnet_avx2.c is being compiled without AVX2 enabled
+#endif
+
+#define RTCD_ARCH avx2
+
+#include "nnet_arch.h"
diff --git a/dnn/x86/nnet_sse2.c b/dnn/x86/nnet_sse2.c
new file mode 100644
index 00000000..bcee5ccc
--- /dev/null
+++ b/dnn/x86/nnet_sse2.c
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018-2019 Mozilla
+ 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __SSE2__
+#error nnet_sse2.c is being compiled without SSE2 enabled
+#endif
+
+#define RTCD_ARCH sse2
+
+#include "nnet_arch.h"
diff --git a/dnn/x86/nnet_sse4_1.c b/dnn/x86/nnet_sse4_1.c
new file mode 100644
index 00000000..4b530b65
--- /dev/null
+++ b/dnn/x86/nnet_sse4_1.c
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018-2019 Mozilla
+ 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __SSE4_1__
+#error nnet_sse4_1.c is being compiled without SSE4.1 enabled
+#endif
+
+#define RTCD_ARCH sse4_1
+
+#include "nnet_arch.h"
diff --git a/dnn/x86/x86_dnn_map.c b/dnn/x86/x86_dnn_map.c
new file mode 100644
index 00000000..35e061ff
--- /dev/null
+++ b/dnn/x86/x86_dnn_map.c
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018-2019 Mozilla
+ 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "x86/x86cpu.h"
+#include "nnet.h"
+
+#if defined(OPUS_HAVE_RTCD)
+
+#if (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_AVX2))
+
+void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
+ const LinearLayer *linear,
+ float *out,
+ const float *in
+) = {
+ compute_linear_c, /* non-sse */
+ compute_linear_c,
+ MAY_HAVE_SSE2(compute_linear),
+ MAY_HAVE_SSE4_1(compute_linear), /* sse4.1 */
+ MAY_HAVE_AVX2(compute_linear) /* avx */
+};
+
+#endif
+
+
+#endif
diff --git a/lpcnet_headers.mk b/lpcnet_headers.mk
index be8cf301..d3aa1516 100644
--- a/lpcnet_headers.mk
+++ b/lpcnet_headers.mk
@@ -12,7 +12,9 @@ dnn/vec.h \
dnn/vec_avx.h \
dnn/vec_neon.h \
dnn/pitchdnn.h \
-dnn/pitchdnn_data.h
+dnn/pitchdnn_data.h \
+dnn/x86/dnn_x86.h \
+dnn/nnet_arch.h
DRED_HEAD = \
silk/dred_coding.h \
diff --git a/lpcnet_sources.mk b/lpcnet_sources.mk
index 09b8b462..ee3d79fd 100644
--- a/lpcnet_sources.mk
+++ b/lpcnet_sources.mk
@@ -7,6 +7,7 @@ dnn/lpcnet_enc.c \
dnn/lpcnet_plc.c \
dnn/lpcnet_tables.c \
dnn/nnet.c \
+dnn/nnet_default.c \
dnn/plc_data.c \
dnn/parse_lpcnet_weights.c \
dnn/pitchdnn.c \
@@ -21,3 +22,8 @@ dnn/dred_rdovae_stats_data.c \
silk/dred_encoder.c \
silk/dred_coding.c \
silk/dred_decoder.c
+
+DNN_SOURCES_X86_RTCD = dnn/x86/x86_dnn_map.c
+DNN_SOURCES_AVX2 = dnn/x86/nnet_avx2.c
+DNN_SOURCES_SSE4_1 = dnn/x86/nnet_sse4_1.c
+DNN_SOURCES_SSE2 = dnn/x86/nnet_sse2.c
diff --git a/silk/dred_encoder.c b/silk/dred_encoder.c
index b567a223..64ff2c7c 100644
--- a/silk/dred_encoder.c
+++ b/silk/dred_encoder.c
@@ -87,7 +87,7 @@ void dred_encoder_init(DREDEnc* enc, opus_int32 Fs, int channels)
dred_encoder_reset(enc);
}
-static void dred_process_frame(DREDEnc *enc)
+static void dred_process_frame(DREDEnc *enc, int arch)
{
float feature_buffer[2 * 36];
float input_buffer[2*DRED_NUM_FEATURES] = {0};
@@ -97,15 +97,15 @@ static void dred_process_frame(DREDEnc *enc)
OPUS_MOVE(enc->latents_buffer + DRED_LATENT_DIM, enc->latents_buffer, (DRED_MAX_FRAMES - 1) * DRED_LATENT_DIM);
/* calculate LPCNet features */
- lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer, feature_buffer);
- lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer + DRED_FRAME_SIZE, feature_buffer + 36);
+ lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer, feature_buffer, arch);
+ lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer + DRED_FRAME_SIZE, feature_buffer + 36, arch);
/* prepare input buffer (discard LPC coefficients) */
OPUS_COPY(input_buffer, feature_buffer, DRED_NUM_FEATURES);
OPUS_COPY(input_buffer + DRED_NUM_FEATURES, feature_buffer + 36, DRED_NUM_FEATURES);
/* run RDOVAE encoder */
- dred_rdovae_encode_dframe(&enc->rdovae_enc, &enc->model, enc->latents_buffer, enc->state_buffer, input_buffer);
+ dred_rdovae_encode_dframe(&enc->rdovae_enc, &enc->model, enc->latents_buffer, enc->state_buffer, input_buffer, arch);
enc->latents_buffer_fill = IMIN(enc->latents_buffer_fill+1, DRED_NUM_REDUNDANCY_FRAMES);
}
@@ -188,7 +188,7 @@ static void dred_convert_to_16k(DREDEnc *enc, const float *in, int in_len, float
}
}
-void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay)
+void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch)
{
int curr_offset16k;
int frame_size16k = frame_size * 16000 / enc->Fs;
@@ -206,7 +206,7 @@ void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int ex
if (enc->input_buffer_fill >= 2*DRED_FRAME_SIZE)
{
curr_offset16k += 320;
- dred_process_frame(enc);
+ dred_process_frame(enc, arch);
enc->input_buffer_fill -= 2*DRED_FRAME_SIZE;
OPUS_MOVE(&enc->input_buffer[0], &enc->input_buffer[2*DRED_FRAME_SIZE], enc->input_buffer_fill);
/* 15 ms (6*2.5 ms) is the ideal offset for DRED because it corresponds to our vocoder look-ahead. */
diff --git a/silk/dred_encoder.h b/silk/dred_encoder.h
index abeaac7f..d1d2376d 100644
--- a/silk/dred_encoder.h
+++ b/silk/dred_encoder.h
@@ -64,7 +64,7 @@ void dred_encoder_reset(DREDEnc* enc);
void dred_deinit_encoder(DREDEnc *enc);
-void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay);
+void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch);
int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes);
diff --git a/src/opus_decoder.c b/src/opus_decoder.c
index 73be6f3b..1e0a1da4 100644
--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@@ -1424,7 +1424,7 @@ int opus_dred_process(OpusDREDDecoder *dred_dec, const OpusDRED *src, OpusDRED *
OPUS_COPY(dst, src, 1);
if (dst->process_stage == 2)
return OPUS_OK;
- DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents);
+ DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents, dred_dec->arch);
dst->process_stage = 2;
return OPUS_OK;
#else
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 27b3196a..28da18af 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -1715,7 +1715,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
#ifdef ENABLE_DRED
if ( st->dred_duration > 0 && st->dred_encoder.loaded ) {
/* DRED Encoder */
- dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer );
+ dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer, st->arch );
} else {
st->dred_encoder.latents_buffer_fill = 0;
}