author     Jean-Marc Valin <jmvalin@amazon.com>    2023-11-15 01:54:05 +0300
committer  Jean-Marc Valin <jmvalin@amazon.com>    2023-11-16 07:46:01 +0300
commit     4bfc0f85553a3dad5393da1b188b9ecf8f44407a (patch)
tree       fdc7b907981c344cd332d5074e54a6d461b9e3fb
parent     2e034f6f312d752440b9e26afa82b0752c34d97b (diff)
Adding RTCD for compute_activation() (exp_dnn_rtcd6)
-rw-r--r--  dnn/nnet.c             | 70
-rw-r--r--  dnn/nnet.h             |  9
-rw-r--r--  dnn/nnet_arch.h        | 55
-rw-r--r--  dnn/pitchdnn.c         |  4
-rw-r--r--  dnn/x86/dnn_x86.h      | 22
-rw-r--r--  dnn/x86/x86_dnn_map.c  | 13
-rw-r--r--  silk/dred_encoder.c    | 12
-rw-r--r--  silk/dred_encoder.h    |  2
-rw-r--r--  src/opus_encoder.c     |  2
9 files changed, 116 insertions, 73 deletions
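
The change itself is mechanical: compute_activation() moves from dnn/nnet.c into dnn/nnet_arch.h, where it is compiled once per SIMD flavor under an RTCD_SUF() name suffix (compute_activation_c, _sse2, _sse4_1, _avx2), and every call site gains an explicit arch argument. Depending on the build, the compute_activation() macro either discards arch and calls one hard-wired implementation, or indexes a function-pointer table with it. Below is a minimal sketch of that dispatch pattern, with simplified names and table layout (the real table is sized OPUS_ARCHMASK + 1 and filled via the MAY_HAVE_* macros), not the verbatim Opus code:

    /* Sketch of RTCD dispatch: one symbol per SIMD level, a table
     * indexed by the detected arch, and a macro at the call site.
     * Simplified for illustration; see dnn/x86/x86_dnn_map.c below
     * for the real table. */
    typedef void (*activation_fn)(float *out, const float *in,
                                  int N, int activation);

    void compute_activation_c(float *out, const float *in, int N, int activation);
    void compute_activation_sse2(float *out, const float *in, int N, int activation);
    void compute_activation_sse4_1(float *out, const float *in, int N, int activation);
    void compute_activation_avx2(float *out, const float *in, int N, int activation);

    /* Entry i is the fastest kernel safe for arch level i. */
    static const activation_fn ACTIVATION_IMPL[] = {
       compute_activation_c,      /* no SIMD */
       compute_activation_c,      /* SSE     */
       compute_activation_sse2,   /* SSE2    */
       compute_activation_sse4_1, /* SSE4.1  */
       compute_activation_avx2    /* AVX2    */
    };

    /* The real macro masks with OPUS_ARCHMASK before indexing. */
    #define compute_activation(out, in, N, act, arch) \
       (ACTIVATION_IMPL[(arch)](out, in, N, act))

The arch value is a small integer computed once at initialization, so the per-call cost is a single indirect call; single-arch builds bypass the table entirely with ((void)(arch), compute_activation_xxx(...)).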
diff --git a/dnn/nnet.c b/dnn/nnet.c
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -52,27 +52,11 @@
 
 #define SOFTMAX_HACK
 
-#define MAX_ACTIVATIONS (4096)
-
-static OPUS_INLINE void vec_swish(float *y, const float *x, int N)
-{
-   int i;
-   float tmp[MAX_ACTIVATIONS];
-   celt_assert(N <= MAX_ACTIVATIONS);
-   vec_sigmoid(tmp, x, N);
-   for (i=0;i<N;i++)
-      y[i] = x[i]*tmp[i];
-}
-
-static OPUS_INLINE float relu(float x)
-{
-   return x < 0 ? 0 : x;
-}
-
 void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch)
 {
    compute_linear(layer, output, input, arch);
-   compute_activation(output, output, layer->nb_outputs, activation);
+   compute_activation(output, output, layer->nb_outputs, activation, arch);
 }
 
 #define MAX_RNN_NEURONS_ALL IMAX(IMAX(FARGAN_MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS), DRED_MAX_RNN_NEURONS)
@@ -99,10 +83,10 @@ void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *re
    compute_linear(recurrent_weights, recur, state, arch);
    for (i=0;i<2*N;i++)
       zrh[i] += recur[i];
-   compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
+   compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID, arch);
    for (i=0;i<N;i++)
       h[i] += recur[2*N+i]*r[i];
-   compute_activation(h, h, N, ACTIVATION_TANH);
+   compute_activation(h, h, N, ACTIVATION_TANH, arch);
    for (i=0;i<N;i++)
       h[i] = z[i]*state[i] + (1-z[i])*h[i];
    for (i=0;i<N;i++)
@@ -115,7 +99,7 @@ void compute_glu(const LinearLayer *layer, float *output, const float *input, in
    float act2[MAX_INPUTS];
    celt_assert(layer->nb_inputs == layer->nb_outputs);
    compute_linear(layer, act2, input, arch);
-   compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID);
+   compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID, arch);
    if (input == output) {
      /* Give a vectorization hint to the compiler for the in-place case. */
      for (i=0;i<layer->nb_outputs;i++) output[i] = output[i]*act2[i];
@@ -124,42 +108,6 @@ void compute_glu(const LinearLayer *layer, float *output, const float *input, in
    }
 }
 
-void compute_activation(float *output, const float *input, int N, int activation)
-{
-   int i;
-   if (activation == ACTIVATION_SIGMOID) {
-      vec_sigmoid(output, input, N);
-   } else if (activation == ACTIVATION_TANH) {
-      vec_tanh(output, input, N);
-   } else if (activation == ACTIVATION_SWISH) {
-      vec_swish(output, input, N);
-   } else if (activation == ACTIVATION_RELU) {
-      for (i=0;i<N;i++)
-         output[i] = relu(input[i]);
-   } else if (activation == ACTIVATION_SOFTMAX) {
-#ifdef SOFTMAX_HACK
-      OPUS_COPY(output, input, N);
-      /*for (i=0;i<N;i++)
-         output[i] = input[i];*/
-#else
-      float sum = 0;
-      softmax(output, input, N);
-      for (i=0;i<N;i++) {
-         sum += output[i];
-      }
-      sum = 1.f/(sum+1e-30);
-      for (i=0;i<N;i++)
-         output[i] = sum*output[i];
-#endif
-   } else {
-      celt_assert(activation == ACTIVATION_LINEAR);
-      if (input != output) {
-         for (i=0;i<N;i++)
-            output[i] = input[i];
-      }
-   }
-}
-
 void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch)
 {
    LinearLayer matrix;
@@ -174,7 +122,7 @@ void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *
    matrix.nb_outputs = layer->nb_neurons;
    matrix.scale = NULL;
    compute_linear(&matrix, output, input, arch);
-   compute_activation(output, output, layer->nb_neurons, layer->activation);
+   compute_activation(output, output, layer->nb_neurons, layer->activation, arch);
 }
 
 #ifdef USE_SU_BIAS
@@ -242,7 +190,7 @@ void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem,
    OPUS_COPY(tmp, mem, layer->nb_inputs-input_size);
    OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
    compute_linear(layer, output, tmp, arch);
-   compute_activation(output, output, layer->nb_outputs, activation);
+   compute_activation(output, output, layer->nb_outputs, activation, arch);
    OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
 }
 
@@ -257,7 +205,7 @@ void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, fl
    else for (i=0;i<ksize-1;i++) OPUS_COPY(&tmp[i*input_size], &mem[i*input_size*dilation], input_size);
    OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
    compute_linear(layer, output, tmp, arch);
-   compute_activation(output, output, layer->nb_outputs, activation);
+   compute_activation(output, output, layer->nb_outputs, activation, arch);
    if (dilation==1) OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
    else {
      OPUS_COPY(mem, &mem[input_size], input_size*dilation*(ksize-1)-input_size);
@@ -325,7 +273,7 @@ static void conv2d_3x3_float(float *out, const float *weights, int in_channels,
 
 #define MAX_CONV2D_INPUTS 8192
 
-void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation)
+void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation, int arch)
 {
    int i;
    const float *bias;
@@ -349,6 +297,6 @@ void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float
       }
    }
    for (i=0;i<conv->out_channels;i++) {
-      compute_activation(&out[i*hstride], &out[i*hstride], height, activation);
+      compute_activation(&out[i*hstride], &out[i*hstride], height, activation, arch);
    }
 }
diff --git a/dnn/nnet.h b/dnn/nnet.h
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -133,7 +133,6 @@ void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, fl
 void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch);
 void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation, int arch);
 
-void compute_activation(float *output, const float *input, int N, int activation);
 
 void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch);
 
@@ -186,11 +185,12 @@ int gru_init(GRULayer *layer, const WeightArray *arrays,
              int activation,
              int reset_after);
 
-void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);
+void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation, int arch);
 
 void compute_linear_c(const LinearLayer *linear, float *out, const float *in);
+void compute_activation_c(float *output, const float *input, int N, int activation);
 
 #if defined(OPUS_X86_MAY_HAVE_SSE2)
 #include "x86/dnn_x86.h"
 #endif
@@ -200,6 +200,11 @@ void compute_linear_c(const LinearLayer *linear, float *out, const float *in);
 #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_c(linear, out, in))
 #endif
 
+#ifndef OVERRIDE_COMPUTE_ACTIVATION
+#define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_c(output, input, N, activation))
+#endif
+
+
 #if defined(__x86_64__) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
 #if defined(_MSC_VER)
 #pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
diff --git a/dnn/nnet_arch.h b/dnn/nnet_arch.h
index 00198579..4d577f8d 100644
--- a/dnn/nnet_arch.h
+++ b/dnn/nnet_arch.h
@@ -38,6 +38,61 @@
 
 #define RTCD_SUF(name) CAT_SUFFIX(name, RTCD_ARCH)
 
+
+#define MAX_ACTIVATIONS (4096)
+
+static OPUS_INLINE void vec_swish(float *y, const float *x, int N)
+{
+   int i;
+   float tmp[MAX_ACTIVATIONS];
+   celt_assert(N <= MAX_ACTIVATIONS);
+   vec_sigmoid(tmp, x, N);
+   for (i=0;i<N;i++)
+      y[i] = x[i]*tmp[i];
+}
+
+static OPUS_INLINE float relu(float x)
+{
+   return x < 0 ? 0 : x;
+}
+
+void RTCD_SUF(compute_activation_)(float *output, const float *input, int N, int activation)
+{
+   int i;
+   if (activation == ACTIVATION_SIGMOID) {
+      vec_sigmoid(output, input, N);
+   } else if (activation == ACTIVATION_TANH) {
+      vec_tanh(output, input, N);
+   } else if (activation == ACTIVATION_SWISH) {
+      vec_swish(output, input, N);
+   } else if (activation == ACTIVATION_RELU) {
+      for (i=0;i<N;i++)
+         output[i] = relu(input[i]);
+   } else if (activation == ACTIVATION_SOFTMAX) {
+#ifdef SOFTMAX_HACK
+      OPUS_COPY(output, input, N);
+      /*for (i=0;i<N;i++)
+         output[i] = input[i];*/
+#else
+      float sum = 0;
+      softmax(output, input, N);
+      for (i=0;i<N;i++) {
+         sum += output[i];
+      }
+      sum = 1.f/(sum+1e-30);
+      for (i=0;i<N;i++)
+         output[i] = sum*output[i];
+#endif
+   } else {
+      celt_assert(activation == ACTIVATION_LINEAR);
+      if (input != output) {
+         for (i=0;i<N;i++)
+            output[i] = input[i];
+      }
+   }
+}
+
+
 void RTCD_SUF(compute_linear_) (const LinearLayer *linear, float *out, const float *in)
 {
    int i, M, N;
diff --git a/dnn/pitchdnn.c b/dnn/pitchdnn.c
index 1ca15dc6..ae95ca32 100644
--- a/dnn/pitchdnn.c
+++ b/dnn/pitchdnn.c
@@ -33,8 +33,8 @@ float compute_pitchdnn(
    compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH, arch);
    /* xcorr*/
    OPUS_COPY(&conv1_tmp1[1], xcorr_features, NB_XCORR_FEATURES);
-   compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH);
-   compute_conv2d(&model->conv2d_2, downsampler_in, st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, NB_XCORR_FEATURES, ACTIVATION_TANH);
+   compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH, arch);
+   compute_conv2d(&model->conv2d_2, downsampler_in, st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, NB_XCORR_FEATURES, ACTIVATION_TANH, arch);
    compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH, arch);
    compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out, arch);
diff --git a/dnn/x86/dnn_x86.h b/dnn/x86/dnn_x86.h
index c0f2ffae..94f95ce8 100644
--- a/dnn/x86/dnn_x86.h
+++ b/dnn/x86/dnn_x86.h
@@ -33,14 +33,17 @@
 
 #if defined(OPUS_X86_MAY_HAVE_SSE2)
 void compute_linear_sse2(const LinearLayer *linear, float *out, const float *in);
+void compute_activation_sse2(float *output, const float *input, int N, int activation);
 #endif
 
 #if defined(OPUS_X86_MAY_HAVE_SSE4_1)
 void compute_linear_sse4_1(const LinearLayer *linear, float *out, const float *in);
+void compute_activation_sse4_1(float *output, const float *input, int N, int activation);
 #endif
 
 #if defined(OPUS_X86_MAY_HAVE_AVX2)
 void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in);
+void compute_activation_avx2(float *output, const float *input, int N, int activation);
 #endif
 
@@ -48,16 +51,22 @@ void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in)
 #define OVERRIDE_COMPUTE_LINEAR
 #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_avx2(linear, out, in))
 
+#define OVERRIDE_COMPUTE_ACTIVATION
+#define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_avx2(output, input, N, activation))
+
 #elif defined(OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
 
 #define OVERRIDE_COMPUTE_LINEAR
 #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse4_1(linear, out, in))
 
+#define OVERRIDE_COMPUTE_ACTIVATION
+#define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_sse4_1(output, input, N, activation))
+
 #elif defined(OPUS_X86_PRESUME_SSE2) && !defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
 
 #define OVERRIDE_COMPUTE_LINEAR
 #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse2(linear, out, in))
 
+#define OVERRIDE_COMPUTE_ACTIVATION
+#define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_sse2(output, input, N, activation))
+
 #elif defined(OPUS_HAVE_RTCD) && (defined(OPUS_X86_MAY_HAVE_AVX2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2))
 
@@ -66,11 +75,22 @@ extern void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
                   float *out,
                   const float *in
                   );
-
 #define OVERRIDE_COMPUTE_LINEAR
 #define compute_linear(linear, out, in, arch) \
     ((*DNN_COMPUTE_LINEAR_IMPL[(arch) & OPUS_ARCHMASK])(linear, out, in))
+
+extern void (*const DNN_COMPUTE_ACTIVATION_IMPL[OPUS_ARCHMASK + 1])(
+                  float *output,
+                  const float *input,
+                  int N,
+                  int activation
+                  );
+#define OVERRIDE_COMPUTE_ACTIVATION
+#define compute_activation(output, input, N, activation, arch) \
+    ((*DNN_COMPUTE_ACTIVATION_IMPL[(arch) & OPUS_ARCHMASK])(output, input, N, activation))
+
+
 #endif
diff --git a/dnn/x86/x86_dnn_map.c b/dnn/x86/x86_dnn_map.c
index 35e061ff..f39ae372 100644
--- a/dnn/x86/x86_dnn_map.c
+++ b/dnn/x86/x86_dnn_map.c
@@ -48,6 +48,19 @@ void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
    MAY_HAVE_AVX2(compute_linear) /* avx */
 };
 
+void (*const DNN_COMPUTE_ACTIVATION_IMPL[OPUS_ARCHMASK + 1])(
+         float *output,
+         const float *input,
+         int N,
+         int activation
+) = {
+   compute_activation_c,                /* non-sse */
+   compute_activation_c,
+   MAY_HAVE_SSE2(compute_activation),
+   MAY_HAVE_SSE4_1(compute_activation), /* sse4.1 */
+   MAY_HAVE_AVX2(compute_activation)    /* avx */
+};
+
 #endif
diff --git a/silk/dred_encoder.c b/silk/dred_encoder.c
index 64ff2c7c..f791115a 100644
--- a/silk/dred_encoder.c
+++ b/silk/dred_encoder.c
@@ -223,7 +223,7 @@ void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int ex
    }
 }
 
-static void dred_encode_latents(ec_enc *enc, const float *x, const opus_uint8 *scale, const opus_uint8 *dzone, const opus_uint8 *r, const opus_uint8 *p0, int dim) {
+static void dred_encode_latents(ec_enc *enc, const float *x, const opus_uint8 *scale, const opus_uint8 *dzone, const opus_uint8 *r, const opus_uint8 *p0, int dim, int arch) {
    int i;
    int q[IMAX(DRED_LATENT_DIM,DRED_STATE_DIM)];
    float xq[IMAX(DRED_LATENT_DIM,DRED_STATE_DIM)];
@@ -237,7 +237,7 @@ static void dred_encode_latents(ec_enc *enc, const float *x, const opus_uint8 *s
      xq[i] = x[i]*scale[i]*(1.f/256.f);
      deadzone[i] = xq[i]/(delta[i]+eps);
    }
-   compute_activation(deadzone, deadzone, dim, ACTIVATION_TANH);
+   compute_activation(deadzone, deadzone, dim, ACTIVATION_TANH, arch);
    for (i=0;i<dim;i++) {
      xq[i] = xq[i] - delta[i]*deadzone[i];
      q[i] = (int)floor(.5f+xq[i]);
@@ -249,7 +249,7 @@ static void dred_encode_latents(ec_enc *enc, const float *x, const opus_uint8 *s
    }
 }
 
-int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes) {
+int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes, int arch) {
    ec_enc ec_encoder;
    int q_level;
 
@@ -275,7 +275,8 @@ int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunk
          dred_state_dead_zone_q8 + state_qoffset,
          dred_state_r_q8 + state_qoffset,
          dred_state_p0_q8 + state_qoffset,
-         DRED_STATE_DIM);
+         DRED_STATE_DIM,
+         arch);
    if (ec_tell(&ec_encoder) > 8*max_bytes) {
      return 0;
    }
@@ -294,7 +295,8 @@ int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunk
            dred_latent_dead_zone_q8 + offset,
            dred_latent_r_q8 + offset,
            dred_latent_p0_q8 + offset,
-           DRED_LATENT_DIM
+           DRED_LATENT_DIM,
+           arch
            );
      if (ec_tell(&ec_encoder) > 8*max_bytes) {
        ec_encoder = ec_bak;
diff --git a/silk/dred_encoder.h b/silk/dred_encoder.h
index d1d2376d..795bee4f 100644
--- a/silk/dred_encoder.h
+++ b/silk/dred_encoder.h
@@ -66,6 +66,6 @@ void dred_deinit_encoder(DREDEnc *enc);
 
 void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch);
 
-int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes);
+int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes, int arch);
 
 #endif
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 28da18af..69197494 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -2272,7 +2272,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
            buf[0] = 'D';
            buf[1] = DRED_EXPERIMENTAL_VERSION;
 #endif
-           dred_bytes = dred_encode_silk_frame(&st->dred_encoder, buf+DRED_EXPERIMENTAL_BYTES, dred_chunks, dred_bytes_left-DRED_EXPERIMENTAL_BYTES);
+           dred_bytes = dred_encode_silk_frame(&st->dred_encoder, buf+DRED_EXPERIMENTAL_BYTES, dred_chunks, dred_bytes_left-DRED_EXPERIMENTAL_BYTES, st->arch);
            if (dred_bytes > 0) {
               dred_bytes += DRED_EXPERIMENTAL_BYTES;
               celt_assert(dred_bytes <= dred_bytes_left);
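
As a usage note, callers obtain the arch value once and thread it down, as opus_encode_native() does above with st->arch. A hedged sketch of a standalone caller (opus_select_arch() is the existing Opus RTCD helper; run_tanh() and its buffer are made-up names for illustration):

    #include "cpu_support.h"  /* opus_select_arch() */
    #include "nnet.h"         /* compute_activation(), ACTIVATION_TANH */

    /* Hypothetical caller: detect CPU capabilities once, then pass the
     * result to every kernel so dispatch stays a cheap table lookup. */
    static void run_tanh(float *buf, int n)
    {
       int arch = opus_select_arch();
       compute_activation(buf, buf, n, ACTIVATION_TANH, arch);
    }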