gitlab.xiph.org/xiph/opus.git
author     Jean-Marc Valin <jmvalin@amazon.com>   2023-11-15 01:54:05 +0300
committer  Jean-Marc Valin <jmvalin@amazon.com>   2023-11-16 07:46:01 +0300
commit     4bfc0f85553a3dad5393da1b188b9ecf8f44407a (patch)
tree       fdc7b907981c344cd332d5074e54a6d461b9e3fb
parent     2e034f6f312d752440b9e26afa82b0752c34d97b (diff)
Adding RTCD for compute_activation() (ref: exp_dnn_rtcd6)
-rw-r--r--  dnn/nnet.c              70
-rw-r--r--  dnn/nnet.h               9
-rw-r--r--  dnn/nnet_arch.h         55
-rw-r--r--  dnn/pitchdnn.c           4
-rw-r--r--  dnn/x86/dnn_x86.h       22
-rw-r--r--  dnn/x86/x86_dnn_map.c   13
-rw-r--r--  silk/dred_encoder.c     12
-rw-r--r--  silk/dred_encoder.h      2
-rw-r--r--  src/opus_encoder.c       2
9 files changed, 116 insertions, 73 deletions
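What the change does: compute_activation() previously had a single portable C body in dnn/nnet.c and took no arch argument. This commit moves that body into dnn/nnet_arch.h so it can be compiled once per x86 SIMD level (SSE2, SSE4.1, AVX2) and adds an arch parameter that every caller threads through, mirroring the existing run-time CPU detection (RTCD) path for compute_linear(). The sketch below illustrates the general dispatch pattern with simplified, made-up names; it is not the Opus code itself.

/* Illustrative RTCD sketch (hypothetical names): one function pointer
 * per CPU level, selected at run time by an arch index that the caller
 * passes down alongside the data. */
#include <stdio.h>

typedef void (*act_fn)(float *out, const float *in, int n);

static void act_c(float *out, const float *in, int n)
{   /* portable fallback: ReLU as a stand-in for the real activations */
    int i;
    for (i = 0; i < n; i++) out[i] = in[i] > 0 ? in[i] : 0;
}

static void act_avx2(float *out, const float *in, int n)
{   /* a real build would use AVX2 intrinsics here */
    act_c(out, in, n);
}

static const act_fn ACT_IMPL[2] = { act_c, act_avx2 };
#define compute_act(out, in, n, arch) (ACT_IMPL[(arch) & 1](out, in, n))

int main(void)
{
    float v[4] = { -1.f, 2.f, -3.f, 4.f };
    compute_act(v, v, 4, 1);           /* arch chosen at run time */
    printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]);
    return 0;
}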
diff --git a/dnn/nnet.c b/dnn/nnet.c
index 22fda89b..a82c04ab 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -52,27 +52,11 @@
#define SOFTMAX_HACK
-#define MAX_ACTIVATIONS (4096)
-
-static OPUS_INLINE void vec_swish(float *y, const float *x, int N)
-{
- int i;
- float tmp[MAX_ACTIVATIONS];
- celt_assert(N <= MAX_ACTIVATIONS);
- vec_sigmoid(tmp, x, N);
- for (i=0;i<N;i++)
- y[i] = x[i]*tmp[i];
-}
-
-static OPUS_INLINE float relu(float x)
-{
- return x < 0 ? 0 : x;
-}
void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch)
{
compute_linear(layer, output, input, arch);
- compute_activation(output, output, layer->nb_outputs, activation);
+ compute_activation(output, output, layer->nb_outputs, activation, arch);
}
#define MAX_RNN_NEURONS_ALL IMAX(IMAX(FARGAN_MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS), DRED_MAX_RNN_NEURONS)
@@ -99,10 +83,10 @@ void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *re
compute_linear(recurrent_weights, recur, state, arch);
for (i=0;i<2*N;i++)
zrh[i] += recur[i];
- compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
+ compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID, arch);
for (i=0;i<N;i++)
h[i] += recur[2*N+i]*r[i];
- compute_activation(h, h, N, ACTIVATION_TANH);
+ compute_activation(h, h, N, ACTIVATION_TANH, arch);
for (i=0;i<N;i++)
h[i] = z[i]*state[i] + (1-z[i])*h[i];
for (i=0;i<N;i++)
@@ -115,7 +99,7 @@ void compute_glu(const LinearLayer *layer, float *output, const float *input, in
float act2[MAX_INPUTS];
celt_assert(layer->nb_inputs == layer->nb_outputs);
compute_linear(layer, act2, input, arch);
- compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID);
+ compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID, arch);
if (input == output) {
/* Give a vectorization hint to the compiler for the in-place case. */
for (i=0;i<layer->nb_outputs;i++) output[i] = output[i]*act2[i];
@@ -124,42 +108,6 @@ void compute_glu(const LinearLayer *layer, float *output, const float *input, in
}
}
-void compute_activation(float *output, const float *input, int N, int activation)
-{
- int i;
- if (activation == ACTIVATION_SIGMOID) {
- vec_sigmoid(output, input, N);
- } else if (activation == ACTIVATION_TANH) {
- vec_tanh(output, input, N);
- } else if (activation == ACTIVATION_SWISH) {
- vec_swish(output, input, N);
- } else if (activation == ACTIVATION_RELU) {
- for (i=0;i<N;i++)
- output[i] = relu(input[i]);
- } else if (activation == ACTIVATION_SOFTMAX) {
-#ifdef SOFTMAX_HACK
- OPUS_COPY(output, input, N);
- /*for (i=0;i<N;i++)
- output[i] = input[i];*/
-#else
- float sum = 0;
- softmax(output, input, N);
- for (i=0;i<N;i++) {
- sum += output[i];
- }
- sum = 1.f/(sum+1e-30);
- for (i=0;i<N;i++)
- output[i] = sum*output[i];
-#endif
- } else {
- celt_assert(activation == ACTIVATION_LINEAR);
- if (input != output) {
- for (i=0;i<N;i++)
- output[i] = input[i];
- }
- }
-}
-
void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch)
{
LinearLayer matrix;
@@ -174,7 +122,7 @@ void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *
matrix.nb_outputs = layer->nb_neurons;
matrix.scale = NULL;
compute_linear(&matrix, output, input, arch);
- compute_activation(output, output, layer->nb_neurons, layer->activation);
+ compute_activation(output, output, layer->nb_neurons, layer->activation, arch);
}
#ifdef USE_SU_BIAS
@@ -242,7 +190,7 @@ void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem,
OPUS_COPY(tmp, mem, layer->nb_inputs-input_size);
OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
compute_linear(layer, output, tmp, arch);
- compute_activation(output, output, layer->nb_outputs, activation);
+ compute_activation(output, output, layer->nb_outputs, activation, arch);
OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
}
@@ -257,7 +205,7 @@ void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, fl
else for (i=0;i<ksize-1;i++) OPUS_COPY(&tmp[i*input_size], &mem[i*input_size*dilation], input_size);
OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
compute_linear(layer, output, tmp, arch);
- compute_activation(output, output, layer->nb_outputs, activation);
+ compute_activation(output, output, layer->nb_outputs, activation, arch);
if (dilation==1) OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
else {
OPUS_COPY(mem, &mem[input_size], input_size*dilation*(ksize-1)-input_size);
@@ -325,7 +273,7 @@ static void conv2d_3x3_float(float *out, const float *weights, int in_channels,
#define MAX_CONV2D_INPUTS 8192
-void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation)
+void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation, int arch)
{
int i;
const float *bias;
@@ -349,6 +297,6 @@ void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float
}
}
for (i=0;i<conv->out_channels;i++) {
- compute_activation(&out[i*hstride], &out[i*hstride], height, activation);
+ compute_activation(&out[i*hstride], &out[i*hstride], height, activation, arch);
}
}
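Note on the nnet.c hunks above: the vec_swish and relu helpers and the whole compute_activation() body are removed here rather than edited; they reappear in dnn/nnet_arch.h below, where they can be compiled once per SIMD target. Every remaining call site in nnet.c simply forwards the same arch value it already received for compute_linear(), so the activation runs at the same SIMD level as the preceding matrix multiply.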
diff --git a/dnn/nnet.h b/dnn/nnet.h
index c8240ffc..f891fa3e 100644
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -133,7 +133,6 @@ void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, fl
void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch);
void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation, int arch);
-void compute_activation(float *output, const float *input, int N, int activation);
void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch);
@@ -186,11 +185,12 @@ int gru_init(GRULayer *layer, const WeightArray *arrays,
int activation,
int reset_after);
-void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);
+void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation, int arch);
void compute_linear_c(const LinearLayer *linear, float *out, const float *in);
+void compute_activation_c(float *output, const float *input, int N, int activation);
#if defined(OPUS_X86_MAY_HAVE_SSE2)
#include "x86/dnn_x86.h"
@@ -200,6 +200,11 @@ void compute_linear_c(const LinearLayer *linear, float *out, const float *in);
#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_c(linear, out, in))
#endif
+#ifndef OVERRIDE_COMPUTE_ACTIVATION
+#define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_c(output, input, N, activation))
+#endif
+
+
#if defined(__x86_64__) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
#if defined(_MSC_VER)
#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
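The nnet.h hunk follows the same override convention used for compute_linear(): dnn/x86/dnn_x86.h is included first and may claim the symbol by defining OVERRIDE_COMPUTE_ACTIVATION; if nothing claims it, the fallback macro discards arch and calls the portable compute_activation_c(). A compilable toy sketch of that convention, with made-up names:

/* Toy sketch of the OVERRIDE_* convention (made-up names): an arch
 * header defines OVERRIDE_FOO plus its own foo() macro; otherwise the
 * generic fallback below binds foo() to the portable C function and
 * silently drops the arch argument. */
#include <stdio.h>

static void foo_c(int x) { printf("generic foo(%d)\n", x); }

/* An x86 header compiled before this point might do:
 *   #define OVERRIDE_FOO
 *   #define foo(x, arch) foo_avx2(x)
 */
#ifndef OVERRIDE_FOO
#define foo(x, arch) ((void)(arch), foo_c(x))
#endif

int main(void) { foo(7, /*arch=*/3); return 0; }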
diff --git a/dnn/nnet_arch.h b/dnn/nnet_arch.h
index 00198579..4d577f8d 100644
--- a/dnn/nnet_arch.h
+++ b/dnn/nnet_arch.h
@@ -38,6 +38,61 @@
#define RTCD_SUF(name) CAT_SUFFIX(name, RTCD_ARCH)
+
+#define MAX_ACTIVATIONS (4096)
+
+static OPUS_INLINE void vec_swish(float *y, const float *x, int N)
+{
+ int i;
+ float tmp[MAX_ACTIVATIONS];
+ celt_assert(N <= MAX_ACTIVATIONS);
+ vec_sigmoid(tmp, x, N);
+ for (i=0;i<N;i++)
+ y[i] = x[i]*tmp[i];
+}
+
+static OPUS_INLINE float relu(float x)
+{
+ return x < 0 ? 0 : x;
+}
+
+void RTCD_SUF(compute_activation_)(float *output, const float *input, int N, int activation)
+{
+ int i;
+ if (activation == ACTIVATION_SIGMOID) {
+ vec_sigmoid(output, input, N);
+ } else if (activation == ACTIVATION_TANH) {
+ vec_tanh(output, input, N);
+ } else if (activation == ACTIVATION_SWISH) {
+ vec_swish(output, input, N);
+ } else if (activation == ACTIVATION_RELU) {
+ for (i=0;i<N;i++)
+ output[i] = relu(input[i]);
+ } else if (activation == ACTIVATION_SOFTMAX) {
+#ifdef SOFTMAX_HACK
+ OPUS_COPY(output, input, N);
+ /*for (i=0;i<N;i++)
+ output[i] = input[i];*/
+#else
+ float sum = 0;
+ softmax(output, input, N);
+ for (i=0;i<N;i++) {
+ sum += output[i];
+ }
+ sum = 1.f/(sum+1e-30);
+ for (i=0;i<N;i++)
+ output[i] = sum*output[i];
+#endif
+ } else {
+ celt_assert(activation == ACTIVATION_LINEAR);
+ if (input != output) {
+ for (i=0;i<N;i++)
+ output[i] = input[i];
+ }
+ }
+}
+
+
void RTCD_SUF(compute_linear_) (const LinearLayer *linear, float *out, const float *in)
{
int i, M, N;
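dnn/nnet_arch.h acts as a template: it is included from one wrapper file per SIMD flavour, each of which sets RTCD_ARCH before the include, so RTCD_SUF(compute_activation_) expands to compute_activation_c, compute_activation_sse2, compute_activation_sse4_1 or compute_activation_avx2 from the same source. A minimal sketch of the token pasting involved (the CAT_SUFFIX definition shown here is the usual two-step expansion, assumed rather than taken from this diff):

/* Minimal token-pasting sketch; RTCD_ARCH and the function body are
 * placeholders. CAT_SUFFIX is written in two steps so the macro
 * argument is expanded before it is pasted. */
#include <stdio.h>

#define CAT_SUFFIX2(a, b) a ## b
#define CAT_SUFFIX(a, b)  CAT_SUFFIX2(a, b)

#define RTCD_ARCH sse2                    /* set by the per-arch wrapper */
#define RTCD_SUF(name) CAT_SUFFIX(name, RTCD_ARCH)

void RTCD_SUF(compute_activation_)(float *out, const float *in, int n)
{   /* emitted as compute_activation_sse2() in this translation unit */
    int i;
    for (i = 0; i < n; i++) out[i] = in[i];
}

int main(void)
{
    float a[2] = { 1.f, 2.f }, b[2];
    compute_activation_sse2(b, a, 2);
    printf("%g %g\n", b[0], b[1]);
    return 0;
}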
diff --git a/dnn/pitchdnn.c b/dnn/pitchdnn.c
index 1ca15dc6..ae95ca32 100644
--- a/dnn/pitchdnn.c
+++ b/dnn/pitchdnn.c
@@ -33,8 +33,8 @@ float compute_pitchdnn(
compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH, arch);
/* xcorr*/
OPUS_COPY(&conv1_tmp1[1], xcorr_features, NB_XCORR_FEATURES);
- compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH);
- compute_conv2d(&model->conv2d_2, downsampler_in, st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, NB_XCORR_FEATURES, ACTIVATION_TANH);
+ compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH, arch);
+ compute_conv2d(&model->conv2d_2, downsampler_in, st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, NB_XCORR_FEATURES, ACTIVATION_TANH, arch);
compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH, arch);
compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out, arch);
diff --git a/dnn/x86/dnn_x86.h b/dnn/x86/dnn_x86.h
index c0f2ffae..94f95ce8 100644
--- a/dnn/x86/dnn_x86.h
+++ b/dnn/x86/dnn_x86.h
@@ -33,14 +33,17 @@
#if defined(OPUS_X86_MAY_HAVE_SSE2)
void compute_linear_sse2(const LinearLayer *linear, float *out, const float *in);
+void compute_activation_sse2(float *output, const float *input, int N, int activation);
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
void compute_linear_sse4_1(const LinearLayer *linear, float *out, const float *in);
+void compute_activation_sse4_1(float *output, const float *input, int N, int activation);
#endif
#if defined(OPUS_X86_MAY_HAVE_AVX2)
void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in);
+void compute_activation_avx2(float *output, const float *input, int N, int activation);
#endif
@@ -48,16 +51,22 @@ void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in)
#define OVERRIDE_COMPUTE_LINEAR
#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_avx2(linear, out, in))
+#define OVERRIDE_COMPUTE_ACTIVATION
+#define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_avx2(output, input, N, activation))
#elif defined(OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
#define OVERRIDE_COMPUTE_LINEAR
#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse4_1(linear, out, in))
+#define OVERRIDE_COMPUTE_ACTIVATION
+#define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_sse4_1(output, input, N, activation))
#elif defined(OPUS_X86_PRESUME_SSE2) && !defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
#define OVERRIDE_COMPUTE_LINEAR
#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse2(linear, out, in))
+#define OVERRIDE_COMPUTE_ACTIVATION
+#define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_sse2(output, input, N, activation))
#elif defined(OPUS_HAVE_RTCD) && (defined(OPUS_X86_MAY_HAVE_AVX2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2))
@@ -66,11 +75,22 @@ extern void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
float *out,
const float *in
);
-
#define OVERRIDE_COMPUTE_LINEAR
#define compute_linear(linear, out, in, arch) \
((*DNN_COMPUTE_LINEAR_IMPL[(arch) & OPUS_ARCHMASK])(linear, out, in))
+
+extern void (*const DNN_COMPUTE_ACTIVATION_IMPL[OPUS_ARCHMASK + 1])(
+ float *output,
+ const float *input,
+ int N,
+ int activation
+ );
+#define OVERRIDE_COMPUTE_ACTIVATION
+#define compute_activation(output, input, N, activation, arch) \
+ ((*DNN_COMPUTE_ACTIVATION_IMPL[(arch) & OPUS_ARCHMASK])(output, input, N, activation))
+
+
#endif
diff --git a/dnn/x86/x86_dnn_map.c b/dnn/x86/x86_dnn_map.c
index 35e061ff..f39ae372 100644
--- a/dnn/x86/x86_dnn_map.c
+++ b/dnn/x86/x86_dnn_map.c
@@ -48,6 +48,19 @@ void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
MAY_HAVE_AVX2(compute_linear) /* avx */
};
+void (*const DNN_COMPUTE_ACTIVATION_IMPL[OPUS_ARCHMASK + 1])(
+ float *output,
+ const float *input,
+ int N,
+ int activation
+) = {
+ compute_activation_c, /* non-sse */
+ compute_activation_c,
+ MAY_HAVE_SSE2(compute_activation),
+ MAY_HAVE_SSE4_1(compute_activation), /* sse4.1 */
+ MAY_HAVE_AVX2(compute_activation) /* avx */
+};
+
#endif
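The new DNN_COMPUTE_ACTIVATION_IMPL table mirrors DNN_COMPUTE_LINEAR_IMPL: one slot per value of (arch & OPUS_ARCHMASK), with the lowest levels pinned to the portable compute_activation_c. The MAY_HAVE_SSE2/SSE4_1/AVX2 helpers come from the existing celt x86 headers; their typical shape (assumed here, not part of this diff) is to paste either the SIMD suffix or _c depending on what the build allows, so every slot names a function that actually exists:

/* Assumed typical shape of a MAY_HAVE_*() helper (the real definitions
 * live in the celt x86 headers, outside this diff). */
#include <stdio.h>

void compute_activation_c(void)    { puts("C fallback"); }
void compute_activation_sse2(void) { puts("SSE2 kernel"); }

#if defined(OPUS_X86_MAY_HAVE_SSE2)
# define MAY_HAVE_SSE2(name) name ## _sse2
#else
# define MAY_HAVE_SSE2(name) name ## _c
#endif

/* Every table slot resolves to a real symbol either way. */
static void (*const IMPL[1])(void) = { MAY_HAVE_SSE2(compute_activation) };

int main(void) { IMPL[0](); return 0; }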
diff --git a/silk/dred_encoder.c b/silk/dred_encoder.c
index 64ff2c7c..f791115a 100644
--- a/silk/dred_encoder.c
+++ b/silk/dred_encoder.c
@@ -223,7 +223,7 @@ void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int ex
}
}
-static void dred_encode_latents(ec_enc *enc, const float *x, const opus_uint8 *scale, const opus_uint8 *dzone, const opus_uint8 *r, const opus_uint8 *p0, int dim) {
+static void dred_encode_latents(ec_enc *enc, const float *x, const opus_uint8 *scale, const opus_uint8 *dzone, const opus_uint8 *r, const opus_uint8 *p0, int dim, int arch) {
int i;
int q[IMAX(DRED_LATENT_DIM,DRED_STATE_DIM)];
float xq[IMAX(DRED_LATENT_DIM,DRED_STATE_DIM)];
@@ -237,7 +237,7 @@ static void dred_encode_latents(ec_enc *enc, const float *x, const opus_uint8 *s
xq[i] = x[i]*scale[i]*(1.f/256.f);
deadzone[i] = xq[i]/(delta[i]+eps);
}
- compute_activation(deadzone, deadzone, dim, ACTIVATION_TANH);
+ compute_activation(deadzone, deadzone, dim, ACTIVATION_TANH, arch);
for (i=0;i<dim;i++) {
xq[i] = xq[i] - delta[i]*deadzone[i];
q[i] = (int)floor(.5f+xq[i]);
@@ -249,7 +249,7 @@ static void dred_encode_latents(ec_enc *enc, const float *x, const opus_uint8 *s
}
}
-int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes) {
+int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes, int arch) {
ec_enc ec_encoder;
int q_level;
@@ -275,7 +275,8 @@ int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunk
dred_state_dead_zone_q8 + state_qoffset,
dred_state_r_q8 + state_qoffset,
dred_state_p0_q8 + state_qoffset,
- DRED_STATE_DIM);
+ DRED_STATE_DIM,
+ arch);
if (ec_tell(&ec_encoder) > 8*max_bytes) {
return 0;
}
@@ -294,7 +295,8 @@ int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunk
dred_latent_dead_zone_q8 + offset,
dred_latent_r_q8 + offset,
dred_latent_p0_q8 + offset,
- DRED_LATENT_DIM
+ DRED_LATENT_DIM,
+ arch
);
if (ec_tell(&ec_encoder) > 8*max_bytes) {
ec_encoder = ec_bak;
diff --git a/silk/dred_encoder.h b/silk/dred_encoder.h
index d1d2376d..795bee4f 100644
--- a/silk/dred_encoder.h
+++ b/silk/dred_encoder.h
@@ -66,6 +66,6 @@ void dred_deinit_encoder(DREDEnc *enc);
void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch);
-int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes);
+int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes, int arch);
#endif
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 28da18af..69197494 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -2272,7 +2272,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
buf[0] = 'D';
buf[1] = DRED_EXPERIMENTAL_VERSION;
#endif
- dred_bytes = dred_encode_silk_frame(&st->dred_encoder, buf+DRED_EXPERIMENTAL_BYTES, dred_chunks, dred_bytes_left-DRED_EXPERIMENTAL_BYTES);
+ dred_bytes = dred_encode_silk_frame(&st->dred_encoder, buf+DRED_EXPERIMENTAL_BYTES, dred_chunks, dred_bytes_left-DRED_EXPERIMENTAL_BYTES, st->arch);
if (dred_bytes > 0) {
dred_bytes += DRED_EXPERIMENTAL_BYTES;
celt_assert(dred_bytes <= dred_bytes_left);
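Finally, the arch value has to reach the DRED encoder: opus_encode_native() already knows st->arch, so dred_encode_silk_frame() and dred_encode_latents() each gain an int arch parameter purely to pass it along, and the only real consumer is the ACTIVATION_TANH call on the dead-zone vector. The resulting call path (simplified) is:

    opus_encode_native(st, ...)                          /* st->arch */
      -> dred_encode_silk_frame(..., st->arch)
        -> dred_encode_latents(..., arch)
          -> compute_activation(deadzone, deadzone, dim, ACTIVATION_TANH, arch)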