diff options
author | Jean-Marc Valin <jmvalin@amazon.com> | 2023-11-17 22:14:03 +0300 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@amazon.com> | 2023-11-17 22:20:09 +0300 |
commit | a93b09e2417ed191d87788d0dbf8b09d053fd59f (patch) | |
tree | 17069d15b38863680a22aa84a5f590e68b9533e8 | |
parent | 91d1f7539ec1a12f6dabc366cec70faac5288b34 (diff) |
Adding RTCD for compute_conv2d()
-rw-r--r-- | dnn/nnet.c | 88 | ||||
-rw-r--r-- | dnn/nnet.h | 8 | ||||
-rw-r--r-- | dnn/nnet_arch.h | 88 | ||||
-rw-r--r-- | dnn/x86/dnn_x86.h | 23 | ||||
-rw-r--r-- | dnn/x86/x86_dnn_map.c | 16 |
5 files changed, 132 insertions, 91 deletions
@@ -212,91 +212,3 @@ void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, fl OPUS_COPY(&mem[input_size*dilation*(ksize-1)-input_size], input, input_size); } } - - -/* Computes non-padded convolution for input [ ksize1 x in_channels x (len2+ksize2) ], - kernel [ out_channels x in_channels x ksize1 x ksize2 ], - storing the output as [ out_channels x len2 ]. - We assume that the output dimension along the ksize1 axis is 1, - i.e. processing one frame at a time. */ -static void conv2d_float(float *out, const float *weights, int in_channels, int out_channels, int ktime, int kheight, const float *in, int height, int hstride) -{ - int i; - int in_stride; - in_stride = height+kheight-1; - for (i=0;i<out_channels;i++) { - int m; - OPUS_CLEAR(&out[i*hstride], height); - for (m=0;m<in_channels;m++) { - int t; - for (t=0;t<ktime;t++) { - int h; - for (h=0;h<kheight;h++) { - int j; - for (j=0;j<height;j++) { - out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + t*kheight + h] * - in[t*in_channels*in_stride + m*in_stride + j + h]; - } - } - } - } - } -} - -static void conv2d_3x3_float(float *out, const float *weights, int in_channels, int out_channels, const float *in, int height, int hstride) -{ - int i; - int in_stride; - int kheight, ktime; - kheight = ktime = 3; - in_stride = height+kheight-1; - for (i=0;i<out_channels;i++) { - int m; - OPUS_CLEAR(&out[i*hstride], height); - for (m=0;m<in_channels;m++) { - int j; - for (j=0;j<height;j++) { - /* Unrolled version of previous function -- compiler will figure out the indexing simplifications. */ - out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 0]*in[0*in_channels*in_stride + m*in_stride + j + 0] - + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 1]*in[0*in_channels*in_stride + m*in_stride + j + 1] - + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 2]*in[0*in_channels*in_stride + m*in_stride + j + 2] - + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 0]*in[1*in_channels*in_stride + m*in_stride + j + 0] - + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 1]*in[1*in_channels*in_stride + m*in_stride + j + 1] - + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 2]*in[1*in_channels*in_stride + m*in_stride + j + 2] - + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 0]*in[2*in_channels*in_stride + m*in_stride + j + 0] - + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 1]*in[2*in_channels*in_stride + m*in_stride + j + 1] - + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 2]*in[2*in_channels*in_stride + m*in_stride + j + 2]; - } - } - } -} - -#define MAX_CONV2D_INPUTS 8192 - -void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation, int arch) -{ - int i; - const float *bias; - float in_buf[MAX_CONV2D_INPUTS]; - int time_stride; - celt_assert(in != out); - time_stride = conv->in_channels*(height+conv->kheight-1); - celt_assert(conv->ktime*time_stride <= MAX_CONV2D_INPUTS); - OPUS_COPY(in_buf, mem, (conv->ktime-1)*time_stride); - OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride); - OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride); - bias = conv->bias; - if (conv->kheight == 3 && conv->ktime == 3) - conv2d_3x3_float(out, conv->float_weights, conv->in_channels, conv->out_channels, in_buf, height, hstride); - else - conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride); - if (bias != NULL) { - for (i=0;i<conv->out_channels;i++) { - int j; - for (j=0;j<height;j++) out[i*hstride+j] += bias[i]; - } - } - for (i=0;i<conv->out_channels;i++) { - compute_activation(&out[i*hstride], &out[i*hstride], height, activation, arch); - } -} @@ -185,12 +185,11 @@ int gru_init(GRULayer *layer, const WeightArray *arrays, int activation, int reset_after); -void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation, int arch); - - void compute_linear_c(const LinearLayer *linear, float *out, const float *in); void compute_activation_c(float *output, const float *input, int N, int activation); +void compute_conv2d_c(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation); + #if defined(OPUS_X86_MAY_HAVE_SSE2) #include "x86/dnn_x86.h" @@ -204,6 +203,9 @@ void compute_activation_c(float *output, const float *input, int N, int activati #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_c(output, input, N, activation)) #endif +#ifndef OVERRIDE_COMPUTE_CONV2D +#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_c(conv, out, mem, in, height, hstride, activation)) +#endif #if defined(__x86_64__) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2) #if defined(_MSC_VER) diff --git a/dnn/nnet_arch.h b/dnn/nnet_arch.h index 4d577f8d..3c53e619 100644 --- a/dnn/nnet_arch.h +++ b/dnn/nnet_arch.h @@ -127,5 +127,93 @@ void RTCD_SUF(compute_linear_) (const LinearLayer *linear, float *out, const flo } } +/* Computes non-padded convolution for input [ ksize1 x in_channels x (len2+ksize2) ], + kernel [ out_channels x in_channels x ksize1 x ksize2 ], + storing the output as [ out_channels x len2 ]. + We assume that the output dimension along the ksize1 axis is 1, + i.e. processing one frame at a time. */ +static void conv2d_float(float *out, const float *weights, int in_channels, int out_channels, int ktime, int kheight, const float *in, int height, int hstride) +{ + int i; + int in_stride; + in_stride = height+kheight-1; + for (i=0;i<out_channels;i++) { + int m; + OPUS_CLEAR(&out[i*hstride], height); + for (m=0;m<in_channels;m++) { + int t; + for (t=0;t<ktime;t++) { + int h; + for (h=0;h<kheight;h++) { + int j; + for (j=0;j<height;j++) { + out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + t*kheight + h] * + in[t*in_channels*in_stride + m*in_stride + j + h]; + } + } + } + } + } +} + +/* There's no intrinsics in this function (or the one above) because the gcc (and hopefully other compiler) auto-vectorizer is smart enough to + produce the right code by itself based on the compile flags. */ +static void conv2d_3x3_float(float *out, const float *weights, int in_channels, int out_channels, const float *in, int height, int hstride) +{ + int i; + int in_stride; + int kheight, ktime; + kheight = ktime = 3; + in_stride = height+kheight-1; + for (i=0;i<out_channels;i++) { + int m; + OPUS_CLEAR(&out[i*hstride], height); + for (m=0;m<in_channels;m++) { + int j; + for (j=0;j<height;j++) { + /* Unrolled version of previous function -- compiler will figure out the indexing simplifications. */ + out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 0]*in[0*in_channels*in_stride + m*in_stride + j + 0] + + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 1]*in[0*in_channels*in_stride + m*in_stride + j + 1] + + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 2]*in[0*in_channels*in_stride + m*in_stride + j + 2] + + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 0]*in[1*in_channels*in_stride + m*in_stride + j + 0] + + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 1]*in[1*in_channels*in_stride + m*in_stride + j + 1] + + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 2]*in[1*in_channels*in_stride + m*in_stride + j + 2] + + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 0]*in[2*in_channels*in_stride + m*in_stride + j + 0] + + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 1]*in[2*in_channels*in_stride + m*in_stride + j + 1] + + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 2]*in[2*in_channels*in_stride + m*in_stride + j + 2]; + } + } + } +} + +#define MAX_CONV2D_INPUTS 8192 + +void RTCD_SUF(compute_conv2d_)(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation) +{ + int i; + const float *bias; + float in_buf[MAX_CONV2D_INPUTS]; + int time_stride; + celt_assert(in != out); + time_stride = conv->in_channels*(height+conv->kheight-1); + celt_assert(conv->ktime*time_stride <= MAX_CONV2D_INPUTS); + OPUS_COPY(in_buf, mem, (conv->ktime-1)*time_stride); + OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride); + OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride); + bias = conv->bias; + if (conv->kheight == 3 && conv->ktime == 3) + conv2d_3x3_float(out, conv->float_weights, conv->in_channels, conv->out_channels, in_buf, height, hstride); + else + conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride); + if (bias != NULL) { + for (i=0;i<conv->out_channels;i++) { + int j; + for (j=0;j<height;j++) out[i*hstride+j] += bias[i]; + } + } + for (i=0;i<conv->out_channels;i++) { + RTCD_SUF(compute_activation_)(&out[i*hstride], &out[i*hstride], height, activation); + } +} #endif diff --git a/dnn/x86/dnn_x86.h b/dnn/x86/dnn_x86.h index 94f95ce8..f2183327 100644 --- a/dnn/x86/dnn_x86.h +++ b/dnn/x86/dnn_x86.h @@ -34,16 +34,19 @@ #if defined(OPUS_X86_MAY_HAVE_SSE2) void compute_linear_sse2(const LinearLayer *linear, float *out, const float *in); void compute_activation_sse2(float *output, const float *input, int N, int activation); +void compute_conv2d_sse2(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation); #endif #if defined(OPUS_X86_MAY_HAVE_SSE4_1) void compute_linear_sse4_1(const LinearLayer *linear, float *out, const float *in); void compute_activation_sse4_1(float *output, const float *input, int N, int activation); +void compute_conv2d_sse4_1(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation); #endif #if defined(OPUS_X86_MAY_HAVE_AVX2) void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in); void compute_activation_avx2(float *output, const float *input, int N, int activation); +void compute_conv2d_avx2(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation); #endif @@ -53,6 +56,8 @@ void compute_activation_avx2(float *output, const float *input, int N, int activ #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_avx2(linear, out, in)) #define OVERRIDE_COMPUTE_ACTIVATION #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_avx2(output, input, N, activation)) +#define OVERRIDE_COMPUTE_CONV2D +#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_avx2(conv, out, mem, in, height, hstride, activation)) #elif defined(OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2) @@ -60,6 +65,8 @@ void compute_activation_avx2(float *output, const float *input, int N, int activ #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse4_1(linear, out, in)) #define OVERRIDE_COMPUTE_ACTIVATION #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_sse4_1(output, input, N, activation)) +#define OVERRIDE_COMPUTE_CONV2D +#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_sse4_1(conv, out, mem, in, height, hstride, activation)) #elif defined(OPUS_X86_PRESUME_SSE2) && !defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) @@ -67,6 +74,8 @@ void compute_activation_avx2(float *output, const float *input, int N, int activ #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse2(linear, out, in)) #define OVERRIDE_COMPUTE_ACTIVATION #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_sse2(output, input, N, activation)) +#define OVERRIDE_COMPUTE_CONV2D +#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_sse2(conv, out, mem, in, height, hstride, activation)) #elif defined(OPUS_HAVE_RTCD) && (defined(OPUS_X86_MAY_HAVE_AVX2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) @@ -91,6 +100,20 @@ extern void (*const DNN_COMPUTE_ACTIVATION_IMPL[OPUS_ARCHMASK + 1])( ((*DNN_COMPUTE_ACTIVATION_IMPL[(arch) & OPUS_ARCHMASK])(output, input, N, activation)) +extern void (*const DNN_COMPUTE_CONV2D_IMPL[OPUS_ARCHMASK + 1])( + const Conv2dLayer *conv, + float *out, + float *mem, + const float *in, + int height, + int hstride, + int activation + ); +#define OVERRIDE_COMPUTE_CONV2D +#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) \ + ((*DNN_COMPUTE_CONV2D_IMPL[(arch) & OPUS_ARCHMASK])(conv, out, mem, in, height, hstride, activation)) + + #endif diff --git a/dnn/x86/x86_dnn_map.c b/dnn/x86/x86_dnn_map.c index f39ae372..d673e134 100644 --- a/dnn/x86/x86_dnn_map.c +++ b/dnn/x86/x86_dnn_map.c @@ -61,6 +61,22 @@ void (*const DNN_COMPUTE_ACTIVATION_IMPL[OPUS_ARCHMASK + 1])( MAY_HAVE_AVX2(compute_activation) /* avx */ }; +void (*const DNN_COMPUTE_CONV2D_IMPL[OPUS_ARCHMASK + 1])( + const Conv2dLayer *conv, + float *out, + float *mem, + const float *in, + int height, + int hstride, + int activation +) = { + compute_conv2d_c, /* non-sse */ + compute_conv2d_c, + MAY_HAVE_SSE2(compute_conv2d), + MAY_HAVE_SSE4_1(compute_conv2d), /* sse4.1 */ + MAY_HAVE_AVX2(compute_conv2d) /* avx */ +}; + #endif |