Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.xiph.org/xiph/opus.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jean-Marc Valin <jmvalin@amazon.com>, 2023-10-20 08:33:49 +0300
committer: Jean-Marc Valin <jmvalin@amazon.com>, 2023-10-20 08:33:49 +0300
commit: f512c9206beef41367dff3e0c81fffc374b35efc (patch)
tree: 06dd6245a6825f3bae15aedf0ff7b212bafe752c
parent: d720955d617768aea7271076f3993e6263e8b84f (diff)
Unroll the 3x3 convolution case
Gets us about 2x speedup on x86
-rw-r--r-- dnn/nnet.c 33
1 file changed, 32 insertions, 1 deletion
diff --git a/dnn/nnet.c b/dnn/nnet.c
index 97ac74f3..179e7063 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -394,6 +394,34 @@ void conv2d_float(float *out, const float *weights, int in_channels, int out_cha
}
}
+void conv2d_3x3_float(float *out, const float *weights, int in_channels, int out_channels, const float *in, int height, int hstride)
+{
+ int i;
+ int in_stride;
+ int kheight, ktime;
+ kheight = ktime = 3;
+ in_stride = height+kheight-1;
+ for (i=0;i<out_channels;i++) {
+ int m;
+ OPUS_CLEAR(&out[i*hstride], height);
+ for (m=0;m<in_channels;m++) {
+ int j;
+ for (j=0;j<height;j++) {
+ /* Unrolled version of previous function -- compiler will figure out the indexing simplifications. */
+ out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 0]*in[0*in_channels*in_stride + m*in_stride + j + 0]
+ + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 1]*in[0*in_channels*in_stride + m*in_stride + j + 1]
+ + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 2]*in[0*in_channels*in_stride + m*in_stride + j + 2]
+ + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 0]*in[1*in_channels*in_stride + m*in_stride + j + 0]
+ + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 1]*in[1*in_channels*in_stride + m*in_stride + j + 1]
+ + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 2]*in[1*in_channels*in_stride + m*in_stride + j + 2]
+ + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 0]*in[2*in_channels*in_stride + m*in_stride + j + 0]
+ + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 1]*in[2*in_channels*in_stride + m*in_stride + j + 1]
+ + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 2]*in[2*in_channels*in_stride + m*in_stride + j + 2];
+ }
+ }
+ }
+}
+
#define MAX_CONV2D_INPUTS 8192
void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation)
@@ -409,7 +437,10 @@ void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float
OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride);
OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride);
bias = conv->bias;
- conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride);
+ if (conv->kheight == 3 && conv->ktime == 3)
+ conv2d_3x3_float(out, conv->float_weights, conv->in_channels, conv->out_channels, in_buf, height, hstride);
+ else
+ conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride);
if (bias != NULL) {
for (i=0;i<conv->out_channels;i++) {
int j;