Adding RTCD for compute_conv2d()

author: Jean-Marc Valin <jmvalin@amazon.com> 2023-11-17 22:14:03 +0300
committer: Jean-Marc Valin <jmvalin@amazon.com> 2023-11-17 22:20:09 +0300
commit: a93b09e2417ed191d87788d0dbf8b09d053fd59f (patch)
tree: 17069d15b38863680a22aa84a5f590e68b9533e8
parent: 91d1f7539ec1a12f6dabc366cec70faac5288b34 (diff)
5 files changed, 132 insertions, 91 deletions
diff --git a/dnn/nnet.c b/dnn/nnet.c
index a82c04ab..e794e450 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -212,91 +212,3 @@ void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, fl
      OPUS_COPY(&mem[input_size*dilation*(ksize-1)-input_size], input, input_size);
    }
 }
-
-
-/* Computes non-padded convolution for input [ ksize1 x in_channels x (len2+ksize2) ],
-   kernel [ out_channels x in_channels x ksize1 x ksize2 ],
-   storing the output as [ out_channels x len2 ].
-   We assume that the output dimension along the ksize1 axis is 1,
-   i.e. processing one frame at a time. */
-static void conv2d_float(float *out, const float *weights, int in_channels, int out_channels, int ktime, int kheight, const float *in, int height, int hstride)
-{
-   int i;
-   int in_stride;
-   in_stride = height+kheight-1;
-   for (i=0;i<out_channels;i++) {
-      int m;
-      OPUS_CLEAR(&out[i*hstride], height);
-      for (m=0;m<in_channels;m++) {
-         int t;
-         for (t=0;t<ktime;t++) {
-            int h;
-            for (h=0;h<kheight;h++) {
-               int j;
-               for (j=0;j<height;j++) {
-                  out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + t*kheight + h] *
-                                     in[t*in_channels*in_stride + m*in_stride + j + h];
-               }
-            }
-         }
-      }
-   }
-}
-
-static void conv2d_3x3_float(float *out, const float *weights, int in_channels, int out_channels, const float *in, int height, int hstride)
-{
-   int i;
-   int in_stride;
-   int kheight, ktime;
-   kheight = ktime = 3;
-   in_stride = height+kheight-1;
-   for (i=0;i<out_channels;i++) {
-      int m;
-      OPUS_CLEAR(&out[i*hstride], height);
-      for (m=0;m<in_channels;m++) {
-         int j;
-         for (j=0;j<height;j++) {
-            /* Unrolled version of previous function -- compiler will figure out the indexing simplifications. */
-            out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 0]*in[0*in_channels*in_stride + m*in_stride + j + 0]
-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 1]*in[0*in_channels*in_stride + m*in_stride + j + 1]
-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 2]*in[0*in_channels*in_stride + m*in_stride + j + 2]
-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 0]*in[1*in_channels*in_stride + m*in_stride + j + 0]
-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 1]*in[1*in_channels*in_stride + m*in_stride + j + 1]
-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 2]*in[1*in_channels*in_stride + m*in_stride + j + 2]
-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 0]*in[2*in_channels*in_stride + m*in_stride + j + 0]
-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 1]*in[2*in_channels*in_stride + m*in_stride + j + 1]
-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 2]*in[2*in_channels*in_stride + m*in_stride + j + 2];
-               }
-      }
-   }
-}
-
-#define MAX_CONV2D_INPUTS 8192
-
-void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation, int arch)
-{
-   int i;
-   const float *bias;
-   float in_buf[MAX_CONV2D_INPUTS];
-   int time_stride;
-   celt_assert(in != out);
-   time_stride = conv->in_channels*(height+conv->kheight-1);
-   celt_assert(conv->ktime*time_stride <= MAX_CONV2D_INPUTS);
-   OPUS_COPY(in_buf, mem, (conv->ktime-1)*time_stride);
-   OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride);
-   OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride);
-   bias = conv->bias;
-   if (conv->kheight == 3 && conv->ktime == 3)
-     conv2d_3x3_float(out, conv->float_weights, conv->in_channels, conv->out_channels, in_buf, height, hstride);
-   else
-     conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride);
-   if (bias != NULL) {
-     for (i=0;i<conv->out_channels;i++) {
-       int j;
-       for (j=0;j<height;j++) out[i*hstride+j] += bias[i];
-     }
-   }
-   for (i=0;i<conv->out_channels;i++) {
-     compute_activation(&out[i*hstride], &out[i*hstride], height, activation, arch);
-   }
-}
diff --git a/dnn/nnet.h b/dnn/nnet.h
index f891fa3e..4a42beca 100644
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -185,12 +185,11 @@ int gru_init(GRULayer *layer, const WeightArray *arrays,
   int activation,
   int reset_after);
 
-void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation, int arch);
-
-
 
 void compute_linear_c(const LinearLayer *linear, float *out, const float *in);
 void compute_activation_c(float *output, const float *input, int N, int activation);
+void compute_conv2d_c(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);
+
 
 #if defined(OPUS_X86_MAY_HAVE_SSE2)
 #include "x86/dnn_x86.h"
@@ -204,6 +203,9 @@ void compute_activation_c(float *output, const float *input, int N, int activati
 #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_c(output, input, N, activation))
 #endif
 
+#ifndef OVERRIDE_COMPUTE_CONV2D
+#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_c(conv, out, mem, in, height, hstride, activation))
+#endif
 
 #if defined(__x86_64__) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
 #if defined(_MSC_VER)
diff --git a/dnn/nnet_arch.h b/dnn/nnet_arch.h
index 4d577f8d..3c53e619 100644
--- a/dnn/nnet_arch.h
+++ b/dnn/nnet_arch.h
@@ -127,5 +127,93 @@ void RTCD_SUF(compute_linear_) (const LinearLayer *linear, float *out, const flo
    }
 }
 
+/* Computes non-padded convolution for input [ ksize1 x in_channels x (len2+ksize2-1) ],
+   kernel [ out_channels x in_channels x ksize1 x ksize2 ],
+   storing the output as [ out_channels x len2 ] (ksize1=ktime, ksize2=kheight, len2=height).
+   We assume that the output dimension along the ksize1 axis is 1,
+   i.e. processing one frame at a time. */
+static void conv2d_float(float *out, const float *weights, int in_channels, int out_channels, int ktime, int kheight, const float *in, int height, int hstride)
+{
+   int i;
+   int in_stride;
+   in_stride = height+kheight-1;
+   for (i=0;i<out_channels;i++) {
+      int m;
+      OPUS_CLEAR(&out[i*hstride], height);
+      for (m=0;m<in_channels;m++) {
+         int t;
+         for (t=0;t<ktime;t++) {
+            int h;
+            for (h=0;h<kheight;h++) {
+               int j;
+               for (j=0;j<height;j++) {
+                  out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + t*kheight + h] *
+                                     in[t*in_channels*in_stride + m*in_stride + j + h];
+               }
+            }
+         }
+      }
+   }
+}
+
+/* There are no intrinsics in this function (or the one above) because the gcc (and hopefully other compilers') auto-vectorizer is smart enough to
+   produce the right code by itself based on the compile flags. */
+static void conv2d_3x3_float(float *out, const float *weights, int in_channels, int out_channels, const float *in, int height, int hstride)
+{
+   int i;
+   int in_stride;
+   int kheight, ktime;
+   kheight = ktime = 3;
+   in_stride = height+kheight-1;
+   for (i=0;i<out_channels;i++) {
+      int m;
+      OPUS_CLEAR(&out[i*hstride], height);
+      for (m=0;m<in_channels;m++) {
+         int j;
+         for (j=0;j<height;j++) {
+            /* Unrolled version of previous function -- compiler will figure out the indexing simplifications. */
+            out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 0]*in[0*in_channels*in_stride + m*in_stride + j + 0]
+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 1]*in[0*in_channels*in_stride + m*in_stride + j + 1]
+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 2]*in[0*in_channels*in_stride + m*in_stride + j + 2]
+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 0]*in[1*in_channels*in_stride + m*in_stride + j + 0]
+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 1]*in[1*in_channels*in_stride + m*in_stride + j + 1]
+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 2]*in[1*in_channels*in_stride + m*in_stride + j + 2]
+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 0]*in[2*in_channels*in_stride + m*in_stride + j + 0]
+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 1]*in[2*in_channels*in_stride + m*in_stride + j + 1]
+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 2]*in[2*in_channels*in_stride + m*in_stride + j + 2];
+               }
+      }
+   }
+}
+
+#define MAX_CONV2D_INPUTS 8192
+
+void RTCD_SUF(compute_conv2d_)(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation)
+{
+   int i;
+   const float *bias;
+   float in_buf[MAX_CONV2D_INPUTS];
+   int time_stride;
+   celt_assert(in != out);
+   time_stride = conv->in_channels*(height+conv->kheight-1);
+   celt_assert(conv->ktime*time_stride <= MAX_CONV2D_INPUTS);
+   OPUS_COPY(in_buf, mem, (conv->ktime-1)*time_stride);
+   OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride);
+   OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride);
+   bias = conv->bias;
+   if (conv->kheight == 3 && conv->ktime == 3)
+     conv2d_3x3_float(out, conv->float_weights, conv->in_channels, conv->out_channels, in_buf, height, hstride);
+   else
+     conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride);
+   if (bias != NULL) {
+     for (i=0;i<conv->out_channels;i++) {
+       int j;
+       for (j=0;j<height;j++) out[i*hstride+j] += bias[i];
+     }
+   }
+   for (i=0;i<conv->out_channels;i++) {
+     RTCD_SUF(compute_activation_)(&out[i*hstride], &out[i*hstride], height, activation);
+   }
+}
 
 #endif
diff --git a/dnn/x86/dnn_x86.h b/dnn/x86/dnn_x86.h
index 94f95ce8..f2183327 100644
--- a/dnn/x86/dnn_x86.h
+++ b/dnn/x86/dnn_x86.h
@@ -34,16 +34,19 @@
 #if defined(OPUS_X86_MAY_HAVE_SSE2)
 void compute_linear_sse2(const LinearLayer *linear, float *out, const float *in);
 void compute_activation_sse2(float *output, const float *input, int N, int activation);
+void compute_conv2d_sse2(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);
 #endif
 
 #if defined(OPUS_X86_MAY_HAVE_SSE4_1)
 void compute_linear_sse4_1(const LinearLayer *linear, float *out, const float *in);
 void compute_activation_sse4_1(float *output, const float *input, int N, int activation);
+void compute_conv2d_sse4_1(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);
 #endif
 
 #if defined(OPUS_X86_MAY_HAVE_AVX2)
 void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in);
 void compute_activation_avx2(float *output, const float *input, int N, int activation);
+void compute_conv2d_avx2(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);
 #endif
 
 
@@ -53,6 +56,8 @@ void compute_activation_avx2(float *output, const float *input, int N, int activ
 #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_avx2(linear, out, in))
 #define OVERRIDE_COMPUTE_ACTIVATION
 #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_avx2(output, input, N, activation))
+#define OVERRIDE_COMPUTE_CONV2D
+#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_avx2(conv, out, mem, in, height, hstride, activation))
 
 #elif defined(OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
 
@@ -60,6 +65,8 @@ void compute_activation_avx2(float *output, const float *input, int N, int activ
 #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse4_1(linear, out, in))
 #define OVERRIDE_COMPUTE_ACTIVATION
 #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_sse4_1(output, input, N, activation))
+#define OVERRIDE_COMPUTE_CONV2D
+#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_sse4_1(conv, out, mem, in, height, hstride, activation))
 
 #elif defined(OPUS_X86_PRESUME_SSE2) && !defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
 
@@ -67,6 +74,8 @@ void compute_activation_avx2(float *output, const float *input, int N, int activ
 #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse2(linear, out, in))
 #define OVERRIDE_COMPUTE_ACTIVATION
 #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_sse2(output, input, N, activation))
+#define OVERRIDE_COMPUTE_CONV2D
+#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_sse2(conv, out, mem, in, height, hstride, activation))
 
 #elif defined(OPUS_HAVE_RTCD) && (defined(OPUS_X86_MAY_HAVE_AVX2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2))
 
@@ -91,6 +100,20 @@ extern void (*const DNN_COMPUTE_ACTIVATION_IMPL[OPUS_ARCHMASK + 1])(
     ((*DNN_COMPUTE_ACTIVATION_IMPL[(arch) & OPUS_ARCHMASK])(output, input, N, activation))
 
 
+extern void (*const DNN_COMPUTE_CONV2D_IMPL[OPUS_ARCHMASK + 1])(
+                    const Conv2dLayer *conv,
+                    float *out,
+                    float *mem,
+                    const float *in,
+                    int height,
+                    int hstride,
+                    int activation
+                    );
+#define OVERRIDE_COMPUTE_CONV2D
+#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) \
+    ((*DNN_COMPUTE_CONV2D_IMPL[(arch) & OPUS_ARCHMASK])(conv, out, mem, in, height, hstride, activation))
+
+
 #endif
 
 
diff --git a/dnn/x86/x86_dnn_map.c b/dnn/x86/x86_dnn_map.c
index f39ae372..d673e134 100644
--- a/dnn/x86/x86_dnn_map.c
+++ b/dnn/x86/x86_dnn_map.c
@@ -61,6 +61,22 @@ void (*const DNN_COMPUTE_ACTIVATION_IMPL[OPUS_ARCHMASK + 1])(
   MAY_HAVE_AVX2(compute_activation)  /* avx  */
 };
 
+void (*const DNN_COMPUTE_CONV2D_IMPL[OPUS_ARCHMASK + 1])(
+         const Conv2dLayer *conv,
+         float *out,
+         float *mem,
+         const float *in,
+         int height,
+         int hstride,
+         int activation
+) = {
+  compute_conv2d_c,                /* non-sse */
+  compute_conv2d_c,
+  MAY_HAVE_SSE2(compute_conv2d),
+  MAY_HAVE_SSE4_1(compute_conv2d), /* sse4.1  */
+  MAY_HAVE_AVX2(compute_conv2d)  /* avx  */
+};
+
 #endif
author	Jean-Marc Valin <jmvalin@amazon.com>	2023-11-17 22:14:03 +0300
committer	Jean-Marc Valin <jmvalin@amazon.com>	2023-11-17 22:20:09 +0300
commit	a93b09e2417ed191d87788d0dbf8b09d053fd59f (patch)
tree	17069d15b38863680a22aa84a5f590e68b9533e8
parent	91d1f7539ec1a12f6dabc366cec70faac5288b34 (diff)