diff options
Diffstat (limited to 'src/FbgemmI8DepthwiseAvx2.cc')
-rw-r--r-- | src/FbgemmI8DepthwiseAvx2.cc | 80 |
1 file changed, 51 insertions(+), 29 deletions(-)
diff --git a/src/FbgemmI8DepthwiseAvx2.cc b/src/FbgemmI8DepthwiseAvx2.cc index ee39faf..f96d1d2 100644 --- a/src/FbgemmI8DepthwiseAvx2.cc +++ b/src/FbgemmI8DepthwiseAvx2.cc @@ -5,6 +5,7 @@ * LICENSE file in the root directory of this source tree. */ #include "fbgemm/FbgemmI8DepthwiseAvx2.h" +#include "fbgemm/Utils.h" #include <algorithm> // for min and max #include <cassert> @@ -36,7 +37,8 @@ PackedDepthWiseConvMatrix<KERNEL_PROD>::PackedDepthWiseConvMatrix( const int8_t* smat) : K_(K) { // Transpose the input matrix to make packing faster. - alignas(64) int8_t smat_transposed[K * KERNEL_PROD]; + int8_t* smat_transposed = static_cast<int8_t *>(ALIGNED_MALLOC( + K * KERNEL_PROD * sizeof(int8_t), 64)); for (int i = 0; i < KERNEL_PROD; ++i) { for (int j = 0; j < K; ++j) { smat_transposed[i * K + j] = smat[i + j * KERNEL_PROD]; @@ -45,12 +47,15 @@ PackedDepthWiseConvMatrix<KERNEL_PROD>::PackedDepthWiseConvMatrix( // Allocate packed arrays constexpr int KERNEL_PROD_ALIGNED = (KERNEL_PROD + 1) / 2 * 2; - // pmat_ = static_cast<int8_t *>(fbgemmAlignedAlloc( - // 64, ((K + 31) / 32) * KERNEL_PROD_ALIGNED * 32 * sizeof(int8_t))); +#ifdef _MSC_VER + pmat_ = static_cast<int8_t *>(_aligned_malloc( + ((K + 31) / 32) * KERNEL_PROD_ALIGNED * 32 * sizeof(int8_t), 64)); +#else posix_memalign( (void**)&pmat_, 64, ((K + 31) / 32) * KERNEL_PROD_ALIGNED * 32 * sizeof(int8_t)); +#endif // Pack input matrix // The layout is optimized to use vpmaddubsw efficiently (see @@ -160,11 +165,17 @@ PackedDepthWiseConvMatrix<KERNEL_PROD>::PackedDepthWiseConvMatrix( b_interleaved_epi32[i]); } } + + FREE(smat_transposed); } template <int KERNEL_PROD> PackedDepthWiseConvMatrix<KERNEL_PROD>::~PackedDepthWiseConvMatrix() { +#ifdef _MSC_VER + _aligned_free(pmat_); +#else free(pmat_); +#endif } template class PackedDepthWiseConvMatrix<3 * 3>; @@ -179,7 +190,7 @@ template class PackedDepthWiseConvMatrix<3 * 3 * 3>; // c2_v: c[8:12], c[24:28] // c3_v: c[12:16], c[28:32] template <bool SUM_A = false> 
-static inline __attribute__((always_inline)) void madd_epi16x4_packed( +static inline ALWAYS_INLINE void madd_epi16x4_packed( __m256i a0_v, __m256i a1_v, __m256i a2_v, @@ -238,7 +249,7 @@ static inline __attribute__((always_inline)) void madd_epi16x4_packed( // c2_v: c[8:12], c[24:28] // c3_v: c[12:16], c[28:32] template <bool SUM_A = false> -static inline __attribute__((always_inline)) void madd_epi16x3_packed( +static inline ALWAYS_INLINE void madd_epi16x3_packed( __m256i a0_v, __m256i a1_v, __m256i a2_v, @@ -298,7 +309,7 @@ static inline __attribute__((always_inline)) void madd_epi16x3_packed( // c2_v: c[16:20], c[20:24] // c3_v: c[24:28], c[28:32] template <bool SUM_A = false> -static inline __attribute__((always_inline)) void madd_epi16x2_packed( +static inline ALWAYS_INLINE void madd_epi16x2_packed( __m256i a0_v, __m256i a1_v, const __m256i* b, @@ -339,7 +350,7 @@ static inline __attribute__((always_inline)) void madd_epi16x2_packed( // c2_v: c[16:20], c[20:24] // c3_v: c[24:28], c[28:32] template <bool SUM_A = false> -static inline __attribute__((always_inline)) void madd_epi16_packed( +static inline ALWAYS_INLINE void madd_epi16_packed( __m256i a_v, const __m256i* b, __m256i* c0_v, @@ -374,7 +385,7 @@ static inline __attribute__((always_inline)) void madd_epi16_packed( // K is the number of accumulations we're doing template <int K, bool SUM_A = false, bool REMAINDER = false, bool ACC = false> -static inline __attribute__((always_inline)) void inner_prod_packed_( +static inline ALWAYS_INLINE void inner_prod_packed_( const __m256i* a_v, const __m256i* Bp, int32_t* C, @@ -514,7 +525,7 @@ static inline __attribute__((always_inline)) void inner_prod_packed_( } template <bool SUM_A = false, bool REMAINDER = false> -static inline __attribute__((always_inline)) void inner_prod_3x3_packed_( +static inline ALWAYS_INLINE void inner_prod_3x3_packed_( const __m256i* a_v, const __m256i* Bp, int32_t* C, @@ -531,7 +542,7 @@ template < bool PER_CHANNEL_QUANTIZATION, bool 
A_SYMMETRIC, bool B_SYMMETRIC> -static inline __attribute__((always_inline)) void requantize_( +static inline ALWAYS_INLINE void requantize_( int32_t A_zero_point, const float* C_multiplier, int32_t C_zero_point, @@ -745,7 +756,7 @@ static inline __attribute__((always_inline)) void requantize_( } template <bool REMAINDER> -static inline __attribute__((always_inline)) __m256i load_a( +static inline ALWAYS_INLINE __m256i load_a( const uint8_t* A, __m256i mask_v) { if (REMAINDER) { @@ -759,7 +770,7 @@ template < bool SUM_A, bool REMAINDER = false, bool PER_CHANNEL_QUANTIZATION = false> -static inline __attribute__((always_inline)) void inner_prod_3x3_packed_( +static inline ALWAYS_INLINE void inner_prod_3x3_packed_( int H, int W, int K, @@ -870,7 +881,7 @@ template < bool SUM_A, bool REMAINDER = false, bool PER_CHANNEL_QUANTIZATION = false> -static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_( +static inline ALWAYS_INLINE void inner_prod_3x3x3_packed_( int T, int H, int W, @@ -1118,7 +1129,7 @@ static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_( } template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC> -static inline __attribute__((always_inline)) void depthwise_3x3_kernel_( +static inline ALWAYS_INLINE void depthwise_3x3_kernel_( int H, int W, int K, @@ -1194,7 +1205,7 @@ static inline __attribute__((always_inline)) void depthwise_3x3_kernel_( } template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC> -static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_( +static inline ALWAYS_INLINE void depthwise_3x3x3_kernel_( int T, int H, int W, @@ -1279,7 +1290,7 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_( } template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC> -static inline __attribute__((always_inline)) void +static inline ALWAYS_INLINE void depthwise_3x3_per_channel_quantization_kernel_( int H, int W, @@ -1362,7 +1373,7 @@ 
depthwise_3x3_per_channel_quantization_kernel_( } template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC> -static inline __attribute__((always_inline)) void +static inline ALWAYS_INLINE void depthwise_3x3x3_per_channel_quantization_kernel_( int T, int H, @@ -1465,7 +1476,7 @@ static pair<int, int> closest_factors_(int n) { // filter shapes by parameterizing with R and S but restricting it to just 3x3 // for now. template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC> -static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( +static inline ALWAYS_INLINE void depthwise_3x3_pad_1_( int N, int H, int W, @@ -1491,7 +1502,7 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* row_offsets = static_cast<int32_t *>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); int n_begin, n_end; int h_begin, h_end, w_begin, w_end; @@ -1748,10 +1759,11 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( } } } // for each n + FREE(row_offsets); }; template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC> -static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( +static inline ALWAYS_INLINE void depthwise_3x3x3_pad_1_( int N, int T, int H, @@ -1781,7 +1793,7 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* row_offsets = static_cast<int32_t*>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); // __attribute__((aligned(64))); int n_begin, n_end; int t_begin, t_end, h_begin, h_end; @@ -1858,10 +1870,12 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( } // h } // t } // 
for each n + + FREE(row_offsets); }; template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC> -static inline __attribute__((always_inline)) void +static inline ALWAYS_INLINE void depthwise_3x3_per_channel_quantization_pad_1_( int N, int H, @@ -1888,7 +1902,7 @@ depthwise_3x3_per_channel_quantization_pad_1_( int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* row_offsets = static_cast<int32_t*>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); // __attribute__((aligned(64))); int n_begin, n_end; int h_begin, h_end, w_begin, w_end; @@ -2172,10 +2186,12 @@ depthwise_3x3_per_channel_quantization_pad_1_( } } } // for each n + + FREE(row_offsets); }; template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC> -static inline __attribute__((always_inline)) void +static inline ALWAYS_INLINE void depthwise_3x3x3_per_channel_quantization_pad_1_( int N, int T, @@ -2206,7 +2222,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_( int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* row_offsets = static_cast<int32_t*>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); // __attribute__((aligned(64))); int n_begin, n_end; int t_begin, t_end, h_begin, h_end; @@ -2282,6 +2298,8 @@ depthwise_3x3x3_per_channel_quantization_pad_1_( } // h } // t } // for each n + + FREE(row_offsets); }; // Dispatch A_SYMMETRIC and B_SYMMETRIC @@ -2304,7 +2322,7 @@ static void depthwise_3x3_pad_1_( const int32_t* bias, int thread_id, int num_threads) { - int32_t C_int32_temp[(K + 31) / 32 * 32]; + int32_t* C_int32_temp = new int32_t[(K + 31) / 32 * 32]; if (A_zero_point == 0 || col_offsets == nullptr) { if (B_zero_point == 0) { depthwise_3x3_pad_1_< @@ -2406,6 +2424,7 @@ static void depthwise_3x3_pad_1_( num_threads); } } + delete[] C_int32_temp; } // 
Dispatch HAS_BIAS @@ -2709,7 +2728,7 @@ static void depthwise_3x3x3_pad_1_( const int32_t* bias, int thread_id, int num_threads) { - int32_t C_int32_temp[(K + 31) / 32 * 32]; + int32_t* C_int32_temp = new int32_t[(K + 31) / 32 * 32]; if (A_zero_point == 0 || col_offsets == nullptr) { if (B_zero_point == 0) { depthwise_3x3x3_pad_1_< @@ -2819,6 +2838,7 @@ static void depthwise_3x3x3_pad_1_( num_threads); } } + delete[] C_int32_temp; } // Dispatch HAS_BIAS @@ -2975,7 +2995,7 @@ static void depthwise_3x3_per_channel_quantization_pad_1_( const int32_t* bias, int thread_id, int num_threads) { - int32_t C_int32_temp[(K + 31) / 32 * 32]; + int32_t* C_int32_temp = new int32_t[(K + 31) / 32 * 32]; if (A_zero_point == 0 || col_offsets == nullptr) { depthwise_3x3_per_channel_quantization_pad_1_< FUSE_RELU, @@ -3023,6 +3043,7 @@ static void depthwise_3x3_per_channel_quantization_pad_1_( thread_id, num_threads); } + delete[] C_int32_temp; } // Dispatch HAS_BIAS @@ -3329,7 +3350,7 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( const int32_t* bias, int thread_id, int num_threads) { - int32_t C_int32_temp[(K + 31) / 32 * 32]; + int32_t* C_int32_temp = new int32_t[(K + 31) / 32 * 32]; if (A_zero_point == 0 || col_offsets == nullptr) { depthwise_3x3x3_per_channel_quantization_pad_1_< FUSE_RELU, @@ -3381,6 +3402,7 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( thread_id, num_threads); } + delete[] C_int32_temp; } // Dispatch HAS_BIAS |