diff options
author | Young Jin Kim <youki@microsoft.com> | 2019-09-25 19:43:01 +0300 |
---|---|---|
committer | Young Jin Kim <youki@microsoft.com> | 2019-09-25 19:43:01 +0300 |
commit | d02815ffedbc46a3f8af1a3884efefd83668a401 (patch) | |
tree | c48612db7a801b26b1a966b00fea3d35b17e23ce | |
parent | 7bd598c9e97871e42c19449fddf7bd317898eb58 (diff) |
Fix Windows build errors
-rw-r--r-- | include/fbgemm/Utils.h | 2 | ||||
-rw-r--r-- | src/FbgemmI8Depthwise3DAvx2.cc | 26 | ||||
-rw-r--r-- | src/FbgemmI8DepthwiseAvx2-inl.h | 15 | ||||
-rw-r--r-- | src/PackDepthwiseConvMatrixAvx2.cc | 28 |
4 files changed, 44 insertions, 27 deletions
diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h index 3976790..7cb86d4 100644 --- a/include/fbgemm/Utils.h +++ b/include/fbgemm/Utils.h @@ -16,7 +16,7 @@ # define ALIGNED_MALLOC(size, alignment) _aligned_malloc(size, alignment) # define FREE(ptr) _aligned_free(ptr) #else -# define ALWAYS_INLINE __attribute__((always_inline)) +# define ALWAYS_INLINE ALWAYS_INLINE # define ALIGNED_MALLOC(size, alignment) aligned_alloc(alignment, size) # define FREE(ptr) free(ptr) #endif diff --git a/src/FbgemmI8Depthwise3DAvx2.cc b/src/FbgemmI8Depthwise3DAvx2.cc index 925d265..2114b20 100644 --- a/src/FbgemmI8Depthwise3DAvx2.cc +++ b/src/FbgemmI8Depthwise3DAvx2.cc @@ -19,7 +19,7 @@ template < bool SUM_A, bool REMAINDER = false, bool PER_CHANNEL_QUANTIZATION = false> -static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_( +static inline ALWAYS_INLINE void inner_prod_3x3x3_packed_( int T, int H, int W, @@ -272,7 +272,7 @@ template < bool A_SYMMETRIC, bool B_SYMMETRIC, typename BIAS_TYPE> -static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_( +static inline ALWAYS_INLINE void depthwise_3x3x3_kernel_( int T, int H, int W, @@ -359,7 +359,7 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_( } template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE> -static inline __attribute__((always_inline)) void +static inline ALWAYS_INLINE void depthwise_3x3x3_per_channel_quantization_kernel_( int T, int H, @@ -457,7 +457,7 @@ template < bool A_SYMMETRIC, bool B_SYMMETRIC, typename BIAS_TYPE> -static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( +static inline ALWAYS_INLINE void depthwise_3x3x3_pad_1_( int N, int T, int H, @@ -488,7 +488,9 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + 
//int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* row_offsets + = static_cast<int32_t*>(ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64)); int n_begin, n_end; int t_begin, t_end, h_begin, h_end; @@ -566,10 +568,11 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( } // h } // t } // for each n + FREE(row_offsets); }; template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE> -static inline __attribute__((always_inline)) void +static inline ALWAYS_INLINE void depthwise_3x3x3_per_channel_quantization_pad_1_( int N, int T, @@ -601,7 +604,9 @@ depthwise_3x3x3_per_channel_quantization_pad_1_( int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + //int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* row_offsets + = static_cast<int32_t*>(ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64)); int n_begin, n_end; int t_begin, t_end, h_begin, h_end; @@ -679,6 +684,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_( } // h } // t } // for each n + FREE(row_offsets); }; // Dispatch A_SYMMETRIC and B_SYMMETRIC @@ -704,7 +710,7 @@ static void depthwise_3x3x3_pad_1_( float act_times_w_scale, int thread_id, int num_threads) { - int32_t C_int32_temp[(K + 31) / 32 * 32]; + int32_t* C_int32_temp = new int32_t[(K + 31) / 32 * 32]; if (A_zero_point == 0 || col_offsets == nullptr) { if (B_zero_point == 0) { depthwise_3x3x3_pad_1_< @@ -822,6 +828,7 @@ static void depthwise_3x3x3_pad_1_( num_threads); } } + delete[] C_int32_temp; } // Dispatch HAS_BIAS @@ -1004,7 +1011,7 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( const float* act_times_w_scale, int thread_id, int num_threads) { - int32_t C_int32_temp[(K + 31) / 32 * 32]; + int32_t* C_int32_temp = new int32_t[(K + 31) / 32 * 32]; if (A_zero_point == 0 || col_offsets == nullptr) { 
depthwise_3x3x3_per_channel_quantization_pad_1_< FUSE_RELU, @@ -1060,6 +1067,7 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( thread_id, num_threads); } + delete[] C_int32_temp; } // Dispatch HAS_BIAS diff --git a/src/FbgemmI8DepthwiseAvx2-inl.h b/src/FbgemmI8DepthwiseAvx2-inl.h index 7ad39fc..aee9ab3 100644 --- a/src/FbgemmI8DepthwiseAvx2-inl.h +++ b/src/FbgemmI8DepthwiseAvx2-inl.h @@ -13,6 +13,7 @@ #include <type_traits> // for is_same #include <immintrin.h> +#include "fbgemm/Utils.h" namespace fbgemm { @@ -40,7 +41,7 @@ static int masks[8][8] = { // c2_v: c[8:12], c[24:28] // c3_v: c[12:16], c[28:32] template <bool SUM_A = false> -static inline __attribute__((always_inline)) void madd_epi16x4_packed( +static inline ALWAYS_INLINE void madd_epi16x4_packed( __m256i a0_v, __m256i a1_v, __m256i a2_v, @@ -99,7 +100,7 @@ static inline __attribute__((always_inline)) void madd_epi16x4_packed( // c2_v: c[8:12], c[24:28] // c3_v: c[12:16], c[28:32] template <bool SUM_A = false> -static inline __attribute__((always_inline)) void madd_epi16x3_packed( +static inline ALWAYS_INLINE void madd_epi16x3_packed( __m256i a0_v, __m256i a1_v, __m256i a2_v, @@ -159,7 +160,7 @@ static inline __attribute__((always_inline)) void madd_epi16x3_packed( // c2_v: c[16:20], c[20:24] // c3_v: c[24:28], c[28:32] template <bool SUM_A = false> -static inline __attribute__((always_inline)) void madd_epi16x2_packed( +static inline ALWAYS_INLINE void madd_epi16x2_packed( __m256i a0_v, __m256i a1_v, const __m256i* b, @@ -200,7 +201,7 @@ static inline __attribute__((always_inline)) void madd_epi16x2_packed( // c2_v: c[16:20], c[20:24] // c3_v: c[24:28], c[28:32] template <bool SUM_A = false> -static inline __attribute__((always_inline)) void madd_epi16_packed( +static inline ALWAYS_INLINE void madd_epi16_packed( __m256i a_v, const __m256i* b, __m256i* c0_v, @@ -235,7 +236,7 @@ static inline __attribute__((always_inline)) void madd_epi16_packed( // K is the number of accumulations we're 
doing template <int K, bool SUM_A = false, bool REMAINDER = false, bool ACC = false> -static inline __attribute__((always_inline)) void inner_prod_packed_( +static inline ALWAYS_INLINE void inner_prod_packed_( const __m256i* a_v, const __m256i* Bp, std::int32_t* C, @@ -383,7 +384,7 @@ template < bool A_SYMMETRIC, bool B_SYMMETRIC, typename BIAS_TYPE> -static inline __attribute__((always_inline)) void requantize_( +static inline ALWAYS_INLINE void requantize_( std::int32_t A_zero_point, const float* C_multiplier, std::int32_t C_zero_point, @@ -688,7 +689,7 @@ static inline __attribute__((always_inline)) void requantize_( } template <bool REMAINDER> -static inline __attribute__((always_inline)) __m256i load_a( +static inline ALWAYS_INLINE __m256i load_a( const std::uint8_t* A, __m256i mask_v) { if (REMAINDER) { diff --git a/src/PackDepthwiseConvMatrixAvx2.cc b/src/PackDepthwiseConvMatrixAvx2.cc index 0e17bcd..ab2e1f2 100644 --- a/src/PackDepthwiseConvMatrixAvx2.cc +++ b/src/PackDepthwiseConvMatrixAvx2.cc @@ -5,6 +5,8 @@ * LICENSE file in the root directory of this source tree. */ #include "fbgemm/FbgemmI8DepthwiseAvx2.h" +#include "fbgemm/Utils.h" +#include "fbgemm/Fbgemm.h" #include <immintrin.h> @@ -33,7 +35,9 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( const int8_t* smat) : K_(K), kernel_prod_(kernel_prod) { // Transpose the input matrix to make packing faster. 
- alignas(64) int8_t smat_transposed[K * kernel_prod]; + int8_t* smat_transposed + = static_cast<int8_t*>(ALIGNED_MALLOC(K * kernel_prod * sizeof(int8_t), 64)); + for (int i = 0; i < kernel_prod; ++i) { for (int j = 0; j < K; ++j) { smat_transposed[i * K + j] = smat[i + j * kernel_prod]; @@ -42,12 +46,11 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( // Allocate packed arrays int kernel_prod_aligned = (kernel_prod + 1) / 2 * 2; - // pmat_ = static_cast<int8_t *>(fbgemmAlignedAlloc( - // 64, ((K + 31) / 32) * KERNEL_PROD_ALIGNED * 32 * sizeof(int8_t))); - posix_memalign( - (void**)&pmat_, - 64, - ((K + 31) / 32) * kernel_prod_aligned * 32 * sizeof(int8_t)); + pmat_ = static_cast<int8_t *>(fbgemmAlignedAlloc(64, ((K + 31) / 32) * kernel_prod_aligned * 32 * sizeof(int8_t))); + //posix_memalign( + // (void**)&pmat_, + // 64, + // ((K + 31) / 32) * kernel_prod_aligned * 32 * sizeof(int8_t)); // Pack input matrix // The layout is optimized to use vpmaddubsw efficiently (see @@ -102,7 +105,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( // (12, 8), (12, 9), (12, 10), zero, ..., (15, 8), (15, 9), (15, 10), zero // (28, 8), (28, 9), (28, 10), zero, ..., (31, 8), (31, 9), (31, 10), zero for (int k1 = 0; k1 < K; k1 += 32) { - __m256i b_v[kernel_prod]; + __m256i* b_v = new __m256i[kernel_prod]; int remainder = K - k1; if (remainder < 32) { __m256i mask_v = _mm256_loadu_si256( @@ -119,7 +122,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( } // Interleave 2 SIMD registers - __m256i b_interleaved_epi16[kernel_prod_aligned]; + __m256i* b_interleaved_epi16 = new __m256i[kernel_prod_aligned]; __m256i zero_v = _mm256_setzero_si256(); for (int i = 0; i < kernel_prod_aligned / 2; ++i) { if (2 * i + 1 >= kernel_prod) { @@ -135,7 +138,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( } // Interleave 4 SIMD registers - __m256i b_interleaved_epi32[kernel_prod_aligned]; + __m256i* b_interleaved_epi32 = new __m256i[kernel_prod_aligned]; for (int i = 
0; i < kernel_prod_aligned / 4; ++i) { b_interleaved_epi32[4 * i] = _mm256_unpacklo_epi16( b_interleaved_epi16[4 * i], b_interleaved_epi16[4 * i + 2]); @@ -156,7 +159,12 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( &pmat_[((k1 / 32) * kernel_prod_aligned + i) * 32]), b_interleaved_epi32[i]); } + + delete[] b_v; + delete[] b_interleaved_epi16; + delete[] b_interleaved_epi32; } + FREE(smat_transposed); } int PackedDepthWiseConvMatrix::addr(int r, int c) { |