From b7a88185fbe7661d1216b182bd343a1a7fe2e544 Mon Sep 17 00:00:00 2001
From: Nikolay Bogoychev
Date: Tue, 25 Feb 2020 18:10:59 +0000
Subject: Restore mac support

---
 include/fbgemm/Utils.h             |  7 +++----
 src/FbgemmI8Depthwise3DAvx2.cc     |  8 ++++----
 src/FbgemmI8DepthwiseAvx2.cc       |  6 +++---
 src/FbgemmI8Spmdm.cc               | 12 ++++++------
 src/PackDepthwiseConvMatrixAvx2.cc | 16 ++++++++--------
 src/Utils.cc                       | 24 ++++++++++++++++++++++++
 6 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h
index 3976790..2533a10 100644
--- a/include/fbgemm/Utils.h
+++ b/include/fbgemm/Utils.h
@@ -13,16 +13,15 @@
 
 #ifdef _MSC_VER
 # define ALWAYS_INLINE // __forceinline
-# define ALIGNED_MALLOC(size, alignment) _aligned_malloc(size, alignment)
-# define FREE(ptr) _aligned_free(ptr)
 #else
 # define ALWAYS_INLINE __attribute__((always_inline))
-# define ALIGNED_MALLOC(size, alignment) aligned_alloc(alignment, size)
-# define FREE(ptr) free(ptr)
 #endif
 
 namespace fbgemm {
 
+void * genericAlignedAlloc(size_t size, size_t alignment);
+void genericFree(void * ptr);
+
 /**
  * @brief Helper struct to type specialize for uint8 and int8 together.
  */
diff --git a/src/FbgemmI8Depthwise3DAvx2.cc b/src/FbgemmI8Depthwise3DAvx2.cc
index 2114b20..7e12678 100644
--- a/src/FbgemmI8Depthwise3DAvx2.cc
+++ b/src/FbgemmI8Depthwise3DAvx2.cc
@@ -490,7 +490,7 @@ static inline ALWAYS_INLINE void depthwise_3x3x3_pad_1_(
 
   //int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64)));
   int32_t* row_offsets
-      = static_cast<int32_t*>(ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64));
+      = static_cast<int32_t*>(genericAlignedAlloc((K + 31) / 32 * 32 * sizeof(int32_t), 64));
 
   int n_begin, n_end;
   int t_begin, t_end, h_begin, h_end;
@@ -568,7 +568,7 @@ static inline ALWAYS_INLINE void depthwise_3x3x3_pad_1_(
       } // h
     } // t
   } // for each n
-  FREE(row_offsets);
+  genericFree(row_offsets);
 };
 
 template <
@@ -606,7 +606,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_(
 
   //int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64)));
   int32_t* row_offsets
-      = static_cast<int32_t*>(ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64));
+      = static_cast<int32_t*>(genericAlignedAlloc((K + 31) / 32 * 32 * sizeof(int32_t), 64));
 
   int n_begin, n_end;
   int t_begin, t_end, h_begin, h_end;
@@ -684,7 +684,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_(
      } // h
    } // t
  } // for each n
-  FREE(row_offsets);
+  genericFree(row_offsets);
 };
 
 // Dispatch A_SYMMETRIC and B_SYMMETRIC
diff --git a/src/FbgemmI8DepthwiseAvx2.cc b/src/FbgemmI8DepthwiseAvx2.cc
index 994f206..8e875cc 100644
--- a/src/FbgemmI8DepthwiseAvx2.cc
+++ b/src/FbgemmI8DepthwiseAvx2.cc
@@ -344,7 +344,7 @@ static inline ALWAYS_INLINE void depthwise_3x3_pad_1_(
   int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1;
   const int8_t* Bp = B.PackedMat();
 
-  int32_t* row_offsets = static_cast<int32_t*>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64));
+  int32_t* row_offsets = static_cast<int32_t*>(genericAlignedAlloc(((K + 31) / 32 * 32)*sizeof(int32_t), 64));
 
   int n_begin, n_end;
   int h_begin, h_end, w_begin, w_end;
@@ -655,7 +655,7 @@ static inline ALWAYS_INLINE void depthwise_3x3_pad_1_(
       }
     }
   } // for each n
-  FREE(row_offsets);
+  genericFree(row_offsets);
 };
 
 template <
@@ -687,7 +687,7 @@ depthwise_3x3_per_channel_quantization_pad_1_(
   int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1;
   const int8_t* Bp = B.PackedMat();
 
-  int32_t* row_offsets = static_cast<int32_t*>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); // __attribute__((aligned(64)));
+  int32_t* row_offsets = static_cast<int32_t*>(genericAlignedAlloc(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); // __attribute__((aligned(64)));
 
   int n_begin, n_end;
   int h_begin, h_end, w_begin, w_end;
diff --git a/src/FbgemmI8Spmdm.cc b/src/FbgemmI8Spmdm.cc
index edcc4e8..1555a55 100644
--- a/src/FbgemmI8Spmdm.cc
+++ b/src/FbgemmI8Spmdm.cc
@@ -151,15 +151,15 @@ void CompressedSparseColumn::SpMDM(
   t_start = std::chrono::high_resolution_clock::now();
 #endif
 
-  uint8_t* A_buffer = static_cast<uint8_t*>(ALIGNED_MALLOC(K * 32 * sizeof(uint8_t), 64));
-  int32_t* C_buffer = static_cast<int32_t*>(ALIGNED_MALLOC(N * 32 * sizeof(int32_t), 64));
+  uint8_t* A_buffer = static_cast<uint8_t*>(genericAlignedAlloc(K * 32 * sizeof(uint8_t), 64));
+  int32_t* C_buffer = static_cast<int32_t*>(genericAlignedAlloc(N * 32 * sizeof(int32_t), 64));
 
   // Take 32 rows at a time
   int i_end = block.row_start + block.row_size;
   for (int i1 = block.row_start; i1 < i_end; i1 += 32) {
     // Transpose 32 x K submatrix of A
     if (i_end - i1 < 32) {
-      uint8_t* A_temp_buffer = static_cast<uint8_t*>(ALIGNED_MALLOC(K * 32 * sizeof(uint8_t), 64));
+      uint8_t* A_temp_buffer = static_cast<uint8_t*>(genericAlignedAlloc(K * 32 * sizeof(uint8_t), 64));
       for (int i2 = 0; i2 < (i_end - i1) / 8 * 8; i2 += 8) {
         transpose_8rows(K, A + (i1 + i2) * lda, lda, A_buffer + i2, 32);
       }
@@ -175,7 +175,7 @@ void CompressedSparseColumn::SpMDM(
       for (int i2 = (i_end - i1) / 8 * 8; i2 < 32; i2 += 8) {
         transpose_8rows(K, A_temp_buffer + i2 * K, K, A_buffer + i2, 32);
       }
-      FREE(A_temp_buffer);
+      genericFree(A_temp_buffer);
     } else {
       for (int i2 = 0; i2 < 32; i2 += 8) {
         transpose_8rows(K, A + (i1 + i2) * lda, lda, A_buffer + i2, 32);
@@ -254,8 +254,8 @@ void CompressedSparseColumn::SpMDM(
   t_start = std::chrono::high_resolution_clock::now();
 #endif
 
-  FREE(A_buffer);
-  FREE(C_buffer);
+  genericFree(A_buffer);
+  genericFree(C_buffer);
 }
 
 void CompressedSparseColumn::SparseConv(
diff --git a/src/PackDepthwiseConvMatrixAvx2.cc b/src/PackDepthwiseConvMatrixAvx2.cc
index a84c469..126b93c 100644
--- a/src/PackDepthwiseConvMatrixAvx2.cc
+++ b/src/PackDepthwiseConvMatrixAvx2.cc
@@ -36,7 +36,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
     : K_(K), kernel_prod_(kernel_prod) {
   // Transpose the input matrix to make packing faster.
   int8_t* smat_transposed
-      = static_cast<int8_t*>(ALIGNED_MALLOC(K * kernel_prod * sizeof(int8_t), 64));
+      = static_cast<int8_t*>(genericAlignedAlloc(K * kernel_prod * sizeof(int8_t), 64));
   for (int i = 0; i < kernel_prod; ++i) {
     for (int j = 0; j < K; ++j) {
@@ -101,7 +101,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
   // (12, 8), (12, 9), (12, 10), zero, ..., (15, 8), (15, 9), (15, 10), zero
   // (28, 8), (28, 9), (28, 10), zero, ..., (31, 8), (31, 9), (31, 10), zero
   for (int k1 = 0; k1 < K; k1 += 32) {
-    __m256i* b_v = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod * sizeof(__m256i), 64));
+    __m256i* b_v = static_cast<__m256i*>(genericAlignedAlloc(kernel_prod * sizeof(__m256i), 64));
     int remainder = K - k1;
     if (remainder < 32) {
       __m256i mask_v = _mm256_loadu_si256(
@@ -118,7 +118,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
     }
 
     // Interleave 2 SIMD registers
-    __m256i* b_interleaved_epi16 = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod_aligned * sizeof(__m256i), 64));
+    __m256i* b_interleaved_epi16 = static_cast<__m256i*>(genericAlignedAlloc(kernel_prod_aligned * sizeof(__m256i), 64));
     __m256i zero_v = _mm256_setzero_si256();
     for (int i = 0; i < kernel_prod_aligned / 2; ++i) {
       if (2 * i + 1 >= kernel_prod) {
@@ -134,7 +134,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
     }
 
     // Interleave 4 SIMD registers
-    __m256i* b_interleaved_epi32 = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod_aligned * sizeof(__m256i), 64));
+    __m256i* b_interleaved_epi32 = static_cast<__m256i*>(genericAlignedAlloc(kernel_prod_aligned * sizeof(__m256i), 64));
     for (int i = 0; i < kernel_prod_aligned / 4; ++i) {
       b_interleaved_epi32[4 * i] = _mm256_unpacklo_epi16(
           b_interleaved_epi16[4 * i], b_interleaved_epi16[4 * i + 2]);
@@ -156,11 +156,11 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
           b_interleaved_epi32[i]);
     }
 
-    FREE(b_v);
-    FREE(b_interleaved_epi16);
-    FREE(b_interleaved_epi32);
+    genericFree(b_v);
+    genericFree(b_interleaved_epi16);
+    genericFree(b_interleaved_epi32);
   }
-  FREE(smat_transposed);
+  genericFree(smat_transposed);
 }
 
 int PackedDepthWiseConvMatrix::addr(int r, int c) {
diff --git a/src/Utils.cc b/src/Utils.cc
index 2e88561..07d1f59 100755
--- a/src/Utils.cc
+++ b/src/Utils.cc
@@ -17,6 +17,30 @@
 
 namespace fbgemm {
 
+void * genericAlignedAlloc(size_t size, size_t align) {
+  void* aligned_mem = nullptr;
+  int ret;
+#ifdef _MSC_VER
+  aligned_mem = _aligned_malloc(size, align);
+  ret = 0;
+#else
+  ret = posix_memalign(&aligned_mem, align, size);
+#endif
+  // Throw std::bad_alloc in the case of memory allocation failure.
+  if (ret || aligned_mem == nullptr) {
+    throw std::bad_alloc();
+  }
+  return aligned_mem;
+}
+
+void genericFree(void * p) {
+#ifdef _MSC_VER
+  _aligned_free(p);
+#else
+  free(p);
+#endif
+}
+
 /**
  * @brief Compare the reference and test result matrix to check the correctness.
  * @param ref The buffer for the reference result matrix.
--
cgit v1.2.3
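
A minimal usage sketch follows (not part of the patch above). It shows how a caller is expected to pair the new genericAlignedAlloc/genericFree helpers, mirroring the (K + 31) / 32 * 32 rounding and 64-byte alignment used by the kernels in the diff; the include path and the value of K are illustrative assumptions, not taken from the patch.

    #include <cstdint>
    #include <cstddef>
    #include "fbgemm/Utils.h"  // assumed include path for the declarations added above

    int main() {
      // Round the element count up to a multiple of 32 and request a
      // cache-line (64-byte) aligned buffer, as the depthwise kernels do.
      std::size_t K = 100;  // illustrative channel count
      std::size_t elems = (K + 31) / 32 * 32;
      int32_t* row_offsets = static_cast<int32_t*>(
          fbgemm::genericAlignedAlloc(elems * sizeof(int32_t), 64));

      // ... fill and use row_offsets ...

      // Release with genericFree so the matching deallocator is chosen per
      // platform: _aligned_free on MSVC, free() elsewhere.
      fbgemm::genericFree(row_offsets);
      return 0;
    }

Per the Utils.cc hunk, genericAlignedAlloc throws std::bad_alloc on failure rather than returning nullptr, so the caller does not need a null check after the cast.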