From b7a88185fbe7661d1216b182bd343a1a7fe2e544 Mon Sep 17 00:00:00 2001
From: Nikolay Bogoychev
Date: Tue, 25 Feb 2020 18:10:59 +0000
Subject: Restore mac support

---
 include/fbgemm/Utils.h             |  7 +++----
 src/FbgemmI8Depthwise3DAvx2.cc     |  8 ++++----
 src/FbgemmI8DepthwiseAvx2.cc       |  6 +++---
 src/FbgemmI8Spmdm.cc               | 12 ++++++------
 src/PackDepthwiseConvMatrixAvx2.cc | 16 ++++++++--------
 src/Utils.cc                       | 24 ++++++++++++++++++++++++
 6 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h
index 3976790..2533a10 100644
--- a/include/fbgemm/Utils.h
+++ b/include/fbgemm/Utils.h
@@ -13,16 +13,15 @@
 
 #ifdef _MSC_VER
 # define ALWAYS_INLINE // __forceinline
-# define ALIGNED_MALLOC(size, alignment) _aligned_malloc(size, alignment)
-# define FREE(ptr) _aligned_free(ptr)
 #else
 # define ALWAYS_INLINE __attribute__((always_inline))
-# define ALIGNED_MALLOC(size, alignment) aligned_alloc(alignment, size)
-# define FREE(ptr) free(ptr)
 #endif
 
 namespace fbgemm {
 
+void * genericAlignedAlloc(size_t size, size_t alignment);
+void genericFree(void * ptr);
+
 /**
  * @brief Helper struct to type specialize for uint8 and int8 together.
  */
diff --git a/src/FbgemmI8Depthwise3DAvx2.cc b/src/FbgemmI8Depthwise3DAvx2.cc
index 2114b20..7e12678 100644
--- a/src/FbgemmI8Depthwise3DAvx2.cc
+++ b/src/FbgemmI8Depthwise3DAvx2.cc
@@ -490,7 +490,7 @@ static inline ALWAYS_INLINE void depthwise_3x3x3_pad_1_(
 
   //int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64)));
   int32_t* row_offsets
-      = static_cast<int32_t*>(ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64));
+      = static_cast<int32_t*>(genericAlignedAlloc((K + 31) / 32 * 32 * sizeof(int32_t), 64));
 
   int n_begin, n_end;
   int t_begin, t_end, h_begin, h_end;
@@ -568,7 +568,7 @@ static inline ALWAYS_INLINE void depthwise_3x3x3_pad_1_(
       } // h
     } // t
   } // for each n
-  FREE(row_offsets);
+  genericFree(row_offsets);
 };
 
 template <
@@ -606,7 +606,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_(
 
   //int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64)));
   int32_t* row_offsets
-      = static_cast<int32_t*>(ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64));
+      = static_cast<int32_t*>(genericAlignedAlloc((K + 31) / 32 * 32 * sizeof(int32_t), 64));
 
   int n_begin, n_end;
   int t_begin, t_end, h_begin, h_end;
@@ -684,7 +684,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_(
      } // h
    } // t
  } // for each n
-  FREE(row_offsets);
+  genericFree(row_offsets);
 };
 
 // Dispatch A_SYMMETRIC and B_SYMMETRIC
diff --git a/src/FbgemmI8DepthwiseAvx2.cc b/src/FbgemmI8DepthwiseAvx2.cc
index 994f206..8e875cc 100644
--- a/src/FbgemmI8DepthwiseAvx2.cc
+++ b/src/FbgemmI8DepthwiseAvx2.cc
@@ -344,7 +344,7 @@ static inline ALWAYS_INLINE void depthwise_3x3_pad_1_(
   int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1;
   const int8_t* Bp = B.PackedMat();
 
-  int32_t* row_offsets = static_cast<int32_t*>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64));
+  int32_t* row_offsets = static_cast<int32_t*>(genericAlignedAlloc(((K + 31) / 32 * 32)*sizeof(int32_t), 64));
 
   int n_begin, n_end;
   int h_begin, h_end, w_begin, w_end;
@@ -655,7 +655,7 @@ static inline ALWAYS_INLINE void depthwise_3x3_pad_1_(
       }
     }
   } // for each n
-  FREE(row_offsets);
+  genericFree(row_offsets);
 };
 
 template <
@@ -687,7 +687,7 @@ depthwise_3x3_per_channel_quantization_pad_1_(
   int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1;
   const int8_t* Bp = B.PackedMat();
 
-  int32_t* row_offsets = static_cast<int32_t*>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); // __attribute__((aligned(64)));
+  int32_t* row_offsets = static_cast<int32_t*>(genericAlignedAlloc(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); // __attribute__((aligned(64)));
 
   int n_begin, n_end;
   int h_begin, h_end, w_begin, w_end;
diff --git a/src/FbgemmI8Spmdm.cc b/src/FbgemmI8Spmdm.cc
index edcc4e8..1555a55 100644
--- a/src/FbgemmI8Spmdm.cc
+++ b/src/FbgemmI8Spmdm.cc
@@ -151,15 +151,15 @@ void CompressedSparseColumn::SpMDM(
   t_start = std::chrono::high_resolution_clock::now();
 #endif
 
-  uint8_t* A_buffer = static_cast<uint8_t*>(ALIGNED_MALLOC(K * 32 * sizeof(uint8_t), 64));
-  int32_t* C_buffer = static_cast<int32_t*>(ALIGNED_MALLOC(N * 32 * sizeof(int32_t), 64));
+  uint8_t* A_buffer = static_cast<uint8_t*>(genericAlignedAlloc(K * 32 * sizeof(uint8_t), 64));
+  int32_t* C_buffer = static_cast<int32_t*>(genericAlignedAlloc(N * 32 * sizeof(int32_t), 64));
 
   // Take 32 rows at a time
   int i_end = block.row_start + block.row_size;
   for (int i1 = block.row_start; i1 < i_end; i1 += 32) {
     // Transpose 32 x K submatrix of A
     if (i_end - i1 < 32) {
-      uint8_t* A_temp_buffer = static_cast<uint8_t*>(ALIGNED_MALLOC(K * 32 * sizeof(uint8_t), 64));
+      uint8_t* A_temp_buffer = static_cast<uint8_t*>(genericAlignedAlloc(K * 32 * sizeof(uint8_t), 64));
       for (int i2 = 0; i2 < (i_end - i1) / 8 * 8; i2 += 8) {
         transpose_8rows(K, A + (i1 + i2) * lda, lda, A_buffer + i2, 32);
       }
@@ -175,7 +175,7 @@ void CompressedSparseColumn::SpMDM(
       for (int i2 = (i_end - i1) / 8 * 8; i2 < 32; i2 += 8) {
         transpose_8rows(K, A_temp_buffer + i2 * K, K, A_buffer + i2, 32);
       }
-      FREE(A_temp_buffer);
+      genericFree(A_temp_buffer);
     } else {
       for (int i2 = 0; i2 < 32; i2 += 8) {
         transpose_8rows(K, A + (i1 + i2) * lda, lda, A_buffer + i2, 32);
@@ -254,8 +254,8 @@ void CompressedSparseColumn::SpMDM(
   t_start = std::chrono::high_resolution_clock::now();
 #endif
 
-  FREE(A_buffer);
-  FREE(C_buffer);
+  genericFree(A_buffer);
+  genericFree(C_buffer);
 }
 
 void CompressedSparseColumn::SparseConv(
diff --git a/src/PackDepthwiseConvMatrixAvx2.cc b/src/PackDepthwiseConvMatrixAvx2.cc
index a84c469..126b93c 100644
--- a/src/PackDepthwiseConvMatrixAvx2.cc
+++ b/src/PackDepthwiseConvMatrixAvx2.cc
@@ -36,7 +36,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
     : K_(K), kernel_prod_(kernel_prod) {
   // Transpose the input matrix to make packing faster.
   int8_t* smat_transposed
-      = static_cast<int8_t*>(ALIGNED_MALLOC(K * kernel_prod * sizeof(int8_t), 64));
+      = static_cast<int8_t*>(genericAlignedAlloc(K * kernel_prod * sizeof(int8_t), 64));
   for (int i = 0; i < kernel_prod; ++i) {
     for (int j = 0; j < K; ++j) {
@@ -101,7 +101,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
   // (12, 8), (12, 9), (12, 10), zero, ..., (15, 8), (15, 9), (15, 10), zero
   // (28, 8), (28, 9), (28, 10), zero, ..., (31, 8), (31, 9), (31, 10), zero
   for (int k1 = 0; k1 < K; k1 += 32) {
-    __m256i* b_v = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod * sizeof(__m256i), 64));
+    __m256i* b_v = static_cast<__m256i*>(genericAlignedAlloc(kernel_prod * sizeof(__m256i), 64));
     int remainder = K - k1;
     if (remainder < 32) {
       __m256i mask_v = _mm256_loadu_si256(
@@ -118,7 +118,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
     }
 
     // Interleave 2 SIMD registers
-    __m256i* b_interleaved_epi16 = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod_aligned * sizeof(__m256i), 64));
+    __m256i* b_interleaved_epi16 = static_cast<__m256i*>(genericAlignedAlloc(kernel_prod_aligned * sizeof(__m256i), 64));
     __m256i zero_v = _mm256_setzero_si256();
     for (int i = 0; i < kernel_prod_aligned / 2; ++i) {
       if (2 * i + 1 >= kernel_prod) {
@@ -134,7 +134,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
     }
 
     // Interleave 4 SIMD registers
-    __m256i* b_interleaved_epi32 = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod_aligned * sizeof(__m256i), 64));
+    __m256i* b_interleaved_epi32 = static_cast<__m256i*>(genericAlignedAlloc(kernel_prod_aligned * sizeof(__m256i), 64));
     for (int i = 0; i < kernel_prod_aligned / 4; ++i) {
       b_interleaved_epi32[4 * i] = _mm256_unpacklo_epi16(
           b_interleaved_epi16[4 * i], b_interleaved_epi16[4 * i + 2]);
@@ -156,11 +156,11 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
           b_interleaved_epi32[i]);
     }
 
-    FREE(b_v);
-    FREE(b_interleaved_epi16);
-    FREE(b_interleaved_epi32);
+    genericFree(b_v);
+    genericFree(b_interleaved_epi16);
+    genericFree(b_interleaved_epi32);
   }
-  FREE(smat_transposed);
+  genericFree(smat_transposed);
 }
 
 int PackedDepthWiseConvMatrix::addr(int r, int c) {
diff --git a/src/Utils.cc b/src/Utils.cc
index 2e88561..07d1f59 100755
--- a/src/Utils.cc
+++ b/src/Utils.cc
@@ -17,6 +17,30 @@
 
 namespace fbgemm {
 
+void * genericAlignedAlloc(size_t size, size_t align) {
+  void* aligned_mem = nullptr;
+  int ret;
+#ifdef _MSC_VER
+  aligned_mem = _aligned_malloc(size, align);
+  ret = 0;
+#else
+  ret = posix_memalign(&aligned_mem, align, size);
+#endif
+  // Throw std::bad_alloc in the case of memory allocation failure.
+  if (ret || aligned_mem == nullptr) {
+    throw std::bad_alloc();
+  }
+  return aligned_mem;
+}
+
+void genericFree(void * p) {
+#ifdef _MSC_VER
+  _aligned_free(p);
+#else
+  free(p);
+#endif
+}
+
 /**
  * @brief Compare the reference and test result matrix to check the correctness.
  * @param ref The buffer for the reference result matrix.
--
cgit v1.2.3
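
A minimal usage sketch follows (not part of the patch above). It shows how a caller is expected to pair the new genericAlignedAlloc/genericFree helpers, mirroring the (K + 31) / 32 * 32 rounding and 64-byte alignment used by the kernels in the diff; the include path and the value of K are illustrative assumptions, not taken from the patch.

    #include <cstdint>
    #include <cstddef>
    #include "fbgemm/Utils.h"  // assumed include path for the declarations added above

    int main() {
      // Round the element count up to a multiple of 32 and request a
      // cache-line (64-byte) aligned buffer, as the depthwise kernels do.
      std::size_t K = 100;  // illustrative channel count
      std::size_t elems = (K + 31) / 32 * 32;
      int32_t* row_offsets = static_cast<int32_t*>(
          fbgemm::genericAlignedAlloc(elems * sizeof(int32_t), 64));

      // ... fill and use row_offsets ...

      // Release with genericFree so the matching deallocator is chosen per
      // platform: _aligned_free on MSVC, free() elsewhere.
      fbgemm::genericFree(row_offsets);
      return 0;
    }

Per the Utils.cc hunk, genericAlignedAlloc throws std::bad_alloc on failure rather than returning nullptr, so the caller does not need a null check after the cast.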