diff options
author | Young Jin Kim <youki@microsoft.com> | 2020-03-04 20:24:47 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-03-04 20:24:47 +0300 |
commit | f78e60988329b9207d086c743cafce1ac1bea3ab (patch) | |
tree | 1b8b79a4d59d95186f61103d17b59a7b668d15b7 | |
parent | 84e66a976046180187724aff60a236c5378fde7c (diff) | |
parent | b7a88185fbe7661d1216b182bd343a1a7fe2e544 (diff) |
Merge pull request #2 from XapaJIaMnu/restore_mac_support
Support mac again
-rw-r--r-- | include/fbgemm/Utils.h | 7 | ||||
-rw-r--r-- | src/FbgemmI8Depthwise3DAvx2.cc | 8 | ||||
-rw-r--r-- | src/FbgemmI8DepthwiseAvx2.cc | 6 | ||||
-rw-r--r-- | src/FbgemmI8Spmdm.cc | 12 | ||||
-rw-r--r-- | src/PackDepthwiseConvMatrixAvx2.cc | 16 | ||||
-rwxr-xr-x | src/Utils.cc | 24 |
6 files changed, 48 insertions, 25 deletions
diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h index 3976790..2533a10 100644 --- a/include/fbgemm/Utils.h +++ b/include/fbgemm/Utils.h @@ -13,16 +13,15 @@ #ifdef _MSC_VER # define ALWAYS_INLINE // __forceinline -# define ALIGNED_MALLOC(size, alignment) _aligned_malloc(size, alignment) -# define FREE(ptr) _aligned_free(ptr) #else # define ALWAYS_INLINE __attribute__((always_inline)) -# define ALIGNED_MALLOC(size, alignment) aligned_alloc(alignment, size) -# define FREE(ptr) free(ptr) #endif namespace fbgemm { +void * genericAlignedAlloc(size_t size, size_t alignment); +void genericFree(void * ptr); + /** * @brief Helper struct to type specialize for uint8 and int8 together. */ diff --git a/src/FbgemmI8Depthwise3DAvx2.cc b/src/FbgemmI8Depthwise3DAvx2.cc index 2114b20..7e12678 100644 --- a/src/FbgemmI8Depthwise3DAvx2.cc +++ b/src/FbgemmI8Depthwise3DAvx2.cc @@ -490,7 +490,7 @@ static inline ALWAYS_INLINE void depthwise_3x3x3_pad_1_( //int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); int32_t* row_offsets - = static_cast<int32_t*>(ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64)); + = static_cast<int32_t*>(genericAlignedAlloc((K + 31) / 32 * 32 * sizeof(int32_t), 64)); int n_begin, n_end; int t_begin, t_end, h_begin, h_end; @@ -568,7 +568,7 @@ static inline ALWAYS_INLINE void depthwise_3x3x3_pad_1_( } // h } // t } // for each n - FREE(row_offsets); + genericFree(row_offsets); }; template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE> @@ -606,7 +606,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_( //int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); int32_t* row_offsets - = static_cast<int32_t*>(ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64)); + = static_cast<int32_t*>(genericAlignedAlloc((K + 31) / 32 * 32 * sizeof(int32_t), 64)); int n_begin, n_end; int t_begin, t_end, h_begin, h_end; @@ -684,7 +684,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_( } // h } // t } // for each n - FREE(row_offsets); + genericFree(row_offsets); }; // Dispatch A_SYMMETRIC and B_SYMMETRIC diff --git a/src/FbgemmI8DepthwiseAvx2.cc b/src/FbgemmI8DepthwiseAvx2.cc index 994f206..8e875cc 100644 --- a/src/FbgemmI8DepthwiseAvx2.cc +++ b/src/FbgemmI8DepthwiseAvx2.cc @@ -344,7 +344,7 @@ static inline ALWAYS_INLINE void depthwise_3x3_pad_1_( int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t* row_offsets = static_cast<int32_t *>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); + int32_t* row_offsets = static_cast<int32_t *>(genericAlignedAlloc(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); int n_begin, n_end; int h_begin, h_end, w_begin, w_end; @@ -655,7 +655,7 @@ static inline ALWAYS_INLINE void depthwise_3x3_pad_1_( } } } // for each n - FREE(row_offsets); + genericFree(row_offsets); }; template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE> @@ -687,7 +687,7 @@ depthwise_3x3_per_channel_quantization_pad_1_( int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t* row_offsets = static_cast<int32_t*>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); // __attribute__((aligned(64))); + int32_t* row_offsets = static_cast<int32_t*>(genericAlignedAlloc(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); // __attribute__((aligned(64))); int n_begin, n_end; int h_begin, h_end, w_begin, w_end; diff --git a/src/FbgemmI8Spmdm.cc b/src/FbgemmI8Spmdm.cc index edcc4e8..1555a55 100644 --- a/src/FbgemmI8Spmdm.cc +++ b/src/FbgemmI8Spmdm.cc @@ -151,15 +151,15 @@ void CompressedSparseColumn::SpMDM( t_start = std::chrono::high_resolution_clock::now(); #endif - uint8_t* A_buffer = static_cast<uint8_t*>(ALIGNED_MALLOC(K * 32 * sizeof(uint8_t), 64)); - int32_t* C_buffer = static_cast<int32_t*>(ALIGNED_MALLOC(N * 32 * sizeof(int32_t), 64)); + uint8_t* A_buffer = static_cast<uint8_t*>(genericAlignedAlloc(K * 32 * sizeof(uint8_t), 64)); + int32_t* C_buffer = static_cast<int32_t*>(genericAlignedAlloc(N * 32 * sizeof(int32_t), 64)); // Take 32 rows at a time int i_end = block.row_start + block.row_size; for (int i1 = block.row_start; i1 < i_end; i1 += 32) { // Transpose 32 x K submatrix of A if (i_end - i1 < 32) { - uint8_t* A_temp_buffer = static_cast<uint8_t*>(ALIGNED_MALLOC(K * 32 * sizeof(uint8_t), 64)); + uint8_t* A_temp_buffer = static_cast<uint8_t*>(genericAlignedAlloc(K * 32 * sizeof(uint8_t), 64)); for (int i2 = 0; i2 < (i_end - i1) / 8 * 8; i2 += 8) { transpose_8rows(K, A + (i1 + i2) * lda, lda, A_buffer + i2, 32); } @@ -175,7 +175,7 @@ void CompressedSparseColumn::SpMDM( for (int i2 = (i_end - i1) / 8 * 8; i2 < 32; i2 += 8) { transpose_8rows(K, A_temp_buffer + i2 * K, K, A_buffer + i2, 32); } - FREE(A_temp_buffer); + genericFree(A_temp_buffer); } else { for (int i2 = 0; i2 < 32; i2 += 8) { transpose_8rows(K, A + (i1 + i2) * lda, lda, A_buffer + i2, 32); @@ -254,8 +254,8 @@ void CompressedSparseColumn::SpMDM( t_start = std::chrono::high_resolution_clock::now(); #endif - FREE(A_buffer); - FREE(C_buffer); + genericFree(A_buffer); + genericFree(C_buffer); } void CompressedSparseColumn::SparseConv( diff --git a/src/PackDepthwiseConvMatrixAvx2.cc b/src/PackDepthwiseConvMatrixAvx2.cc index a84c469..126b93c 100644 --- a/src/PackDepthwiseConvMatrixAvx2.cc +++ b/src/PackDepthwiseConvMatrixAvx2.cc @@ -36,7 +36,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( : K_(K), kernel_prod_(kernel_prod) { // Transpose the input matrix to make packing faster. int8_t* smat_transposed - = static_cast<int8_t*>(ALIGNED_MALLOC(K * kernel_prod * sizeof(int8_t), 64)); + = static_cast<int8_t*>(genericAlignedAlloc(K * kernel_prod * sizeof(int8_t), 64)); for (int i = 0; i < kernel_prod; ++i) { for (int j = 0; j < K; ++j) { @@ -101,7 +101,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( // (12, 8), (12, 9), (12, 10), zero, ..., (15, 8), (15, 9), (15, 10), zero // (28, 8), (28, 9), (28, 10), zero, ..., (31, 8), (31, 9), (31, 10), zero for (int k1 = 0; k1 < K; k1 += 32) { - __m256i* b_v = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod * sizeof(__m256i), 64)); + __m256i* b_v = static_cast<__m256i*>(genericAlignedAlloc(kernel_prod * sizeof(__m256i), 64)); int remainder = K - k1; if (remainder < 32) { __m256i mask_v = _mm256_loadu_si256( @@ -118,7 +118,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( } // Interleave 2 SIMD registers - __m256i* b_interleaved_epi16 = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod_aligned * sizeof(__m256i), 64)); + __m256i* b_interleaved_epi16 = static_cast<__m256i*>(genericAlignedAlloc(kernel_prod_aligned * sizeof(__m256i), 64)); __m256i zero_v = _mm256_setzero_si256(); for (int i = 0; i < kernel_prod_aligned / 2; ++i) { if (2 * i + 1 >= kernel_prod) { @@ -134,7 +134,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( } // Interleave 4 SIMD registers - __m256i* b_interleaved_epi32 = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod_aligned * sizeof(__m256i), 64)); + __m256i* b_interleaved_epi32 = static_cast<__m256i*>(genericAlignedAlloc(kernel_prod_aligned * sizeof(__m256i), 64)); for (int i = 0; i < kernel_prod_aligned / 4; ++i) { b_interleaved_epi32[4 * i] = _mm256_unpacklo_epi16( b_interleaved_epi16[4 * i], b_interleaved_epi16[4 * i + 2]); @@ -156,11 +156,11 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( b_interleaved_epi32[i]); } - FREE(b_v); - FREE(b_interleaved_epi16); - FREE(b_interleaved_epi32); + genericFree(b_v); + genericFree(b_interleaved_epi16); + genericFree(b_interleaved_epi32); } - FREE(smat_transposed); + genericFree(smat_transposed); } int PackedDepthWiseConvMatrix::addr(int r, int c) { diff --git a/src/Utils.cc b/src/Utils.cc index 2e88561..07d1f59 100755 --- a/src/Utils.cc +++ b/src/Utils.cc @@ -17,6 +17,30 @@ namespace fbgemm { +void * genericAlignedAlloc(size_t size, size_t align) { + void* aligned_mem = nullptr; + int ret; +#ifdef _MSC_VER + aligned_mem = _aligned_malloc(size, align); + ret = 0; +#else + ret = posix_memalign(&aligned_mem, align, size); +#endif + // Throw std::bad_alloc in the case of memory allocation failure. + if (ret || aligned_mem == nullptr) { + throw std::bad_alloc(); + } + return aligned_mem; +} + +void genericFree(void * p) { +#ifdef _MSC_VER + _aligned_free(p); +#else + free(p); +#endif +} + /** * @brief Compare the reference and test result matrix to check the correctness. * @param ref The buffer for the reference result matrix. |