github.com/marian-nmt/FBGEMM.git

author    Young Jin Kim <youki@microsoft.com>  2020-03-04 20:24:47 +0300
committer GitHub <noreply@github.com>          2020-03-04 20:24:47 +0300
commit    f78e60988329b9207d086c743cafce1ac1bea3ab (patch)
tree      1b8b79a4d59d95186f61103d17b59a7b668d15b7
parent    84e66a976046180187724aff60a236c5378fde7c (diff)
parent    b7a88185fbe7661d1216b182bd343a1a7fe2e544 (diff)
Merge pull request #2 from XapaJIaMnu/restore_mac_support
Support mac again
-rw-r--r--  include/fbgemm/Utils.h             |  7
-rw-r--r--  src/FbgemmI8Depthwise3DAvx2.cc     |  8
-rw-r--r--  src/FbgemmI8DepthwiseAvx2.cc       |  6
-rw-r--r--  src/FbgemmI8Spmdm.cc               | 12
-rw-r--r--  src/PackDepthwiseConvMatrixAvx2.cc | 16
-rwxr-xr-x  src/Utils.cc                       | 24
6 files changed, 48 insertions, 25 deletions
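The patch replaces the platform-specific ALIGNED_MALLOC/FREE macros with the functions genericAlignedAlloc/genericFree declared in include/fbgemm/Utils.h, most likely because aligned_alloc is not available on older macOS SDKs while posix_memalign is. A minimal caller-side sketch of the change follows; the function name, buffer name, and element count are illustrative and not taken from the patch, only the fbgemm helpers and the include path are.

// Illustrative only: old macro call sites vs. the new helper functions.
#include <cstdint>
#include "fbgemm/Utils.h"

void example(int K) {
  // Before (removed by this patch):
  //   int32_t* buf = static_cast<int32_t*>(ALIGNED_MALLOC(K * sizeof(int32_t), 64));
  //   FREE(buf);
  // After: portable helpers declared in fbgemm/Utils.h, 64-byte alignment.
  int32_t* buf = static_cast<int32_t*>(
      fbgemm::genericAlignedAlloc(K * sizeof(int32_t), 64));
  // ... use buf ...
  fbgemm::genericFree(buf);
}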
diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h
index 3976790..2533a10 100644
--- a/include/fbgemm/Utils.h
+++ b/include/fbgemm/Utils.h
@@ -13,16 +13,15 @@
#ifdef _MSC_VER
# define ALWAYS_INLINE // __forceinline
-# define ALIGNED_MALLOC(size, alignment) _aligned_malloc(size, alignment)
-# define FREE(ptr) _aligned_free(ptr)
#else
# define ALWAYS_INLINE __attribute__((always_inline))
-# define ALIGNED_MALLOC(size, alignment) aligned_alloc(alignment, size)
-# define FREE(ptr) free(ptr)
#endif
namespace fbgemm {
+void * genericAlignedAlloc(size_t size, size_t alignment);
+void genericFree(void * ptr);
+
/**
* @brief Helper struct to type specialize for uint8 and int8 together.
*/
diff --git a/src/FbgemmI8Depthwise3DAvx2.cc b/src/FbgemmI8Depthwise3DAvx2.cc
index 2114b20..7e12678 100644
--- a/src/FbgemmI8Depthwise3DAvx2.cc
+++ b/src/FbgemmI8Depthwise3DAvx2.cc
@@ -490,7 +490,7 @@ static inline ALWAYS_INLINE void depthwise_3x3x3_pad_1_(
//int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64)));
int32_t* row_offsets
- = static_cast<int32_t*>(ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64));
+ = static_cast<int32_t*>(genericAlignedAlloc((K + 31) / 32 * 32 * sizeof(int32_t), 64));
int n_begin, n_end;
int t_begin, t_end, h_begin, h_end;
@@ -568,7 +568,7 @@ static inline ALWAYS_INLINE void depthwise_3x3x3_pad_1_(
} // h
} // t
} // for each n
- FREE(row_offsets);
+ genericFree(row_offsets);
};
template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE>
@@ -606,7 +606,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_(
//int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64)));
int32_t* row_offsets
- = static_cast<int32_t*>(ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64));
+ = static_cast<int32_t*>(genericAlignedAlloc((K + 31) / 32 * 32 * sizeof(int32_t), 64));
int n_begin, n_end;
int t_begin, t_end, h_begin, h_end;
@@ -684,7 +684,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_(
} // h
} // t
} // for each n
- FREE(row_offsets);
+ genericFree(row_offsets);
};
// Dispatch A_SYMMETRIC and B_SYMMETRIC
diff --git a/src/FbgemmI8DepthwiseAvx2.cc b/src/FbgemmI8DepthwiseAvx2.cc
index 994f206..8e875cc 100644
--- a/src/FbgemmI8DepthwiseAvx2.cc
+++ b/src/FbgemmI8DepthwiseAvx2.cc
@@ -344,7 +344,7 @@ static inline ALWAYS_INLINE void depthwise_3x3_pad_1_(
int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1;
const int8_t* Bp = B.PackedMat();
- int32_t* row_offsets = static_cast<int32_t *>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64));
+ int32_t* row_offsets = static_cast<int32_t *>(genericAlignedAlloc(((K + 31) / 32 * 32)*sizeof(int32_t), 64));
int n_begin, n_end;
int h_begin, h_end, w_begin, w_end;
@@ -655,7 +655,7 @@ static inline ALWAYS_INLINE void depthwise_3x3_pad_1_(
}
}
} // for each n
- FREE(row_offsets);
+ genericFree(row_offsets);
};
template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE>
@@ -687,7 +687,7 @@ depthwise_3x3_per_channel_quantization_pad_1_(
int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1;
const int8_t* Bp = B.PackedMat();
- int32_t* row_offsets = static_cast<int32_t*>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); // __attribute__((aligned(64)));
+ int32_t* row_offsets = static_cast<int32_t*>(genericAlignedAlloc(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); // __attribute__((aligned(64)));
int n_begin, n_end;
int h_begin, h_end, w_begin, w_end;
diff --git a/src/FbgemmI8Spmdm.cc b/src/FbgemmI8Spmdm.cc
index edcc4e8..1555a55 100644
--- a/src/FbgemmI8Spmdm.cc
+++ b/src/FbgemmI8Spmdm.cc
@@ -151,15 +151,15 @@ void CompressedSparseColumn::SpMDM(
t_start = std::chrono::high_resolution_clock::now();
#endif
- uint8_t* A_buffer = static_cast<uint8_t*>(ALIGNED_MALLOC(K * 32 * sizeof(uint8_t), 64));
- int32_t* C_buffer = static_cast<int32_t*>(ALIGNED_MALLOC(N * 32 * sizeof(int32_t), 64));
+ uint8_t* A_buffer = static_cast<uint8_t*>(genericAlignedAlloc(K * 32 * sizeof(uint8_t), 64));
+ int32_t* C_buffer = static_cast<int32_t*>(genericAlignedAlloc(N * 32 * sizeof(int32_t), 64));
// Take 32 rows at a time
int i_end = block.row_start + block.row_size;
for (int i1 = block.row_start; i1 < i_end; i1 += 32) {
// Transpose 32 x K submatrix of A
if (i_end - i1 < 32) {
- uint8_t* A_temp_buffer = static_cast<uint8_t*>(ALIGNED_MALLOC(K * 32 * sizeof(uint8_t), 64));
+ uint8_t* A_temp_buffer = static_cast<uint8_t*>(genericAlignedAlloc(K * 32 * sizeof(uint8_t), 64));
for (int i2 = 0; i2 < (i_end - i1) / 8 * 8; i2 += 8) {
transpose_8rows(K, A + (i1 + i2) * lda, lda, A_buffer + i2, 32);
}
@@ -175,7 +175,7 @@ void CompressedSparseColumn::SpMDM(
for (int i2 = (i_end - i1) / 8 * 8; i2 < 32; i2 += 8) {
transpose_8rows(K, A_temp_buffer + i2 * K, K, A_buffer + i2, 32);
}
- FREE(A_temp_buffer);
+ genericFree(A_temp_buffer);
} else {
for (int i2 = 0; i2 < 32; i2 += 8) {
transpose_8rows(K, A + (i1 + i2) * lda, lda, A_buffer + i2, 32);
@@ -254,8 +254,8 @@ void CompressedSparseColumn::SpMDM(
t_start = std::chrono::high_resolution_clock::now();
#endif
- FREE(A_buffer);
- FREE(C_buffer);
+ genericFree(A_buffer);
+ genericFree(C_buffer);
}
void CompressedSparseColumn::SparseConv(
diff --git a/src/PackDepthwiseConvMatrixAvx2.cc b/src/PackDepthwiseConvMatrixAvx2.cc
index a84c469..126b93c 100644
--- a/src/PackDepthwiseConvMatrixAvx2.cc
+++ b/src/PackDepthwiseConvMatrixAvx2.cc
@@ -36,7 +36,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
: K_(K), kernel_prod_(kernel_prod) {
// Transpose the input matrix to make packing faster.
int8_t* smat_transposed
- = static_cast<int8_t*>(ALIGNED_MALLOC(K * kernel_prod * sizeof(int8_t), 64));
+ = static_cast<int8_t*>(genericAlignedAlloc(K * kernel_prod * sizeof(int8_t), 64));
for (int i = 0; i < kernel_prod; ++i) {
for (int j = 0; j < K; ++j) {
@@ -101,7 +101,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
// (12, 8), (12, 9), (12, 10), zero, ..., (15, 8), (15, 9), (15, 10), zero
// (28, 8), (28, 9), (28, 10), zero, ..., (31, 8), (31, 9), (31, 10), zero
for (int k1 = 0; k1 < K; k1 += 32) {
- __m256i* b_v = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod * sizeof(__m256i), 64));
+ __m256i* b_v = static_cast<__m256i*>(genericAlignedAlloc(kernel_prod * sizeof(__m256i), 64));
int remainder = K - k1;
if (remainder < 32) {
__m256i mask_v = _mm256_loadu_si256(
@@ -118,7 +118,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
}
// Interleave 2 SIMD registers
- __m256i* b_interleaved_epi16 = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod_aligned * sizeof(__m256i), 64));
+ __m256i* b_interleaved_epi16 = static_cast<__m256i*>(genericAlignedAlloc(kernel_prod_aligned * sizeof(__m256i), 64));
__m256i zero_v = _mm256_setzero_si256();
for (int i = 0; i < kernel_prod_aligned / 2; ++i) {
if (2 * i + 1 >= kernel_prod) {
@@ -134,7 +134,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
}
// Interleave 4 SIMD registers
- __m256i* b_interleaved_epi32 = static_cast<__m256i*>(ALIGNED_MALLOC(kernel_prod_aligned * sizeof(__m256i), 64));
+ __m256i* b_interleaved_epi32 = static_cast<__m256i*>(genericAlignedAlloc(kernel_prod_aligned * sizeof(__m256i), 64));
for (int i = 0; i < kernel_prod_aligned / 4; ++i) {
b_interleaved_epi32[4 * i] = _mm256_unpacklo_epi16(
b_interleaved_epi16[4 * i], b_interleaved_epi16[4 * i + 2]);
@@ -156,11 +156,11 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
b_interleaved_epi32[i]);
}
- FREE(b_v);
- FREE(b_interleaved_epi16);
- FREE(b_interleaved_epi32);
+ genericFree(b_v);
+ genericFree(b_interleaved_epi16);
+ genericFree(b_interleaved_epi32);
}
- FREE(smat_transposed);
+ genericFree(smat_transposed);
}
int PackedDepthWiseConvMatrix::addr(int r, int c) {
diff --git a/src/Utils.cc b/src/Utils.cc
index 2e88561..07d1f59 100755
--- a/src/Utils.cc
+++ b/src/Utils.cc
@@ -17,6 +17,30 @@
namespace fbgemm {
+void * genericAlignedAlloc(size_t size, size_t align) {
+ void* aligned_mem = nullptr;
+ int ret;
+#ifdef _MSC_VER
+ aligned_mem = _aligned_malloc(size, align);
+ ret = 0;
+#else
+ ret = posix_memalign(&aligned_mem, align, size);
+#endif
+ // Throw std::bad_alloc in the case of memory allocation failure.
+ if (ret || aligned_mem == nullptr) {
+ throw std::bad_alloc();
+ }
+ return aligned_mem;
+}
+
+void genericFree(void * p) {
+#ifdef _MSC_VER
+ _aligned_free(p);
+#else
+ free(p);
+#endif
+}
+
/**
* @brief Compare the reference and test result matrix to check the correctness.
* @param ref The buffer for the reference result matrix.
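Note the behavioral difference this introduces: the removed macros expanded to aligned_alloc/_aligned_malloc, which return nullptr on failure, whereas genericAlignedAlloc throws std::bad_alloc on both paths. posix_memalign is used instead of aligned_alloc, presumably for portability to older macOS SDKs. A hypothetical caller-side sketch (not part of the patch) that handles allocation failure explicitly:

// Hypothetical usage sketch: a 64-byte aligned scratch buffer with
// explicit failure handling via std::bad_alloc.
#include <iostream>
#include <new>
#include "fbgemm/Utils.h"

int main() {
  try {
    void* scratch = fbgemm::genericAlignedAlloc(1024, 64);
    // ... use scratch ...
    fbgemm::genericFree(scratch);
  } catch (const std::bad_alloc&) {
    std::cerr << "aligned allocation failed" << std::endl;
    return 1;
  }
  return 0;
}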