github.com/marian-nmt/FBGEMM.git
author    Young Jin Kim <youki@microsoft.com>  2019-09-25 19:43:01 +0300
committer Young Jin Kim <youki@microsoft.com>  2019-09-25 19:43:01 +0300
commit    d02815ffedbc46a3f8af1a3884efefd83668a401 (patch)
tree      c48612db7a801b26b1a966b00fea3d35b17e23ce
parent    7bd598c9e97871e42c19449fddf7bd317898eb58 (diff)
Fix windows build errors
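
MSVC does not accept the GCC-specific __attribute__((always_inline)) and has
no support for C99-style variable-length arrays, both of which broke the
Windows build. The diff below routes the inlining hint through the
ALWAYS_INLINE macro from fbgemm/Utils.h, replaces stack VLAs with heap
allocations (ALIGNED_MALLOC/FREE where 64-byte alignment is needed, plain
new[]/delete[] otherwise), and swaps posix_memalign, which Windows lacks,
for fbgemmAlignedAlloc.

A rough sketch of the VLA replacement applied throughout (buffer name and
usage are illustrative, not part of the patch):

    // Before: a VLA with GCC alignment syntax; MSVC rejects both features.
    //   int32_t buf[(K + 31) / 32 * 32] __attribute__((aligned(64)));
    // After: a 64-byte-aligned heap buffer with a matching release.
    int32_t* buf = static_cast<int32_t*>(
        ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64));
    // ... use buf as before ...
    FREE(buf);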
-rw-r--r-- include/fbgemm/Utils.h             |  2
-rw-r--r-- src/FbgemmI8Depthwise3DAvx2.cc     | 26
-rw-r--r-- src/FbgemmI8DepthwiseAvx2-inl.h    | 15
-rw-r--r-- src/PackDepthwiseConvMatrixAvx2.cc | 28
4 files changed, 44 insertions(+), 27 deletions(-)
diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h
index 3976790..7cb86d4 100644
--- a/include/fbgemm/Utils.h
+++ b/include/fbgemm/Utils.h
@@ -15,9 +15,9 @@
#ifdef _MSC_VER
-# define ALWAYS_INLINE __attribute__((always_inline))
+# define ALWAYS_INLINE
# define ALIGNED_MALLOC(size, alignment) _aligned_malloc(size, alignment)
# define FREE(ptr) _aligned_free(ptr)
#else
# define ALWAYS_INLINE __attribute__((always_inline))
# define ALIGNED_MALLOC(size, alignment) aligned_alloc(alignment, size)
# define FREE(ptr) free(ptr)
#endif
diff --git a/src/FbgemmI8Depthwise3DAvx2.cc b/src/FbgemmI8Depthwise3DAvx2.cc
index 925d265..2114b20 100644
--- a/src/FbgemmI8Depthwise3DAvx2.cc
+++ b/src/FbgemmI8Depthwise3DAvx2.cc
@@ -19,7 +19,7 @@ template <
bool SUM_A,
bool REMAINDER = false,
bool PER_CHANNEL_QUANTIZATION = false>
-static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_(
+static inline ALWAYS_INLINE void inner_prod_3x3x3_packed_(
int T,
int H,
int W,
@@ -272,7 +272,7 @@ template <
bool A_SYMMETRIC,
bool B_SYMMETRIC,
typename BIAS_TYPE>
-static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_(
+static inline ALWAYS_INLINE void depthwise_3x3x3_kernel_(
int T,
int H,
int W,
@@ -359,7 +359,7 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_(
}
template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE>
-static inline __attribute__((always_inline)) void
+static inline ALWAYS_INLINE void
depthwise_3x3x3_per_channel_quantization_kernel_(
int T,
int H,
@@ -457,7 +457,7 @@ template <
bool A_SYMMETRIC,
bool B_SYMMETRIC,
typename BIAS_TYPE>
-static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_(
+static inline ALWAYS_INLINE void depthwise_3x3x3_pad_1_(
int N,
int T,
int H,
@@ -488,7 +488,9 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_(
int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1;
const int8_t* Bp = B.PackedMat();
- int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64)));
+ //int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64)));
+ int32_t* row_offsets
+ = static_cast<int32_t*>(ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64));
int n_begin, n_end;
int t_begin, t_end, h_begin, h_end;
@@ -566,10 +568,11 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_(
} // h
} // t
} // for each n
+ FREE(row_offsets);
};
template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE>
-static inline __attribute__((always_inline)) void
+static inline ALWAYS_INLINE void
depthwise_3x3x3_per_channel_quantization_pad_1_(
int N,
int T,
@@ -601,7 +604,9 @@ depthwise_3x3x3_per_channel_quantization_pad_1_(
int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1;
const int8_t* Bp = B.PackedMat();
- int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64)));
+ //int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64)));
+ int32_t* row_offsets
+ = static_cast<int32_t*>(ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64));
int n_begin, n_end;
int t_begin, t_end, h_begin, h_end;
@@ -679,6 +684,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_(
} // h
} // t
} // for each n
+ FREE(row_offsets);
};
// Dispatch A_SYMMETRIC and B_SYMMETRIC
@@ -704,7 +710,7 @@ static void depthwise_3x3x3_pad_1_(
float act_times_w_scale,
int thread_id,
int num_threads) {
- int32_t C_int32_temp[(K + 31) / 32 * 32];
+ int32_t* C_int32_temp = new int32_t[(K + 31) / 32 * 32];
if (A_zero_point == 0 || col_offsets == nullptr) {
if (B_zero_point == 0) {
depthwise_3x3x3_pad_1_<
@@ -822,6 +828,7 @@ static void depthwise_3x3x3_pad_1_(
num_threads);
}
}
+ delete[] C_int32_temp;
}
// Dispatch HAS_BIAS
@@ -1004,7 +1011,7 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_(
const float* act_times_w_scale,
int thread_id,
int num_threads) {
- int32_t C_int32_temp[(K + 31) / 32 * 32];
+ int32_t* C_int32_temp = new int32_t[(K + 31) / 32 * 32];
if (A_zero_point == 0 || col_offsets == nullptr) {
depthwise_3x3x3_per_channel_quantization_pad_1_<
FUSE_RELU,
@@ -1060,6 +1067,7 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_(
thread_id,
num_threads);
}
+ delete[] C_int32_temp;
}
// Dispatch HAS_BIAS
diff --git a/src/FbgemmI8DepthwiseAvx2-inl.h b/src/FbgemmI8DepthwiseAvx2-inl.h
index 7ad39fc..aee9ab3 100644
--- a/src/FbgemmI8DepthwiseAvx2-inl.h
+++ b/src/FbgemmI8DepthwiseAvx2-inl.h
@@ -13,6 +13,7 @@
#include <type_traits> // for is_same
#include <immintrin.h>
+#include "fbgemm/Utils.h"
namespace fbgemm {
@@ -40,7 +41,7 @@ static int masks[8][8] = {
// c2_v: c[8:12], c[24:28]
// c3_v: c[12:16], c[28:32]
template <bool SUM_A = false>
-static inline __attribute__((always_inline)) void madd_epi16x4_packed(
+static inline ALWAYS_INLINE void madd_epi16x4_packed(
__m256i a0_v,
__m256i a1_v,
__m256i a2_v,
@@ -99,7 +100,7 @@ static inline __attribute__((always_inline)) void madd_epi16x4_packed(
// c2_v: c[8:12], c[24:28]
// c3_v: c[12:16], c[28:32]
template <bool SUM_A = false>
-static inline __attribute__((always_inline)) void madd_epi16x3_packed(
+static inline ALWAYS_INLINE void madd_epi16x3_packed(
__m256i a0_v,
__m256i a1_v,
__m256i a2_v,
@@ -159,7 +160,7 @@ static inline __attribute__((always_inline)) void madd_epi16x3_packed(
// c2_v: c[16:20], c[20:24]
// c3_v: c[24:28], c[28:32]
template <bool SUM_A = false>
-static inline __attribute__((always_inline)) void madd_epi16x2_packed(
+static inline ALWAYS_INLINE void madd_epi16x2_packed(
__m256i a0_v,
__m256i a1_v,
const __m256i* b,
@@ -200,7 +201,7 @@ static inline __attribute__((always_inline)) void madd_epi16x2_packed(
// c2_v: c[16:20], c[20:24]
// c3_v: c[24:28], c[28:32]
template <bool SUM_A = false>
-static inline __attribute__((always_inline)) void madd_epi16_packed(
+static inline ALWAYS_INLINE void madd_epi16_packed(
__m256i a_v,
const __m256i* b,
__m256i* c0_v,
@@ -235,7 +236,7 @@ static inline __attribute__((always_inline)) void madd_epi16_packed(
// K is the number of accumulations we're doing
template <int K, bool SUM_A = false, bool REMAINDER = false, bool ACC = false>
-static inline __attribute__((always_inline)) void inner_prod_packed_(
+static inline ALWAYS_INLINE void inner_prod_packed_(
const __m256i* a_v,
const __m256i* Bp,
std::int32_t* C,
@@ -383,7 +384,7 @@ template <
bool A_SYMMETRIC,
bool B_SYMMETRIC,
typename BIAS_TYPE>
-static inline __attribute__((always_inline)) void requantize_(
+static inline ALWAYS_INLINE void requantize_(
std::int32_t A_zero_point,
const float* C_multiplier,
std::int32_t C_zero_point,
@@ -688,7 +689,7 @@ static inline __attribute__((always_inline)) void requantize_(
}
template <bool REMAINDER>
-static inline __attribute__((always_inline)) __m256i load_a(
+static inline ALWAYS_INLINE __m256i load_a(
const std::uint8_t* A,
__m256i mask_v) {
if (REMAINDER) {
diff --git a/src/PackDepthwiseConvMatrixAvx2.cc b/src/PackDepthwiseConvMatrixAvx2.cc
index 0e17bcd..ab2e1f2 100644
--- a/src/PackDepthwiseConvMatrixAvx2.cc
+++ b/src/PackDepthwiseConvMatrixAvx2.cc
@@ -5,6 +5,8 @@
* LICENSE file in the root directory of this source tree.
*/
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"
+#include "fbgemm/Utils.h"
+#include "fbgemm/Fbgemm.h"
#include <immintrin.h>
@@ -33,7 +35,9 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
const int8_t* smat)
: K_(K), kernel_prod_(kernel_prod) {
// Transpose the input matrix to make packing faster.
- alignas(64) int8_t smat_transposed[K * kernel_prod];
+ int8_t* smat_transposed
+ = static_cast<int8_t*>(ALIGNED_MALLOC(K * kernel_prod * sizeof(int8_t), 64));
+
for (int i = 0; i < kernel_prod; ++i) {
for (int j = 0; j < K; ++j) {
smat_transposed[i * K + j] = smat[i + j * kernel_prod];
@@ -42,12 +46,11 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
// Allocate packed arrays
int kernel_prod_aligned = (kernel_prod + 1) / 2 * 2;
- // pmat_ = static_cast<int8_t *>(fbgemmAlignedAlloc(
- // 64, ((K + 31) / 32) * KERNEL_PROD_ALIGNED * 32 * sizeof(int8_t)));
- posix_memalign(
- (void**)&pmat_,
- 64,
- ((K + 31) / 32) * kernel_prod_aligned * 32 * sizeof(int8_t));
+ pmat_ = static_cast<int8_t *>(fbgemmAlignedAlloc(64, ((K + 31) / 32) * kernel_prod_aligned * 32 * sizeof(int8_t)));
+ //posix_memalign(
+ // (void**)&pmat_,
+ // 64,
+ // ((K + 31) / 32) * kernel_prod_aligned * 32 * sizeof(int8_t));
// Pack input matrix
// The layout is optimized to use vpmaddubsw efficiently (see
@@ -102,7 +105,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
// (12, 8), (12, 9), (12, 10), zero, ..., (15, 8), (15, 9), (15, 10), zero
// (28, 8), (28, 9), (28, 10), zero, ..., (31, 8), (31, 9), (31, 10), zero
for (int k1 = 0; k1 < K; k1 += 32) {
- __m256i b_v[kernel_prod];
+ __m256i* b_v = new __m256i[kernel_prod];
int remainder = K - k1;
if (remainder < 32) {
__m256i mask_v = _mm256_loadu_si256(
@@ -119,7 +122,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
}
// Interleave 2 SIMD registers
- __m256i b_interleaved_epi16[kernel_prod_aligned];
+ __m256i* b_interleaved_epi16 = new __m256i[kernel_prod_aligned];
__m256i zero_v = _mm256_setzero_si256();
for (int i = 0; i < kernel_prod_aligned / 2; ++i) {
if (2 * i + 1 >= kernel_prod) {
@@ -135,7 +138,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
}
// Interleave 4 SIMD registers
- __m256i b_interleaved_epi32[kernel_prod_aligned];
+ __m256i* b_interleaved_epi32 = new __m256i[kernel_prod_aligned];
for (int i = 0; i < kernel_prod_aligned / 4; ++i) {
b_interleaved_epi32[4 * i] = _mm256_unpacklo_epi16(
b_interleaved_epi16[4 * i], b_interleaved_epi16[4 * i + 2]);
@@ -156,7 +159,12 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix(
&pmat_[((k1 / 32) * kernel_prod_aligned + i) * 32]),
b_interleaved_epi32[i]);
}
+
+ delete[] b_v;
+ delete[] b_interleaved_epi16;
+ delete[] b_interleaved_epi32;
}
+ FREE(smat_transposed);
}
int PackedDepthWiseConvMatrix::addr(int r, int c) {