diff options
author | Young Jin Kim <youki@microsoft.com> | 2019-09-25 19:43:01 +0300 |
---|---|---|
committer | Young Jin Kim <youki@microsoft.com> | 2019-09-25 19:43:01 +0300 |
commit | d02815ffedbc46a3f8af1a3884efefd83668a401 (patch) | |
tree | c48612db7a801b26b1a966b00fea3d35b17e23ce | |
parent | 7bd598c9e97871e42c19449fddf7bd317898eb58 (diff) |
Fix Windows build errors
-rw-r--r-- | include/fbgemm/Utils.h | 2 | ||||
-rw-r--r-- | src/FbgemmI8Depthwise3DAvx2.cc | 26 | ||||
-rw-r--r-- | src/FbgemmI8DepthwiseAvx2-inl.h | 15 | ||||
-rw-r--r-- | src/PackDepthwiseConvMatrixAvx2.cc | 28 |
4 files changed, 44 insertions, 27 deletions
diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h index 3976790..7cb86d4 100644 --- a/include/fbgemm/Utils.h +++ b/include/fbgemm/Utils.h @@ -16,7 +16,7 @@ # define ALIGNED_MALLOC(size, alignment) _aligned_malloc(size, alignment) # define FREE(ptr) _aligned_free(ptr) #else -# define ALWAYS_INLINE __attribute__((always_inline)) +# define ALWAYS_INLINE ALWAYS_INLINE # define ALIGNED_MALLOC(size, alignment) aligned_alloc(alignment, size) # define FREE(ptr) free(ptr) #endif diff --git a/src/FbgemmI8Depthwise3DAvx2.cc b/src/FbgemmI8Depthwise3DAvx2.cc index 925d265..2114b20 100644 --- a/src/FbgemmI8Depthwise3DAvx2.cc +++ b/src/FbgemmI8Depthwise3DAvx2.cc @@ -19,7 +19,7 @@ template < bool SUM_A, bool REMAINDER = false, bool PER_CHANNEL_QUANTIZATION = false> -static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_( +static inline ALWAYS_INLINE void inner_prod_3x3x3_packed_( int T, int H, int W, @@ -272,7 +272,7 @@ template < bool A_SYMMETRIC, bool B_SYMMETRIC, typename BIAS_TYPE> -static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_( +static inline ALWAYS_INLINE void depthwise_3x3x3_kernel_( int T, int H, int W, @@ -359,7 +359,7 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_( } template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE> -static inline __attribute__((always_inline)) void +static inline ALWAYS_INLINE void depthwise_3x3x3_per_channel_quantization_kernel_( int T, int H, @@ -457,7 +457,7 @@ template < bool A_SYMMETRIC, bool B_SYMMETRIC, typename BIAS_TYPE> -static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( +static inline ALWAYS_INLINE void depthwise_3x3x3_pad_1_( int N, int T, int H, @@ -488,7 +488,9 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + 
//int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* row_offsets + = static_cast<int32_t*>(ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64)); int n_begin, n_end; int t_begin, t_end, h_begin, h_end; @@ -566,10 +568,11 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( } // h } // t } // for each n + FREE(row_offsets); }; template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, typename BIAS_TYPE> -static inline __attribute__((always_inline)) void +static inline ALWAYS_INLINE void depthwise_3x3x3_per_channel_quantization_pad_1_( int N, int T, @@ -601,7 +604,9 @@ depthwise_3x3x3_per_channel_quantization_pad_1_( int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + //int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* row_offsets + = static_cast<int32_t*>(ALIGNED_MALLOC((K + 31) / 32 * 32 * sizeof(int32_t), 64)); int n_begin, n_end; int t_begin, t_end, h_begin, h_end; @@ -679,6 +684,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_( } // h } // t } // for each n + FREE(row_offsets); }; // Dispatch A_SYMMETRIC and B_SYMMETRIC @@ -704,7 +710,7 @@ static void depthwise_3x3x3_pad_1_( float act_times_w_scale, int thread_id, int num_threads) { - int32_t C_int32_temp[(K + 31) / 32 * 32]; + int32_t* C_int32_temp = new int32_t[(K + 31) / 32 * 32]; if (A_zero_point == 0 || col_offsets == nullptr) { if (B_zero_point == 0) { depthwise_3x3x3_pad_1_< @@ -822,6 +828,7 @@ static void depthwise_3x3x3_pad_1_( num_threads); } } + delete[] C_int32_temp; } // Dispatch HAS_BIAS @@ -1004,7 +1011,7 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( const float* act_times_w_scale, int thread_id, int num_threads) { - int32_t C_int32_temp[(K + 31) / 32 * 32]; + int32_t* C_int32_temp = new int32_t[(K + 31) / 32 * 32]; if (A_zero_point == 0 || col_offsets == nullptr) { 
depthwise_3x3x3_per_channel_quantization_pad_1_< FUSE_RELU, @@ -1060,6 +1067,7 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( thread_id, num_threads); } + delete[] C_int32_temp; } // Dispatch HAS_BIAS diff --git a/src/FbgemmI8DepthwiseAvx2-inl.h b/src/FbgemmI8DepthwiseAvx2-inl.h index 7ad39fc..aee9ab3 100644 --- a/src/FbgemmI8DepthwiseAvx2-inl.h +++ b/src/FbgemmI8DepthwiseAvx2-inl.h @@ -13,6 +13,7 @@ #include <type_traits> // for is_same #include <immintrin.h> +#include "fbgemm/Utils.h" namespace fbgemm { @@ -40,7 +41,7 @@ static int masks[8][8] = { // c2_v: c[8:12], c[24:28] // c3_v: c[12:16], c[28:32] template <bool SUM_A = false> -static inline __attribute__((always_inline)) void madd_epi16x4_packed( +static inline ALWAYS_INLINE void madd_epi16x4_packed( __m256i a0_v, __m256i a1_v, __m256i a2_v, @@ -99,7 +100,7 @@ static inline __attribute__((always_inline)) void madd_epi16x4_packed( // c2_v: c[8:12], c[24:28] // c3_v: c[12:16], c[28:32] template <bool SUM_A = false> -static inline __attribute__((always_inline)) void madd_epi16x3_packed( +static inline ALWAYS_INLINE void madd_epi16x3_packed( __m256i a0_v, __m256i a1_v, __m256i a2_v, @@ -159,7 +160,7 @@ static inline __attribute__((always_inline)) void madd_epi16x3_packed( // c2_v: c[16:20], c[20:24] // c3_v: c[24:28], c[28:32] template <bool SUM_A = false> -static inline __attribute__((always_inline)) void madd_epi16x2_packed( +static inline ALWAYS_INLINE void madd_epi16x2_packed( __m256i a0_v, __m256i a1_v, const __m256i* b, @@ -200,7 +201,7 @@ static inline __attribute__((always_inline)) void madd_epi16x2_packed( // c2_v: c[16:20], c[20:24] // c3_v: c[24:28], c[28:32] template <bool SUM_A = false> -static inline __attribute__((always_inline)) void madd_epi16_packed( +static inline ALWAYS_INLINE void madd_epi16_packed( __m256i a_v, const __m256i* b, __m256i* c0_v, @@ -235,7 +236,7 @@ static inline __attribute__((always_inline)) void madd_epi16_packed( // K is the number of accumulations we're 
doing template <int K, bool SUM_A = false, bool REMAINDER = false, bool ACC = false> -static inline __attribute__((always_inline)) void inner_prod_packed_( +static inline ALWAYS_INLINE void inner_prod_packed_( const __m256i* a_v, const __m256i* Bp, std::int32_t* C, @@ -383,7 +384,7 @@ template < bool A_SYMMETRIC, bool B_SYMMETRIC, typename BIAS_TYPE> -static inline __attribute__((always_inline)) void requantize_( +static inline ALWAYS_INLINE void requantize_( std::int32_t A_zero_point, const float* C_multiplier, std::int32_t C_zero_point, @@ -688,7 +689,7 @@ static inline __attribute__((always_inline)) void requantize_( } template <bool REMAINDER> -static inline __attribute__((always_inline)) __m256i load_a( +static inline ALWAYS_INLINE __m256i load_a( const std::uint8_t* A, __m256i mask_v) { if (REMAINDER) { diff --git a/src/PackDepthwiseConvMatrixAvx2.cc b/src/PackDepthwiseConvMatrixAvx2.cc index 0e17bcd..ab2e1f2 100644 --- a/src/PackDepthwiseConvMatrixAvx2.cc +++ b/src/PackDepthwiseConvMatrixAvx2.cc @@ -5,6 +5,8 @@ * LICENSE file in the root directory of this source tree. */ #include "fbgemm/FbgemmI8DepthwiseAvx2.h" +#include "fbgemm/Utils.h" +#include "fbgemm/Fbgemm.h" #include <immintrin.h> @@ -33,7 +35,9 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( const int8_t* smat) : K_(K), kernel_prod_(kernel_prod) { // Transpose the input matrix to make packing faster. 
- alignas(64) int8_t smat_transposed[K * kernel_prod]; + int8_t* smat_transposed + = static_cast<int8_t*>(ALIGNED_MALLOC(K * kernel_prod * sizeof(int8_t), 64)); + for (int i = 0; i < kernel_prod; ++i) { for (int j = 0; j < K; ++j) { smat_transposed[i * K + j] = smat[i + j * kernel_prod]; @@ -42,12 +46,11 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( // Allocate packed arrays int kernel_prod_aligned = (kernel_prod + 1) / 2 * 2; - // pmat_ = static_cast<int8_t *>(fbgemmAlignedAlloc( - // 64, ((K + 31) / 32) * KERNEL_PROD_ALIGNED * 32 * sizeof(int8_t))); - posix_memalign( - (void**)&pmat_, - 64, - ((K + 31) / 32) * kernel_prod_aligned * 32 * sizeof(int8_t)); + pmat_ = static_cast<int8_t *>(fbgemmAlignedAlloc(64, ((K + 31) / 32) * kernel_prod_aligned * 32 * sizeof(int8_t))); + //posix_memalign( + // (void**)&pmat_, + // 64, + // ((K + 31) / 32) * kernel_prod_aligned * 32 * sizeof(int8_t)); // Pack input matrix // The layout is optimized to use vpmaddubsw efficiently (see @@ -102,7 +105,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( // (12, 8), (12, 9), (12, 10), zero, ..., (15, 8), (15, 9), (15, 10), zero // (28, 8), (28, 9), (28, 10), zero, ..., (31, 8), (31, 9), (31, 10), zero for (int k1 = 0; k1 < K; k1 += 32) { - __m256i b_v[kernel_prod]; + __m256i* b_v = new __m256i[kernel_prod]; int remainder = K - k1; if (remainder < 32) { __m256i mask_v = _mm256_loadu_si256( @@ -119,7 +122,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( } // Interleave 2 SIMD registers - __m256i b_interleaved_epi16[kernel_prod_aligned]; + __m256i* b_interleaved_epi16 = new __m256i[kernel_prod_aligned]; __m256i zero_v = _mm256_setzero_si256(); for (int i = 0; i < kernel_prod_aligned / 2; ++i) { if (2 * i + 1 >= kernel_prod) { @@ -135,7 +138,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( } // Interleave 4 SIMD registers - __m256i b_interleaved_epi32[kernel_prod_aligned]; + __m256i* b_interleaved_epi32 = new __m256i[kernel_prod_aligned]; for (int i = 
0; i < kernel_prod_aligned / 4; ++i) { b_interleaved_epi32[4 * i] = _mm256_unpacklo_epi16( b_interleaved_epi16[4 * i], b_interleaved_epi16[4 * i + 2]); @@ -156,7 +159,12 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( &pmat_[((k1 / 32) * kernel_prod_aligned + i) * 32]), b_interleaved_epi32[i]); } + + delete[] b_v; + delete[] b_interleaved_epi16; + delete[] b_interleaved_epi32; } + FREE(smat_transposed); } int PackedDepthWiseConvMatrix::addr(int r, int c) { |