diff options
Diffstat (limited to 'src/FbgemmI8DepthwiseAvx2.cc')
-rw-r--r-- | src/FbgemmI8DepthwiseAvx2.cc | 80 |
1 file changed, 51 insertions(+), 29 deletions(-)
diff --git a/src/FbgemmI8DepthwiseAvx2.cc b/src/FbgemmI8DepthwiseAvx2.cc index ee39faf..f96d1d2 100644 --- a/src/FbgemmI8DepthwiseAvx2.cc +++ b/src/FbgemmI8DepthwiseAvx2.cc @@ -5,6 +5,7 @@ * LICENSE file in the root directory of this source tree. */ #include "fbgemm/FbgemmI8DepthwiseAvx2.h" +#include "fbgemm/Utils.h" #include <algorithm> // for min and max #include <cassert> @@ -36,7 +37,8 @@ PackedDepthWiseConvMatrix<KERNEL_PROD>::PackedDepthWiseConvMatrix( const int8_t* smat) : K_(K) { // Transpose the input matrix to make packing faster. - alignas(64) int8_t smat_transposed[K * KERNEL_PROD]; + int8_t* smat_transposed = static_cast<int8_t *>(ALIGNED_MALLOC( + K * KERNEL_PROD * sizeof(int8_t), 64)); for (int i = 0; i < KERNEL_PROD; ++i) { for (int j = 0; j < K; ++j) { smat_transposed[i * K + j] = smat[i + j * KERNEL_PROD]; @@ -45,12 +47,15 @@ PackedDepthWiseConvMatrix<KERNEL_PROD>::PackedDepthWiseConvMatrix( // Allocate packed arrays constexpr int KERNEL_PROD_ALIGNED = (KERNEL_PROD + 1) / 2 * 2; - // pmat_ = static_cast<int8_t *>(fbgemmAlignedAlloc( - // 64, ((K + 31) / 32) * KERNEL_PROD_ALIGNED * 32 * sizeof(int8_t))); +#ifdef _MSC_VER + pmat_ = static_cast<int8_t *>(_aligned_malloc( + ((K + 31) / 32) * KERNEL_PROD_ALIGNED * 32 * sizeof(int8_t), 64)); +#else posix_memalign( (void**)&pmat_, 64, ((K + 31) / 32) * KERNEL_PROD_ALIGNED * 32 * sizeof(int8_t)); +#endif // Pack input matrix // The layout is optimized to use vpmaddubsw efficiently (see @@ -160,11 +165,17 @@ PackedDepthWiseConvMatrix<KERNEL_PROD>::PackedDepthWiseConvMatrix( b_interleaved_epi32[i]); } } + + FREE(smat_transposed); } template <int KERNEL_PROD> PackedDepthWiseConvMatrix<KERNEL_PROD>::~PackedDepthWiseConvMatrix() { +#ifdef _MSC_VER + _aligned_free(pmat_); +#else free(pmat_); +#endif } template class PackedDepthWiseConvMatrix<3 * 3>; @@ -179,7 +190,7 @@ template class PackedDepthWiseConvMatrix<3 * 3 * 3>; // c2_v: c[8:12], c[24:28] // c3_v: c[12:16], c[28:32] template <bool SUM_A = false> 
-static inline __attribute__((always_inline)) void madd_epi16x4_packed( +static inline ALWAYS_INLINE void madd_epi16x4_packed( __m256i a0_v, __m256i a1_v, __m256i a2_v, @@ -238,7 +249,7 @@ static inline __attribute__((always_inline)) void madd_epi16x4_packed( // c2_v: c[8:12], c[24:28] // c3_v: c[12:16], c[28:32] template <bool SUM_A = false> -static inline __attribute__((always_inline)) void madd_epi16x3_packed( +static inline ALWAYS_INLINE void madd_epi16x3_packed( __m256i a0_v, __m256i a1_v, __m256i a2_v, @@ -298,7 +309,7 @@ static inline __attribute__((always_inline)) void madd_epi16x3_packed( // c2_v: c[16:20], c[20:24] // c3_v: c[24:28], c[28:32] template <bool SUM_A = false> -static inline __attribute__((always_inline)) void madd_epi16x2_packed( +static inline ALWAYS_INLINE void madd_epi16x2_packed( __m256i a0_v, __m256i a1_v, const __m256i* b, @@ -339,7 +350,7 @@ static inline __attribute__((always_inline)) void madd_epi16x2_packed( // c2_v: c[16:20], c[20:24] // c3_v: c[24:28], c[28:32] template <bool SUM_A = false> -static inline __attribute__((always_inline)) void madd_epi16_packed( +static inline ALWAYS_INLINE void madd_epi16_packed( __m256i a_v, const __m256i* b, __m256i* c0_v, @@ -374,7 +385,7 @@ static inline __attribute__((always_inline)) void madd_epi16_packed( // K is the number of accumulations we're doing template <int K, bool SUM_A = false, bool REMAINDER = false, bool ACC = false> -static inline __attribute__((always_inline)) void inner_prod_packed_( +static inline ALWAYS_INLINE void inner_prod_packed_( const __m256i* a_v, const __m256i* Bp, int32_t* C, @@ -514,7 +525,7 @@ static inline __attribute__((always_inline)) void inner_prod_packed_( } template <bool SUM_A = false, bool REMAINDER = false> -static inline __attribute__((always_inline)) void inner_prod_3x3_packed_( +static inline ALWAYS_INLINE void inner_prod_3x3_packed_( const __m256i* a_v, const __m256i* Bp, int32_t* C, @@ -531,7 +542,7 @@ template < bool PER_CHANNEL_QUANTIZATION, bool 
A_SYMMETRIC, bool B_SYMMETRIC> -static inline __attribute__((always_inline)) void requantize_( +static inline ALWAYS_INLINE void requantize_( int32_t A_zero_point, const float* C_multiplier, int32_t C_zero_point, @@ -745,7 +756,7 @@ static inline __attribute__((always_inline)) void requantize_( } template <bool REMAINDER> -static inline __attribute__((always_inline)) __m256i load_a( +static inline ALWAYS_INLINE __m256i load_a( const uint8_t* A, __m256i mask_v) { if (REMAINDER) { @@ -759,7 +770,7 @@ template < bool SUM_A, bool REMAINDER = false, bool PER_CHANNEL_QUANTIZATION = false> -static inline __attribute__((always_inline)) void inner_prod_3x3_packed_( +static inline ALWAYS_INLINE void inner_prod_3x3_packed_( int H, int W, int K, @@ -870,7 +881,7 @@ template < bool SUM_A, bool REMAINDER = false, bool PER_CHANNEL_QUANTIZATION = false> -static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_( +static inline ALWAYS_INLINE void inner_prod_3x3x3_packed_( int T, int H, int W, @@ -1118,7 +1129,7 @@ static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_( } template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC> -static inline __attribute__((always_inline)) void depthwise_3x3_kernel_( +static inline ALWAYS_INLINE void depthwise_3x3_kernel_( int H, int W, int K, @@ -1194,7 +1205,7 @@ static inline __attribute__((always_inline)) void depthwise_3x3_kernel_( } template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC> -static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_( +static inline ALWAYS_INLINE void depthwise_3x3x3_kernel_( int T, int H, int W, @@ -1279,7 +1290,7 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_( } template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC> -static inline __attribute__((always_inline)) void +static inline ALWAYS_INLINE void depthwise_3x3_per_channel_quantization_kernel_( int H, int W, @@ -1362,7 +1373,7 @@ 
depthwise_3x3_per_channel_quantization_kernel_( } template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC> -static inline __attribute__((always_inline)) void +static inline ALWAYS_INLINE void depthwise_3x3x3_per_channel_quantization_kernel_( int T, int H, @@ -1465,7 +1476,7 @@ static pair<int, int> closest_factors_(int n) { // filter shapes by parameterizing with R and S but restricting it to just 3x3 // for now. template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC> -static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( +static inline ALWAYS_INLINE void depthwise_3x3_pad_1_( int N, int H, int W, @@ -1491,7 +1502,7 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* row_offsets = static_cast<int32_t *>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); int n_begin, n_end; int h_begin, h_end, w_begin, w_end; @@ -1748,10 +1759,11 @@ static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( } } } // for each n + FREE(row_offsets); }; template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC, bool B_SYMMETRIC> -static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( +static inline ALWAYS_INLINE void depthwise_3x3x3_pad_1_( int N, int T, int H, @@ -1781,7 +1793,7 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* row_offsets = static_cast<int32_t*>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); // __attribute__((aligned(64))); int n_begin, n_end; int t_begin, t_end, h_begin, h_end; @@ -1858,10 +1870,12 @@ static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( } // h } // t } // 
for each n + + FREE(row_offsets); }; template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC> -static inline __attribute__((always_inline)) void +static inline ALWAYS_INLINE void depthwise_3x3_per_channel_quantization_pad_1_( int N, int H, @@ -1888,7 +1902,7 @@ depthwise_3x3_per_channel_quantization_pad_1_( int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* row_offsets = static_cast<int32_t*>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); // __attribute__((aligned(64))); int n_begin, n_end; int h_begin, h_end, w_begin, w_end; @@ -2172,10 +2186,12 @@ depthwise_3x3_per_channel_quantization_pad_1_( } } } // for each n + + FREE(row_offsets); }; template <bool FUSE_RELU, bool HAS_BIAS, bool A_SYMMETRIC> -static inline __attribute__((always_inline)) void +static inline ALWAYS_INLINE void depthwise_3x3x3_per_channel_quantization_pad_1_( int N, int T, @@ -2206,7 +2222,7 @@ depthwise_3x3x3_per_channel_quantization_pad_1_( int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* row_offsets = static_cast<int32_t*>(ALIGNED_MALLOC(((K + 31) / 32 * 32)*sizeof(int32_t), 64)); // __attribute__((aligned(64))); int n_begin, n_end; int t_begin, t_end, h_begin, h_end; @@ -2282,6 +2298,8 @@ depthwise_3x3x3_per_channel_quantization_pad_1_( } // h } // t } // for each n + + FREE(row_offsets); }; // Dispatch A_SYMMETRIC and B_SYMMETRIC @@ -2304,7 +2322,7 @@ static void depthwise_3x3_pad_1_( const int32_t* bias, int thread_id, int num_threads) { - int32_t C_int32_temp[(K + 31) / 32 * 32]; + int32_t* C_int32_temp = new int32_t[(K + 31) / 32 * 32]; if (A_zero_point == 0 || col_offsets == nullptr) { if (B_zero_point == 0) { depthwise_3x3_pad_1_< @@ -2406,6 +2424,7 @@ static void depthwise_3x3_pad_1_( num_threads); } } + delete[] C_int32_temp; } // 
Dispatch HAS_BIAS @@ -2709,7 +2728,7 @@ static void depthwise_3x3x3_pad_1_( const int32_t* bias, int thread_id, int num_threads) { - int32_t C_int32_temp[(K + 31) / 32 * 32]; + int32_t* C_int32_temp = new int32_t[(K + 31) / 32 * 32]; if (A_zero_point == 0 || col_offsets == nullptr) { if (B_zero_point == 0) { depthwise_3x3x3_pad_1_< @@ -2819,6 +2838,7 @@ static void depthwise_3x3x3_pad_1_( num_threads); } } + delete[] C_int32_temp; } // Dispatch HAS_BIAS @@ -2975,7 +2995,7 @@ static void depthwise_3x3_per_channel_quantization_pad_1_( const int32_t* bias, int thread_id, int num_threads) { - int32_t C_int32_temp[(K + 31) / 32 * 32]; + int32_t* C_int32_temp = new int32_t[(K + 31) / 32 * 32]; if (A_zero_point == 0 || col_offsets == nullptr) { depthwise_3x3_per_channel_quantization_pad_1_< FUSE_RELU, @@ -3023,6 +3043,7 @@ static void depthwise_3x3_per_channel_quantization_pad_1_( thread_id, num_threads); } + delete[] C_int32_temp; } // Dispatch HAS_BIAS @@ -3329,7 +3350,7 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( const int32_t* bias, int thread_id, int num_threads) { - int32_t C_int32_temp[(K + 31) / 32 * 32]; + int32_t* C_int32_temp = new int32_t[(K + 31) / 32 * 32]; if (A_zero_point == 0 || col_offsets == nullptr) { depthwise_3x3x3_per_channel_quantization_pad_1_< FUSE_RELU, @@ -3381,6 +3402,7 @@ static void depthwise_3x3x3_per_channel_quantization_pad_1_( thread_id, num_threads); } + delete[] C_int32_temp; } // Dispatch HAS_BIAS |