From 690dbc29d9b0cb373fa0303b7c30c20b527e9605 Mon Sep 17 00:00:00 2001 From: dskhudia Date: Sun, 4 Nov 2018 08:22:37 -0800 Subject: Syncing with internal version. Fixes for Mac/clang build. Other minor fixes --- CMakeLists.txt | 1 + include/fbgemm/Fbgemm.h | 29 +- include/fbgemm/FbgemmFP16.h | 129 +- src/FbgemmFP16.cc | 10 +- src/FbgemmI8Depthwise.cc | 1893 +++++++++++++++++++++--------- src/GenerateKernelU8S8S32ACC16.cc | 26 +- src/GenerateKernelU8S8S32ACC16_avx512.cc | 28 +- src/GenerateKernelU8S8S32ACC32.cc | 27 +- src/GenerateKernelU8S8S32ACC32_avx512.cc | 29 +- src/PackAMatrix.cc | 6 +- src/PackAWithIm2Col.cc | 5 +- src/PackBMatrix.cc | 4 +- src/PackWithQuantRowOffset.cc | 46 +- src/PackWithRowOffset.cc | 33 +- src/RefImplementations.cc | 3 +- 15 files changed, 1590 insertions(+), 679 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8a477d6..ad8ffd9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,6 +93,7 @@ if(NOT TARGET asmjit) #build asmjit set(ASMJIT_STATIC ON) add_subdirectory("${ASMJIT_SRC_DIR}" "${FBGEMM_BINARY_DIR}/asmjit") + set_property(TARGET asmjit PROPERTY POSITION_INDEPENDENT_CODE ON) endif() if(NOT TARGET cpuinfo) diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index 988e24b..2f9ddc7 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -236,7 +236,7 @@ class PackMatrix { return last_bcol_ != blockColSize(); } - ~PackMatrix() { + virtual ~PackMatrix() { if (bufAllocatedHere_) { free(buf_); } @@ -286,7 +286,7 @@ class PackMatrix { * accumulation type is int32. */ template -class PackAMatrix : public PackMatrix, T, accT> { +class PackAMatrix final : public PackMatrix, T, accT> { public: using This = PackAMatrix; using BaseType = PackMatrix; @@ -306,7 +306,7 @@ class PackAMatrix : public PackMatrix, T, accT> { std::int32_t ld, inpType* pmat = nullptr, std::int32_t groups = 1, - accT zero_pt = 0); + std::int32_t zero_pt = 0); /** * Activation matrices are not constant so cannot amortize the cost of @@ -361,7 +361,7 @@ class PackAMatrix : public PackMatrix, T, accT> { * type is int32. */ template -class PackBMatrix : public PackMatrix, T, accT> { +class PackBMatrix final : public PackMatrix, T, accT> { public: using This = PackBMatrix; using BaseType = PackMatrix; @@ -381,7 +381,7 @@ class PackBMatrix : public PackMatrix, T, accT> { std::int32_t ld, inpType* pmat = nullptr, std::int32_t groups = 1, - accT zero_pt = 0); + std::int32_t zero_pt = 0); /** * Weight matrices are usually constant so worth pre-packing. @@ -439,7 +439,8 @@ class PackBMatrix : public PackMatrix, T, accT> { * quantized. */ template -class PackAWithIm2Col : public PackMatrix, T, accT> { +class PackAWithIm2Col final + : public PackMatrix, T, accT> { public: using This = PackAWithIm2Col; using BaseType = PackMatrix; @@ -499,7 +500,7 @@ class PackAWithIm2Col : public PackMatrix, T, accT> { * The source matrix is already quantized. */ template -class PackAWithRowOffset +class PackAWithRowOffset final : public PackMatrix, T, accT> { public: using This = PackAWithRowOffset; @@ -572,7 +573,7 @@ class PackAWithRowOffset * The source matrix is in fp32 and quantized during packing. 
*/ template -class PackAWithQuantRowOffset +class PackAWithQuantRowOffset final : public PackMatrix, T, accT> { public: using This = PackAWithQuantRowOffset; @@ -935,7 +936,6 @@ void fbgemmPacked( /** * @brief Perform depthwise separable convolution */ - template < typename packingAMatrix, typename packingBMatrix, @@ -949,4 +949,15 @@ void convDepthwiseSeparable( outT* out, const processOutputType& output); +/** + * @brief Allocate __size bytes of uninitialized storage whose alignment is + * specified by __align. + */ +static void* fbgemmAlignedAlloc(size_t __align, size_t __size) { + void* aligned_mem; + if (posix_memalign(&aligned_mem, __align, __size)) + return 0; + return aligned_mem; +} + } // namespace fbgemm2 diff --git a/include/fbgemm/FbgemmFP16.h b/include/fbgemm/FbgemmFP16.h index 55718d4..0428d93 100644 --- a/include/fbgemm/FbgemmFP16.h +++ b/include/fbgemm/FbgemmFP16.h @@ -10,6 +10,7 @@ // upgraded to match with new fbgemm interface. #include +#include #include #include @@ -22,7 +23,7 @@ namespace fbgemm2 { /// row-major format into /// internal packed blocked-row major format class PackedGemmMatrixFP16 { -public: + public: // takes smat input mamtrix in row-major format; // and packs it into gemm-friendly blocked format; // allocate space and sets up all the internal variables; @@ -32,30 +33,31 @@ public: // before flushing into fp32 // the smaller the brow_, the higher overhead // of flushing is - PackedGemmMatrixFP16(const matrix_op_t trans, const int nrow, - const int ncol, const float alpha, - const float *smat, - const int brow = 512) + PackedGemmMatrixFP16( + const matrix_op_t trans, + const int nrow, + const int ncol, + const float alpha, + const float* smat, + const int brow = 512) : nrow_(nrow), ncol_(ncol), brow_(brow) { - bcol_ = 8 * 1; // hardwired // set up internal packing parameters nbrow_ = ((numRows() % blockRowSize()) == 0) - ? (numRows() / blockRowSize()) - : ((numRows() + blockRowSize()) / blockRowSize()); + ? (numRows() / blockRowSize()) + : ((numRows() + blockRowSize()) / blockRowSize()); last_brow_ = ((nrow % blockRowSize()) == 0) ? blockRowSize() - : (nrow % blockRowSize()); + : (nrow % blockRowSize()); nbcol_ = ((numCols() % blockColSize()) == 0) - ? (numCols() / blockColSize()) - : ((numCols() + blockColSize()) / blockColSize()); + ? 
(numCols() / blockColSize()) + : ((numCols() + blockColSize()) / blockColSize()); if (numCols() != blockColSize() * nbcol_) { #ifdef VLOG - VLOG(0) - << "Packer warning: ncol(" << numCols() - << ") is not a multiple of internal block size (" << blockColSize() - << ")"; + VLOG(0) << "Packer warning: ncol(" << numCols() + << ") is not a multiple of internal block size (" + << blockColSize() << ")"; VLOG(0) << "lefover is currently done via MKL: hence overhead will inccur"; #endif @@ -64,7 +66,9 @@ public: // allocate and initialize packed memory const int padding = 1024; // required by sw pipelined kernels size_ = (blockRowSize() * nbrow_) * (blockColSize() * nbcol_); - pmat_ = (float16 *)aligned_alloc(64, matSize() * sizeof(float16) + padding); + // pmat_ = (float16 *)aligned_alloc(64, matSize() * sizeof(float16) + + // padding); + posix_memalign((void**)&pmat_, 64, matSize() * sizeof(float16) + padding); for (auto i = 0; i < matSize(); i++) { pmat_[i] = tconv(0.f, pmat_[i]); } @@ -77,7 +81,7 @@ public: free(pmat_); } -// protected: + // protected: // blocked row-major format address arithmetic uint64_t addr(const int r_, const int c_) const { uint64_t r = (uint64_t)r_; @@ -87,10 +91,9 @@ public: brow_offset = (block_row_id * nbcol_) * (blockRowSize() * blockColSize()); uint64_t block_col_id = c / blockColSize(), - bcol_offset = - block_col_id * ((block_row_id != nbrow_ - 1) - ? (blockRowSize() * blockColSize()) - : (last_brow_ * blockColSize())); + bcol_offset = block_col_id * + ((block_row_id != nbrow_ - 1) ? (blockRowSize() * blockColSize()) + : (last_brow_ * blockColSize())); uint64_t block_offset = brow_offset + bcol_offset; uint64_t inblock_offset = r % blockRowSize() * blockColSize() + c % blockColSize(); @@ -100,22 +103,22 @@ public: return index; } - void packFromSrc(const matrix_op_t trans, const float alpha, - const float *smat) { + void + packFromSrc(const matrix_op_t trans, const float alpha, const float* smat) { bool tr = (trans == matrix_op_t::Transpose); // pack for (int i = 0; i < numRows(); i++) { for (int j = 0; j < numCols(); j++) { pmat_[addr(i, j)] = tconv( - alpha * ( - (tr == false) - ? smat[i * numCols() + j] : smat[i + numRows() * j]), + alpha * + ((tr == false) ? 
smat[i * numCols() + j] + : smat[i + numRows() * j]), pmat_[addr(i, j)]); } } } - const float16 &operator()(const int r, const int c) const { + const float16& operator()(const int r, const int c) const { uint64_t a = addr(r, c); assert(r < numRows()); assert(c < numCols()); @@ -123,38 +126,60 @@ public: return pmat_[a]; } - int matSize() const { return size_; } - int numRows() const { return nrow_; } - int numCols() const { return ncol_; } - inline int blockRowSize() const { return brow_; } - inline int blockColSize() const { return bcol_; } + int matSize() const { + return size_; + } + int numRows() const { + return nrow_; + } + int numCols() const { + return ncol_; + } + inline int blockRowSize() const { + return brow_; + } + inline int blockColSize() const { + return bcol_; + } int nrow_, ncol_; int brow_, last_brow_, bcol_; int nbrow_, nbcol_; uint64_t size_; - float16 *pmat_; - - friend void cblas_gemm_compute(const matrix_op_t transa, const int m, - const float *A, - const PackedGemmMatrixFP16 &Bp, - const float beta, float *C); - friend void cblas_gemm_compute(const matrix_op_t transa, const int m, - const float *A, - const PackedGemmMatrixFP16 &Bp, - const float beta, float *C); + float16* pmat_; + + friend void cblas_gemm_compute( + const matrix_op_t transa, + const int m, + const float* A, + const PackedGemmMatrixFP16& Bp, + const float beta, + float* C); + friend void cblas_gemm_compute( + const matrix_op_t transa, + const int m, + const float* A, + const PackedGemmMatrixFP16& Bp, + const float beta, + float* C); }; /** * restrictions: transa == CblasNoTrans */ -extern void cblas_gemm_compute(const matrix_op_t transa, const int m, - const float *A, - const PackedGemmMatrixFP16 &Bp, - const float beta, float *C); -extern void cblas_gemm_compute(const matrix_op_t transa, const int m, - const float *A, - const PackedGemmMatrixFP16 &Bp, - const float beta, float *C); - -}; // namespace fbgemm +extern void cblas_gemm_compute( + const matrix_op_t transa, + const int m, + const float* A, + const PackedGemmMatrixFP16& Bp, + const float beta, + float* C); +extern void cblas_gemm_compute( + const matrix_op_t transa, + const int m, + const float* A, + const PackedGemmMatrixFP16& Bp, + const float beta, + float* C); + +}; // namespace fbgemm2 diff --git a/src/FbgemmFP16.cc b/src/FbgemmFP16.cc index 7bbfa54..eff173f 100644 --- a/src/FbgemmFP16.cc +++ b/src/FbgemmFP16.cc @@ -7,6 +7,8 @@ #include "fbgemm/FbgemmFP16.h" #include +#include +#include #include "FbgemmFP16UKernels.h" @@ -44,7 +46,7 @@ struct KernelInfo { // autotuned kernel splits for various cases m = 1:mb_max // may need re-autotuning for new uarch - static constexpr array, 2>, 121 > partition = { + static constexpr array, 2>, 121 > partition = { { {{ { 0, 0 }, { 0, 0 } } }, {{ { 1, 1 }, { 0, 0 } } }, @@ -171,7 +173,7 @@ struct KernelInfo { }; }; constexpr array KernelInfo::kernel; -constexpr array, 2>, 121 > KernelInfo::partition; +constexpr array, 2>, 121 > KernelInfo::partition; // autotuned kernel splits for various cases m = 1:mb_max void @@ -220,8 +222,8 @@ cblas_gemm_compute(const matrix_op_t transa, const int m, const float *A, auto m1 = 0; for (auto c = 0; c < 2; c++) { - auto kernel_nrows = KernelInfo::partition[mb][c].first; - auto nkernel_nrows = KernelInfo::partition[mb][c].second; + auto kernel_nrows = KernelInfo::partition[mb][c][0]; + auto nkernel_nrows = KernelInfo::partition[mb][c][1]; auto m_start = m1, m_end = m1 + kernel_nrows * nkernel_nrows; for (auto m2 = m_start; m2 < m_end; m2 += kernel_nrows) { diff 
--git a/src/FbgemmI8Depthwise.cc b/src/FbgemmI8Depthwise.cc index 54e2272..551e98e 100644 --- a/src/FbgemmI8Depthwise.cc +++ b/src/FbgemmI8Depthwise.cc @@ -18,8 +18,7 @@ using namespace std; -namespace fbgemm2 -{ +namespace fbgemm2 { static array, 8> masks = {{ { 0, 0, 0, 0, 0, 0, 0, 0, }, @@ -34,7 +33,8 @@ static array, 8> masks = {{ template PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( - int K, const int8_t *smat) + int K, + const int8_t* smat) : K_(K) { // Transpose the input matrix to make packing faster. vector smat_transposed(K * KERNEL_PROD); @@ -46,8 +46,12 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( // Allocate packed arrays constexpr int KERNEL_PROD_ALIGNED = (KERNEL_PROD + 1) / 2 * 2; - pmat_ = static_cast(aligned_alloc( - 64, ((K + 31) / 32) * KERNEL_PROD_ALIGNED * 32 * sizeof(int8_t))); + // pmat_ = static_cast(fbgemmAlignedAlloc( + // 64, ((K + 31) / 32) * KERNEL_PROD_ALIGNED * 32 * sizeof(int8_t))); + posix_memalign( + (void**)&pmat_, + 64, + ((K + 31) / 32) * KERNEL_PROD_ALIGNED * 32 * sizeof(int8_t)); // Pack input matrix // The layout is optimized to use vpmaddubsw efficiently (see @@ -106,15 +110,15 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( int remainder = K - k1; if (remainder < 32) { __m256i mask_v = _mm256_loadu_si256( - reinterpret_cast(masks[remainder / 4].data())); + reinterpret_cast(masks[remainder / 4].data())); for (int i = 0; i < KERNEL_PROD; ++i) { b_v[i] = _mm256_maskload_epi32( - reinterpret_cast(smat_transposed.data() + i * K + k1), + reinterpret_cast(smat_transposed.data() + i * K + k1), mask_v); } } else { for (int i = 0; i < KERNEL_PROD; ++i) { - b_v[i] = _mm256_lddqu_si256(reinterpret_cast( + b_v[i] = _mm256_lddqu_si256(reinterpret_cast( smat_transposed.data() + i * K + k1)); } } @@ -153,7 +157,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( for (int i = 0; i < KERNEL_PROD_ALIGNED; ++i) { _mm256_storeu_si256( - reinterpret_cast<__m256i *>( + reinterpret_cast<__m256i*>( &pmat_[((k1 / 32) * KERNEL_PROD_ALIGNED + i) * 32]), b_interleaved_epi32[i]); } @@ -161,8 +165,7 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( } template -PackedDepthWiseConvMatrix::~PackedDepthWiseConvMatrix() -{ +PackedDepthWiseConvMatrix::~PackedDepthWiseConvMatrix() { free(pmat_); } @@ -178,11 +181,16 @@ template class PackedDepthWiseConvMatrix<3 * 3 * 3>; // c2_v: c[8:12], c[24:28] // c3_v: c[12:16], c[28:32] template -static inline __attribute__((always_inline)) -void madd_epi16x4_packed( - __m256i a0_v, __m256i a1_v, __m256i a2_v, __m256i a3_v, +static inline __attribute__((always_inline)) void madd_epi16x4_packed( + __m256i a0_v, + __m256i a1_v, + __m256i a2_v, + __m256i a3_v, const __m256i* b, - __m256i* c0_v, __m256i* c1_v, __m256i* c2_v, __m256i* c3_v, + __m256i* c0_v, + __m256i* c1_v, + __m256i* c2_v, + __m256i* c3_v, __m256i* a_sum = nullptr) { __m256i a01_lo_v = _mm256_unpacklo_epi8(a0_v, a1_v); __m256i a01_hi_v = _mm256_unpackhi_epi8(a0_v, a1_v); @@ -232,11 +240,15 @@ void madd_epi16x4_packed( // c2_v: c[8:12], c[24:28] // c3_v: c[12:16], c[28:32] template -static inline __attribute__((always_inline)) -void madd_epi16x3_packed( - __m256i a0_v, __m256i a1_v, __m256i a2_v, +static inline __attribute__((always_inline)) void madd_epi16x3_packed( + __m256i a0_v, + __m256i a1_v, + __m256i a2_v, const __m256i* b, - __m256i* c0_v, __m256i* c1_v, __m256i* c2_v, __m256i* c3_v, + __m256i* c0_v, + __m256i* c1_v, + __m256i* c2_v, + __m256i* c3_v, __m256i* a_sum = nullptr) { __m256i zero_v = _mm256_setzero_si256(); @@ -288,10 
+300,15 @@ void madd_epi16x3_packed( // c2_v: c[16:20], c[20:24] // c3_v: c[24:28], c[28:32] template -static inline __attribute__((always_inline)) void -madd_epi16x2_packed(__m256i a0_v, __m256i a1_v, const __m256i *b, __m256i *c0_v, - __m256i *c1_v, __m256i *c2_v, __m256i *c3_v, - __m256i *a_sum = nullptr) { +static inline __attribute__((always_inline)) void madd_epi16x2_packed( + __m256i a0_v, + __m256i a1_v, + const __m256i* b, + __m256i* c0_v, + __m256i* c1_v, + __m256i* c2_v, + __m256i* c3_v, + __m256i* a_sum = nullptr) { __m256i a_lo_v = _mm256_unpacklo_epi8(a0_v, a1_v); __m256i a_hi_v = _mm256_unpackhi_epi8(a0_v, a1_v); @@ -324,9 +341,14 @@ madd_epi16x2_packed(__m256i a0_v, __m256i a1_v, const __m256i *b, __m256i *c0_v, // c2_v: c[16:20], c[20:24] // c3_v: c[24:28], c[28:32] template -static inline __attribute__((always_inline)) void -madd_epi16_packed(__m256i a_v, const __m256i *b, __m256i *c0_v, __m256i *c1_v, - __m256i *c2_v, __m256i *c3_v, __m256i *a_sum = nullptr) { +static inline __attribute__((always_inline)) void madd_epi16_packed( + __m256i a_v, + const __m256i* b, + __m256i* c0_v, + __m256i* c1_v, + __m256i* c2_v, + __m256i* c3_v, + __m256i* a_sum = nullptr) { __m256i zero_v = _mm256_setzero_si256(); __m256i a_lo_v = _mm256_unpacklo_epi8(a_v, zero_v); @@ -354,21 +376,41 @@ madd_epi16_packed(__m256i a_v, const __m256i *b, __m256i *c0_v, __m256i *c1_v, // K is the number of accumulations we're doing template -static inline __attribute__((always_inline)) void -inner_prod_packed_(const __m256i *a_v, const __m256i *Bp, int32_t *C, - int remainder, __m256i *a_sum = nullptr) { +static inline __attribute__((always_inline)) void inner_prod_packed_( + const __m256i* a_v, + const __m256i* Bp, + int32_t* C, + int remainder, + __m256i* a_sum = nullptr) { array<__m256i, 4> c, c_temp; array<__m256i, 2> a_sum_temp{}; int k = 0; if (K >= 4) { - madd_epi16x4_packed(a_v[0], a_v[1], a_v[2], a_v[3], Bp, - &c[0], &c[1], &c[2], &c[3], a_sum_temp.data()); + madd_epi16x4_packed( + a_v[0], + a_v[1], + a_v[2], + a_v[3], + Bp, + &c[0], + &c[1], + &c[2], + &c[3], + a_sum_temp.data()); for (k = 4; k < K / 4 * 4; k += 4) { - madd_epi16x4_packed(a_v[k + 0], a_v[k + 1], a_v[k + 2], a_v[k + 3], - Bp + k, &c_temp[0], &c_temp[1], &c_temp[2], - &c_temp[3], a_sum_temp.data()); + madd_epi16x4_packed( + a_v[k + 0], + a_v[k + 1], + a_v[k + 2], + a_v[k + 3], + Bp + k, + &c_temp[0], + &c_temp[1], + &c_temp[2], + &c_temp[3], + a_sum_temp.data()); c[0] = _mm256_add_epi32(c[0], c_temp[0]); c[1] = _mm256_add_epi32(c[1], c_temp[1]); @@ -383,9 +425,16 @@ inner_prod_packed_(const __m256i *a_v, const __m256i *Bp, int32_t *C, } if (K - k == 3) { - madd_epi16x3_packed(a_v[k], a_v[k + 1], a_v[k + 2], Bp + k, - &c_temp[0], &c_temp[1], &c_temp[2], &c_temp[3], - a_sum_temp.data()); + madd_epi16x3_packed( + a_v[k], + a_v[k + 1], + a_v[k + 2], + Bp + k, + &c_temp[0], + &c_temp[1], + &c_temp[2], + &c_temp[3], + a_sum_temp.data()); c[0] = _mm256_add_epi32(c[0], c_temp[0]); c[1] = _mm256_add_epi32(c[1], c_temp[1]); @@ -405,11 +454,18 @@ inner_prod_packed_(const __m256i *a_v, const __m256i *Bp, int32_t *C, c[3] = c_temp[3]; } else { if (K - k == 1) { - madd_epi16_packed(a_v[k], Bp + k, &c[0], &c[1], &c[2], &c[3], - a_sum_temp.data()); + madd_epi16_packed( + a_v[k], Bp + k, &c[0], &c[1], &c[2], &c[3], a_sum_temp.data()); } else if (K - k == 2) { - madd_epi16x2_packed(a_v[k], a_v[k + 1], Bp + k, &c[0], &c[1], - &c[2], &c[3], a_sum_temp.data()); + madd_epi16x2_packed( + a_v[k], + a_v[k + 1], + Bp + k, + &c[0], + &c[1], + &c[2], + &c[3], 
+ a_sum_temp.data()); } c[0] = _mm256_add_epi32(c[0], c_temp[0]); @@ -422,37 +478,37 @@ inner_prod_packed_(const __m256i *a_v, const __m256i *Bp, int32_t *C, for (int r = 0; r < remainder / 8; ++r) { if (ACC) { _mm256_storeu_si256( - reinterpret_cast<__m256i *>(C + r * 8), + reinterpret_cast<__m256i*>(C + r * 8), _mm256_add_epi32( - _mm256_loadu_si256(reinterpret_cast<__m256i *>(C + r * 8)), + _mm256_loadu_si256(reinterpret_cast<__m256i*>(C + r * 8)), c[r])); } else { - _mm256_storeu_si256(reinterpret_cast<__m256i *>(C + r * 8), c[r]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(C + r * 8), c[r]); } } } else { if (ACC) { _mm256_storeu_si256( - reinterpret_cast<__m256i *>(C), - _mm256_add_epi32(_mm256_loadu_si256(reinterpret_cast<__m256i *>(C)), - c[0])); + reinterpret_cast<__m256i*>(C), + _mm256_add_epi32( + _mm256_loadu_si256(reinterpret_cast<__m256i*>(C)), c[0])); _mm256_storeu_si256( - reinterpret_cast<__m256i *>(C + 8), + reinterpret_cast<__m256i*>(C + 8), _mm256_add_epi32( - _mm256_loadu_si256(reinterpret_cast<__m256i *>(C + 8)), c[1])); + _mm256_loadu_si256(reinterpret_cast<__m256i*>(C + 8)), c[1])); _mm256_storeu_si256( - reinterpret_cast<__m256i *>(C + 16), + reinterpret_cast<__m256i*>(C + 16), _mm256_add_epi32( - _mm256_loadu_si256(reinterpret_cast<__m256i *>(C + 16)), c[2])); + _mm256_loadu_si256(reinterpret_cast<__m256i*>(C + 16)), c[2])); _mm256_storeu_si256( - reinterpret_cast<__m256i *>(C + 24), + reinterpret_cast<__m256i*>(C + 24), _mm256_add_epi32( - _mm256_loadu_si256(reinterpret_cast<__m256i *>(C + 24)), c[3])); + _mm256_loadu_si256(reinterpret_cast<__m256i*>(C + 24)), c[3])); } else { - _mm256_storeu_si256(reinterpret_cast<__m256i *>(C), c[0]); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(C + 8), c[1]); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(C + 16), c[2]); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(C + 24), c[3]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(C), c[0]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(C + 8), c[1]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(C + 16), c[2]); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(C + 24), c[3]); } } @@ -467,14 +523,13 @@ inner_prod_packed_(const __m256i *a_v, const __m256i *Bp, int32_t *C, } template -static inline __attribute__((always_inline)) -void inner_prod_3x3_packed_(const __m256i* a_v, - const __m256i* Bp, - int32_t* C, - int remainder, - __m256i* a_sum = nullptr) { - return inner_prod_packed_<9, SUM_A, REMAINDER>(a_v, Bp, C, remainder, - a_sum); +static inline __attribute__((always_inline)) void inner_prod_3x3_packed_( + const __m256i* a_v, + const __m256i* Bp, + int32_t* C, + int remainder, + __m256i* a_sum = nullptr) { + return inner_prod_packed_<9, SUM_A, REMAINDER>(a_v, Bp, C, remainder, a_sum); } // Almost same as ReQuantizeOutput in OutputProcessing-inh.h but different @@ -507,7 +562,7 @@ static inline __attribute__((always_inline)) void requantize_( constexpr int VLEN = 8; int j = 0; - for ( ; j < n / (VLEN * 4) * (VLEN * 4); j += (VLEN * 4)) { + for (; j < n / (VLEN * 4) * (VLEN * 4); j += (VLEN * 4)) { __m256i x_v = _mm256_loadu_si256(reinterpret_cast(C_int32 + j)); __m256i y_v = _mm256_loadu_si256( @@ -519,10 +574,9 @@ static inline __attribute__((always_inline)) void requantize_( __m256i col_off_v = _mm256_mullo_epi32( A_zero_point_v, - _mm256_loadu_si256( - reinterpret_cast(col_offsets + j))); + _mm256_loadu_si256(reinterpret_cast(col_offsets + j))); __m256i row_offset_v = - _mm256_loadu_si256(reinterpret_cast(row_offsets + j)); + 
_mm256_loadu_si256(reinterpret_cast(row_offsets + j)); x_v = _mm256_sub_epi32(_mm256_sub_epi32(x_v, col_off_v), row_offset_v); col_off_v = _mm256_mullo_epi32( @@ -535,23 +589,23 @@ static inline __attribute__((always_inline)) void requantize_( col_off_v = _mm256_mullo_epi32( A_zero_point_v, - _mm256_loadu_si256(reinterpret_cast( - col_offsets + j + 2 * VLEN))); + _mm256_loadu_si256( + reinterpret_cast(col_offsets + j + 2 * VLEN))); row_offset_v = _mm256_loadu_si256( reinterpret_cast(row_offsets + j + 2 * VLEN)); z_v = _mm256_sub_epi32(_mm256_sub_epi32(z_v, col_off_v), row_offset_v); col_off_v = _mm256_mullo_epi32( A_zero_point_v, - _mm256_loadu_si256(reinterpret_cast( - col_offsets + j + 3 * VLEN))); + _mm256_loadu_si256( + reinterpret_cast(col_offsets + j + 3 * VLEN))); row_offset_v = _mm256_loadu_si256( reinterpret_cast(row_offsets + j + 3 * VLEN)); w_v = _mm256_sub_epi32(_mm256_sub_epi32(w_v, col_off_v), row_offset_v); if (HAS_BIAS) { // static if x_v = _mm256_add_epi32( - x_v, _mm256_loadu_si256(reinterpret_cast(bias + j))); + x_v, _mm256_loadu_si256(reinterpret_cast(bias + j))); y_v = _mm256_add_epi32( y_v, _mm256_loadu_si256( @@ -604,21 +658,20 @@ static inline __attribute__((always_inline)) void requantize_( reinterpret_cast<__m256i*>(C_uint8 + j), xyzw_clamped_v); } // j loop vectorized and unrolled 4x - for ( ; j < n / VLEN * VLEN; j += VLEN) { + for (; j < n / VLEN * VLEN; j += VLEN) { __m256i x_v = _mm256_loadu_si256(reinterpret_cast(C_int32 + j)); __m256i col_off_v = _mm256_mullo_epi32( A_zero_point_v, - _mm256_loadu_si256( - reinterpret_cast(col_offsets + j))); + _mm256_loadu_si256(reinterpret_cast(col_offsets + j))); __m256i row_offset_v = - _mm256_loadu_si256(reinterpret_cast(row_offsets + j)); + _mm256_loadu_si256(reinterpret_cast(row_offsets + j)); x_v = _mm256_sub_epi32(_mm256_sub_epi32(x_v, col_off_v), row_offset_v); if (HAS_BIAS) { // static if x_v = _mm256_add_epi32( - x_v, _mm256_loadu_si256(reinterpret_cast(bias + j))); + x_v, _mm256_loadu_si256(reinterpret_cast(bias + j))); } if (PER_CHANNEL_QUANTIZATION) { @@ -635,15 +688,14 @@ static inline __attribute__((always_inline)) void requantize_( FUSE_RELU ? 
C_zero_point_epi8_v : min_v, _mm256_min_epu8(x_packed_v, max_v)); - x_clamped_v = - _mm256_permutevar8x32_epi32(x_clamped_v, permute_mask_v); + x_clamped_v = _mm256_permutevar8x32_epi32(x_clamped_v, permute_mask_v); _mm_storel_epi64( reinterpret_cast<__m128i*>(C_uint8 + j), _mm256_castsi256_si128(x_clamped_v)); } // j loop vectorized - for ( ; j < n; ++j) { + for (; j < n; ++j) { int32_t raw = C_int32[j] - A_zero_point * col_offsets[j] - row_offsets[j]; if (HAS_BIAS) { // static if raw += bias[j]; @@ -659,11 +711,16 @@ static inline __attribute__((always_inline)) void requantize_( } template -static inline __attribute__((always_inline)) void -requantize_(int32_t A_zero_point, float C_multiplier, - int32_t C_zero_point, const int32_t *C_int32, uint8_t *C_uint8, - int n, const int32_t *row_offsets, const int32_t *col_offsets, - const int32_t *bias) { +static inline __attribute__((always_inline)) void requantize_( + int32_t A_zero_point, + float C_multiplier, + int32_t C_zero_point, + const int32_t* C_int32, + uint8_t* C_uint8, + int n, + const int32_t* row_offsets, + const int32_t* col_offsets, + const int32_t* bias) { requantize_( A_zero_point, &C_multiplier, @@ -677,11 +734,16 @@ requantize_(int32_t A_zero_point, float C_multiplier, } template -static inline __attribute__((always_inline)) void -requantize_per_channel_(int32_t A_zero_point, const float *C_multiplier, - int32_t C_zero_point, const int32_t *C_int32, - uint8_t *C_uint8, int n, const int32_t *row_offsets, - const int32_t *col_offsets, const int32_t *bias) { +static inline __attribute__((always_inline)) void requantize_per_channel_( + int32_t A_zero_point, + const float* C_multiplier, + int32_t C_zero_point, + const int32_t* C_int32, + uint8_t* C_uint8, + int n, + const int32_t* row_offsets, + const int32_t* col_offsets, + const int32_t* bias) { requantize_( A_zero_point, C_multiplier, @@ -695,27 +757,38 @@ requantize_per_channel_(int32_t A_zero_point, const float *C_multiplier, } template -static inline __attribute__((always_inline)) __m256i -load_a(const uint8_t* A, __m256i mask_v) { +static inline __attribute__((always_inline)) __m256i load_a( + const uint8_t* A, + __m256i mask_v) { if (REMAINDER) { - return _mm256_maskload_epi32(reinterpret_cast(A), mask_v); + return _mm256_maskload_epi32(reinterpret_cast(A), mask_v); } else { - return _mm256_lddqu_si256(reinterpret_cast(A)); + return _mm256_lddqu_si256(reinterpret_cast(A)); } } -template -static inline __attribute__((always_inline)) void -inner_prod_3x3_packed_(int H, int W, int K, int h_in, int w_in, - const uint8_t *A, int32_t A_zero_point, const int8_t *Bp, - const int32_t *B_zero_point, int32_t *C, int remainder, - int32_t *row_offsets) { +template < + bool SUM_A, + bool REMAINDER = false, + bool PER_CHANNEL_QUANTIZATION = false> +static inline __attribute__((always_inline)) void inner_prod_3x3_packed_( + int H, + int W, + int K, + int h_in, + int w_in, + const uint8_t* A, + int32_t A_zero_point, + const int8_t* Bp, + const int32_t* B_zero_point, + int32_t* C, + int remainder, + int32_t* row_offsets) { __m256i A_zero_point_v = _mm256_set1_epi8(static_cast(A_zero_point)); __m256i mask_v = _mm256_setzero_si256(); if (REMAINDER) { mask_v = _mm256_loadu_si256( - reinterpret_cast(masks[remainder / 4].data())); + reinterpret_cast(masks[remainder / 4].data())); } // The code below can be written as a simple R*S loop but the compiler @@ -739,15 +812,15 @@ inner_prod_3x3_packed_(int H, int W, int K, int h_in, int w_in, // } // } array<__m256i, 9> a_v = { - A_zero_point_v, - 
A_zero_point_v, - A_zero_point_v, - A_zero_point_v, - A_zero_point_v, - A_zero_point_v, - A_zero_point_v, - A_zero_point_v, - A_zero_point_v, + A_zero_point_v, + A_zero_point_v, + A_zero_point_v, + A_zero_point_v, + A_zero_point_v, + A_zero_point_v, + A_zero_point_v, + A_zero_point_v, + A_zero_point_v, }; if (h_in >= 0 && h_in < H) { @@ -788,35 +861,51 @@ inner_prod_3x3_packed_(int H, int W, int K, int h_in, int w_in, array<__m256i, 4> a_sum; inner_prod_3x3_packed_( - a_v.data(), reinterpret_cast(Bp), C, remainder, + a_v.data(), + reinterpret_cast(Bp), + C, + remainder, a_sum.data()); if (SUM_A) { __m256i B_zero_point_v; for (int i = 0; i < (REMAINDER ? (remainder / 8) : 4); ++i) { if (PER_CHANNEL_QUANTIZATION) { B_zero_point_v = _mm256_loadu_si256( - reinterpret_cast(B_zero_point + i * 8)); + reinterpret_cast(B_zero_point + i * 8)); } else { B_zero_point_v = _mm256_set1_epi32(B_zero_point[0]); } - _mm256_store_si256(reinterpret_cast<__m256i *>(&row_offsets[i * 8]), - _mm256_mullo_epi32(a_sum[i], B_zero_point_v)); + _mm256_store_si256( + reinterpret_cast<__m256i*>(&row_offsets[i * 8]), + _mm256_mullo_epi32(a_sum[i], B_zero_point_v)); } } } -template -static inline __attribute__((always_inline)) void -inner_prod_3x3x3_packed_(int T, int H, int W, int K, int t_in, int h_in, - int w_in, const uint8_t *A, int32_t A_zero_point, - const int8_t *Bp, const int32_t *B_zero_point, - int32_t *C, int remainder, int32_t *row_offsets) { +template < + bool SUM_A, + bool REMAINDER = false, + bool PER_CHANNEL_QUANTIZATION = false> +static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_( + int T, + int H, + int W, + int K, + int t_in, + int h_in, + int w_in, + const uint8_t* A, + int32_t A_zero_point, + const int8_t* Bp, + const int32_t* B_zero_point, + int32_t* C, + int remainder, + int32_t* row_offsets) { __m256i A_zero_point_v = _mm256_set1_epi8(static_cast(A_zero_point)); __m256i mask_v = _mm256_setzero_si256(); if (REMAINDER) { mask_v = _mm256_loadu_si256( - reinterpret_cast(masks[remainder / 4].data())); + reinterpret_cast(masks[remainder / 4].data())); } // The code below can be written as a simple R*S loop but the compiler @@ -885,9 +974,12 @@ inner_prod_3x3x3_packed_(int T, int H, int W, int K, int t_in, int h_in, } array<__m256i, 4> a_sum; - inner_prod_packed_<8, SUM_A, REMAINDER>(a_v.data(), - reinterpret_cast(Bp), - C, remainder, a_sum.data()); + inner_prod_packed_<8, SUM_A, REMAINDER>( + a_v.data(), + reinterpret_cast(Bp), + C, + remainder, + a_sum.data()); a_v[0] = A_zero_point_v; a_v[1] = A_zero_point_v; @@ -940,7 +1032,10 @@ inner_prod_3x3x3_packed_(int T, int H, int W, int K, int t_in, int h_in, array<__m256i, 4> a_sum_temp; inner_prod_packed_<8, SUM_A, REMAINDER, true /* acc */>( - a_v.data(), reinterpret_cast(Bp) + 8, C, remainder, + a_v.data(), + reinterpret_cast(Bp) + 8, + C, + remainder, a_sum_temp.data()); if (SUM_A) { a_sum[0] = _mm256_add_epi32(a_sum[0], a_sum_temp[0]); @@ -996,7 +1091,10 @@ inner_prod_3x3x3_packed_(int T, int H, int W, int K, int t_in, int h_in, } inner_prod_packed_<8, SUM_A, REMAINDER, true /* acc */>( - a_v.data(), reinterpret_cast(Bp) + 16, C, remainder, + a_v.data(), + reinterpret_cast(Bp) + 16, + C, + remainder, a_sum_temp.data()); if (SUM_A) { a_sum[0] = _mm256_add_epi32(a_sum[0], a_sum_temp[0]); @@ -1024,7 +1122,10 @@ inner_prod_3x3x3_packed_(int T, int H, int W, int K, int t_in, int h_in, } inner_prod_packed_<3, SUM_A, REMAINDER, true /* acc */>( - a_v.data(), reinterpret_cast(Bp) + 24, C, remainder, + a_v.data(), + reinterpret_cast(Bp) 
+ 24, + C, + remainder, a_sum_temp.data()); if (SUM_A) { @@ -1037,29 +1138,37 @@ inner_prod_3x3x3_packed_(int T, int H, int W, int K, int t_in, int h_in, for (int i = 0; i < (REMAINDER ? (remainder / 8) : 4); ++i) { if (PER_CHANNEL_QUANTIZATION) { B_zero_point_v = _mm256_loadu_si256( - reinterpret_cast(B_zero_point + i * 8)); + reinterpret_cast(B_zero_point + i * 8)); } else { B_zero_point_v = _mm256_set1_epi32(B_zero_point[0]); } - _mm256_store_si256(reinterpret_cast<__m256i *>(&row_offsets[i * 8]), - _mm256_mullo_epi32(a_sum[i], B_zero_point_v)); + _mm256_store_si256( + reinterpret_cast<__m256i*>(&row_offsets[i * 8]), + _mm256_mullo_epi32(a_sum[i], B_zero_point_v)); } } } template -static inline __attribute__((always_inline)) -void depthwise_3x3_kernel_(int H, int W, int K, int h, int w, - int stride_h, int stride_w, - int32_t A_zero_point, const uint8_t* A, - int32_t B_zero_point, const int8_t* Bp, - float C_multiplier, - int32_t C_zero_point, - int32_t* C_int32, uint8_t* C_uint8, - int32_t* row_offsets, - const int32_t *col_offsets, - const int32_t *bias) -{ +static inline __attribute__((always_inline)) void depthwise_3x3_kernel_( + int H, + int W, + int K, + int h, + int w, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + int32_t B_zero_point, + const int8_t* Bp, + float C_multiplier, + int32_t C_zero_point, + int32_t* C_int32, + uint8_t* C_uint8, + int32_t* row_offsets, + const int32_t* col_offsets, + const int32_t* bias) { constexpr int S = 3; constexpr int PAD_T = 1, PAD_L = 1, PAD_R = 1; int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; @@ -1069,43 +1178,72 @@ void depthwise_3x3_kernel_(int H, int W, int K, int h, int w, int k; for (k = 0; k < K / 32 * 32; k += 32) { inner_prod_3x3_packed_( - H, W, K, h_in, w_in, - A + (h_in * W + w_in) * K + k, A_zero_point, - Bp + k * 10, &B_zero_point, - C_int32 + k, 0, &row_offsets[k]); + H, + W, + K, + h_in, + w_in, + A + (h_in * W + w_in) * K + k, + A_zero_point, + Bp + k * 10, + &B_zero_point, + C_int32 + k, + 0, + &row_offsets[k]); } int remainder = K - k; if (remainder) { inner_prod_3x3_packed_( - H, W, K, h_in, w_in, - A + (h_in * W + w_in) * K + k, A_zero_point, - Bp + k * 10, &B_zero_point, - C_int32 + k, remainder, &row_offsets[k]); + H, + W, + K, + h_in, + w_in, + A + (h_in * W + w_in) * K + k, + A_zero_point, + Bp + k * 10, + &B_zero_point, + C_int32 + k, + remainder, + &row_offsets[k]); } if (SUM_A) { - requantize_ - ( - A_zero_point, C_multiplier, C_zero_point, - C_int32, C_uint8 + (h * W_OUT + w) * K, K, + requantize_( + A_zero_point, + C_multiplier, + C_zero_point, + C_int32, + C_uint8 + (h * W_OUT + w) * K, + K, row_offsets, - col_offsets, bias - ); + col_offsets, + bias); } } template -static inline __attribute__((always_inline)) -void depthwise_3x3x3_kernel_(int T, int H, int W, int K, int t, int h, int w, - int stride_t, int stride_h, int stride_w, - int32_t A_zero_point, const uint8_t* A, - int32_t B_zero_point, const int8_t* Bp, - float C_multiplier, - int32_t C_zero_point, - int32_t* C_int32, uint8_t* C_uint8, - int32_t* row_offsets, - const int32_t *col_offsets, - const int32_t *bias) -{ +static inline __attribute__((always_inline)) void depthwise_3x3x3_kernel_( + int T, + int H, + int W, + int K, + int t, + int h, + int w, + int stride_t, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + int32_t B_zero_point, + const int8_t* Bp, + float C_multiplier, + int32_t C_zero_point, + int32_t* C_int32, + uint8_t* C_uint8, + int32_t* row_offsets, + const int32_t* 
col_offsets, + const int32_t* bias) { constexpr int R = 3, S = 3; constexpr int PAD_P = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; @@ -1117,39 +1255,74 @@ void depthwise_3x3x3_kernel_(int T, int H, int W, int K, int t, int h, int w, int k; for (k = 0; k < K / 32 * 32; k += 32) { inner_prod_3x3x3_packed_( - T, H, W, K, t_in, h_in, w_in, - A + ((t_in * H + h_in) * W + w_in) * K + k, A_zero_point, - Bp + k * 28, &B_zero_point, - C_int32 + k, 0, &row_offsets[k]); + T, + H, + W, + K, + t_in, + h_in, + w_in, + A + ((t_in * H + h_in) * W + w_in) * K + k, + A_zero_point, + Bp + k * 28, + &B_zero_point, + C_int32 + k, + 0, + &row_offsets[k]); } int remainder = K - k; if (remainder) { inner_prod_3x3x3_packed_( - T, H, W, K, t_in, h_in, w_in, - A + ((t_in * H + h_in) * W + w_in) * K + k, A_zero_point, - Bp + k * 28, &B_zero_point, - C_int32 + k, remainder, &row_offsets[k]); + T, + H, + W, + K, + t_in, + h_in, + w_in, + A + ((t_in * H + h_in) * W + w_in) * K + k, + A_zero_point, + Bp + k * 28, + &B_zero_point, + C_int32 + k, + remainder, + &row_offsets[k]); } if (SUM_A) { - requantize_ - ( - A_zero_point, C_multiplier, C_zero_point, - C_int32, C_uint8 + ((t * H_OUT + h) * W_OUT + w) * K, K, + requantize_( + A_zero_point, + C_multiplier, + C_zero_point, + C_int32, + C_uint8 + ((t * H_OUT + h) * W_OUT + w) * K, + K, row_offsets, - col_offsets, bias - ); + col_offsets, + bias); } } template static inline __attribute__((always_inline)) void depthwise_3x3_per_channel_quantization_kernel_( - int H, int W, int K, int h, int w, int stride_h, int stride_w, - int32_t A_zero_point, const uint8_t *A, - const int32_t *B_zero_point, const int8_t *Bp, - const float *C_multiplier, int32_t C_zero_point, - int32_t *C_int32, uint8_t *C_uint8, - int32_t *row_offsets, const int32_t *col_offsets, const int32_t *bias) { + int H, + int W, + int K, + int h, + int w, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + const int32_t* B_zero_point, + const int8_t* Bp, + const float* C_multiplier, + int32_t C_zero_point, + int32_t* C_int32, + uint8_t* C_uint8, + int32_t* row_offsets, + const int32_t* col_offsets, + const int32_t* bias) { constexpr int S = 3; constexpr int PAD_T = 1, PAD_L = 1, PAD_R = 1; int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; @@ -1158,28 +1331,47 @@ depthwise_3x3_per_channel_quantization_kernel_( int k; for (k = 0; k < K / 32 * 32; k += 32) { - inner_prod_3x3_packed_( - H, W, K, h_in, w_in, - A + (h_in * W + w_in) * K + k, A_zero_point, - Bp + k * 10, B_zero_point + k, - C_int32 + k, 0, &row_offsets[k]); + inner_prod_3x3_packed_( + H, + W, + K, + h_in, + w_in, + A + (h_in * W + w_in) * K + k, + A_zero_point, + Bp + k * 10, + B_zero_point + k, + C_int32 + k, + 0, + &row_offsets[k]); } int remainder = K - k; if (remainder) { - inner_prod_3x3_packed_( - H, W, K, h_in, w_in, - A + (h_in * W + w_in) * K + k, A_zero_point, - Bp + k * 10, B_zero_point + k, - C_int32 + k, remainder, &row_offsets[k]); + inner_prod_3x3_packed_( + H, + W, + K, + h_in, + w_in, + A + (h_in * W + w_in) * K + k, + A_zero_point, + Bp + k * 10, + B_zero_point + k, + C_int32 + k, + remainder, + &row_offsets[k]); } if (SUM_A) { - requantize_per_channel_ - ( - A_zero_point, C_multiplier, C_zero_point, - C_int32, C_uint8 + (h * W_OUT + w) * K, K, + requantize_per_channel_( + A_zero_point, + C_multiplier, + C_zero_point, + C_int32, + C_uint8 + (h * W_OUT + w) * K, + K, row_offsets, - col_offsets, bias - ); + col_offsets, + bias); } } @@ -1188,7 +1380,7 @@ 
static pair closest_factors_(int n) { while (n % a != 0) { a--; } - return { a, n / a }; // a <= n / a + return {a, n / a}; // a <= n / a } // TODO: short-circuit when B_zero_point is 0 or A_zero_point is 0 @@ -1196,16 +1388,25 @@ static pair closest_factors_(int n) { // filter shapes by parameterizing with R and S but restricting it to just 3x3 // for now. template -static inline __attribute__((always_inline)) -void depthwise_3x3_pad_1_(int N, int H, int W, int K, - int stride_h, int stride_w, - int32_t A_zero_point, const uint8_t *A, - int32_t B_zero_point, const Packed3x3ConvMatrix &B, - float C_multiplier, - int32_t C_zero_point, - int32_t* C_int32, uint8_t* C_uint8, - const int32_t *col_offsets, const int32_t *bias, - int thread_id, int num_threads) { +static inline __attribute__((always_inline)) void depthwise_3x3_pad_1_( + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + int32_t B_zero_point, + const Packed3x3ConvMatrix& B, + float C_multiplier, + int32_t C_zero_point, + int32_t* C_int32, + uint8_t* C_uint8, + const int32_t* col_offsets, + const int32_t* bias, + int thread_id, + int num_threads) { assert(K % 8 == 0); constexpr int R = 3, S = 3; constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; @@ -1213,8 +1414,8 @@ void depthwise_3x3_pad_1_(int N, int H, int W, int K, int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__ ((aligned (64))); - int32_t *C_temp; + int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* C_temp; int n_begin, n_end; int h_begin, h_end, w_begin, w_end; @@ -1266,22 +1467,48 @@ void depthwise_3x3_pad_1_(int N, int H, int W, int K, C_temp = FUSE_RESCALE ? C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) { C_temp = FUSE_RESCALE ? C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } if (w_end == W_OUT) { @@ -1289,11 +1516,24 @@ void depthwise_3x3_pad_1_(int N, int H, int W, int K, C_temp = FUSE_RESCALE ? C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } } @@ -1303,22 +1543,48 @@ void depthwise_3x3_pad_1_(int N, int H, int W, int K, C_temp = FUSE_RESCALE ? 
C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) { C_temp = FUSE_RESCALE ? C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } if (w_end == W_OUT) { @@ -1326,11 +1592,24 @@ void depthwise_3x3_pad_1_(int N, int H, int W, int K, C_temp = FUSE_RESCALE ? C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } } @@ -1341,22 +1620,48 @@ void depthwise_3x3_pad_1_(int N, int H, int W, int K, C_temp = FUSE_RESCALE ? C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) { C_temp = FUSE_RESCALE ? C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } if (w_end == W_OUT) { @@ -1364,28 +1669,51 @@ void depthwise_3x3_pad_1_(int N, int H, int W, int K, C_temp = FUSE_RESCALE ? 
C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } } } // for each n }; template -static inline __attribute__((always_inline)) -void depthwise_3x3x3_pad_1_(int N, int T, int H, int W, int K, - int stride_t, int stride_h, int stride_w, - int32_t A_zero_point, const uint8_t *A, - int32_t B_zero_point, - const Packed3x3x3ConvMatrix &B, - float C_multiplier, - int32_t C_zero_point, - int32_t* C_int32, uint8_t* C_uint8, - const int32_t *col_offsets, const int32_t *bias, - int thread_id, int num_threads) { +static inline __attribute__((always_inline)) void depthwise_3x3x3_pad_1_( + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + int32_t B_zero_point, + const Packed3x3x3ConvMatrix& B, + float C_multiplier, + int32_t C_zero_point, + int32_t* C_int32, + uint8_t* C_uint8, + const int32_t* col_offsets, + const int32_t* bias, + int thread_id, + int num_threads) { assert(K % 8 == 0); constexpr int K_T = 3, K_H = 3, K_W = 3; constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, @@ -1395,8 +1723,8 @@ void depthwise_3x3x3_pad_1_(int N, int T, int H, int W, int K, int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__ ((aligned (64))); - int32_t *C_temp; + int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* C_temp; int n_begin, n_end; int t_begin, t_end, h_begin, h_end; @@ -1443,14 +1771,30 @@ void depthwise_3x3x3_pad_1_(int N, int T, int H, int W, int K, for (int t = t_begin; t < t_end; ++t) { for (int h = h_begin; h < h_end; ++h) { for (int w = 0; w < W_OUT; ++w) { - C_temp = - FUSE_RESCALE - ? C_int32 - : C_int32 + (((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K; + C_temp = FUSE_RESCALE + ? 
C_int32 + : C_int32 + (((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K; depthwise_3x3x3_kernel_( - T, H, W, K, t, h, w, stride_t, stride_h, stride_w, A_zero_point, - A_base, B_zero_point, Bp, C_multiplier, - C_zero_point, C_temp, C_uint8_base, row_offsets, col_offsets, + T, + H, + W, + K, + t, + h, + w, + stride_t, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, bias); } // w } // h @@ -1461,11 +1805,23 @@ void depthwise_3x3x3_pad_1_(int N, int T, int H, int W, int K, template static inline __attribute__((always_inline)) void depthwise_3x3_per_channel_quantization_pad_1_( - int N, int H, int W, int K, int stride_h, int stride_w, - int32_t A_zero_point, const uint8_t *A, const int32_t *B_zero_point, - const Packed3x3ConvMatrix &B, const float *C_multiplier, - int32_t C_zero_point, int32_t *C_int32, uint8_t *C_uint8, - const int32_t *col_offsets, const int32_t *bias, int thread_id, + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + const int32_t* B_zero_point, + const Packed3x3ConvMatrix& B, + const float* C_multiplier, + int32_t C_zero_point, + int32_t* C_int32, + uint8_t* C_uint8, + const int32_t* col_offsets, + const int32_t* bias, + int thread_id, int num_threads) { assert(K % 8 == 0); constexpr int R = 3, S = 3; @@ -1474,8 +1830,8 @@ depthwise_3x3_per_channel_quantization_pad_1_( int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t row_offsets[(K + 31) / 32 * 32] __attribute__ ((aligned (64))); - int32_t *C_temp; + int32_t row_offsets[(K + 31) / 32 * 32] __attribute__((aligned(64))); + int32_t* C_temp; int n_begin, n_end; int h_begin, h_end, w_begin, w_end; @@ -1527,22 +1883,48 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_temp = FUSE_RESCALE ? C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_per_channel_quantization_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) { C_temp = FUSE_RESCALE ? C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_per_channel_quantization_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } if (w_end == W_OUT) { @@ -1550,11 +1932,24 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_temp = FUSE_RESCALE ? 
C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_per_channel_quantization_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } } @@ -1564,22 +1959,48 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_temp = FUSE_RESCALE ? C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_per_channel_quantization_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) { C_temp = FUSE_RESCALE ? C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_per_channel_quantization_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } if (w_end == W_OUT) { @@ -1587,11 +2008,24 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_temp = FUSE_RESCALE ? C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_per_channel_quantization_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } } @@ -1602,22 +2036,48 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_temp = FUSE_RESCALE ? C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_per_channel_quantization_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } for (w = std::max(1, w_begin); w < std::min(W_OUT - 1, w_end); ++w) { C_temp = FUSE_RESCALE ? C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_per_channel_quantization_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } if (w_end == W_OUT) { @@ -1625,11 +2085,24 @@ depthwise_3x3_per_channel_quantization_pad_1_( C_temp = FUSE_RESCALE ? 
C_int32 : C_int32 + ((n * H_OUT + h) * W_OUT + w) * K; depthwise_3x3_per_channel_quantization_kernel_( - H, W, K, h, w, stride_h, stride_w, - A_zero_point, A_base, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_temp, C_uint8_base, - row_offsets, col_offsets, bias); + H, + W, + K, + h, + w, + stride_h, + stride_w, + A_zero_point, + A_base, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_temp, + C_uint8_base, + row_offsets, + col_offsets, + bias); } } } // for each n @@ -1643,163 +2116,338 @@ void depthwise_3x3_pad_1( int K, int stride_h, int stride_w, - int32_t A_zero_point, const uint8_t* A, + int32_t A_zero_point, + const uint8_t* A, const Packed3x3ConvMatrix& B, int32_t* C, int thread_id, int num_threads) { if (7 == H && 7 == W && 1 == stride_h && 1 == stride_w) { depthwise_3x3_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - 0, B, - 0.0f, 0, C, nullptr, - nullptr, nullptr, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + 0, + B, + 0.0f, + 0, + C, + nullptr, + nullptr, + nullptr, + thread_id, + num_threads); } else if (14 == H && 14 == W && 2 == stride_h && 2 == stride_w) { depthwise_3x3_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - 0, B, - 0.0f, 0, C, nullptr, - nullptr, nullptr, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + 0, + B, + 0.0f, + 0, + C, + nullptr, + nullptr, + nullptr, + thread_id, + num_threads); } else if (1 == stride_h && 1 == stride_w) { depthwise_3x3_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - 0, B, - 0.0f, 0, C, nullptr, - nullptr, nullptr, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + 0, + B, + 0.0f, + 0, + C, + nullptr, + nullptr, + nullptr, + thread_id, + num_threads); } else if (2 == stride_h && 2 == stride_w) { depthwise_3x3_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - 0, B, - 0.0f, 0, C, nullptr, - nullptr, nullptr, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + 0, + B, + 0.0f, + 0, + C, + nullptr, + nullptr, + nullptr, + thread_id, + num_threads); } else { depthwise_3x3_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - 0, B, - 0.0f, 0, C, nullptr, - nullptr, nullptr, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + 0, + B, + 0.0f, + 0, + C, + nullptr, + nullptr, + nullptr, + thread_id, + num_threads); } } void depthwise_3x3_pad_1( - int N, int H, int W, int K, - int stride_h, int stride_w, - int32_t A_zero_point, const uint8_t* A, - int32_t B_zero_point, const Packed3x3ConvMatrix& B, - float C_multiplier, int32_t C_zero_point, uint8_t* C, + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + int32_t B_zero_point, + const Packed3x3ConvMatrix& B, + float C_multiplier, + int32_t C_zero_point, + uint8_t* C, const int32_t* col_offsets, const int32_t* bias, - int thread_id, int num_threads, + int thread_id, + int num_threads, bool fuse_relu) { int32_t C_int32_temp[(K + 31) / 32 * 32]; if (fuse_relu) { if (7 == H && 7 == W && 1 == stride_h && 1 == stride_w) { depthwise_3x3_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, B, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + 
C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } else if (14 == H && 14 == W && 2 == stride_h && 2 == stride_w) { depthwise_3x3_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, B, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } else if (1 == stride_h && 1 == stride_w) { depthwise_3x3_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, B, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } else if (2 == stride_h && 2 == stride_w) { depthwise_3x3_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, B, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } else { depthwise_3x3_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, B, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } } else { if (7 == H && 7 == W && 1 == stride_h && 1 == stride_w) { depthwise_3x3_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, B, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } else if (14 == H && 14 == W && 2 == stride_h && 2 == stride_w) { depthwise_3x3_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, B, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } else if (1 == stride_h && 1 == stride_w) { depthwise_3x3_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, B, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } else if (2 == stride_h && 2 == stride_w) { depthwise_3x3_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, B, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } else { depthwise_3x3_pad_1_( - N, 
H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, B, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } } } @@ -1813,140 +2461,309 @@ void depthwise_3x3x3_pad_1( int stride_t, int stride_h, int stride_w, - int32_t A_zero_point, const uint8_t* A, + int32_t A_zero_point, + const uint8_t* A, const Packed3x3x3ConvMatrix& B, int32_t* C, int thread_id, int num_threads) { depthwise_3x3x3_pad_1_( - N, T, H, W, K, - stride_t, stride_h, stride_w, - A_zero_point, A, - 0, B, - 0.0f, 0, C, nullptr, - nullptr, nullptr, - thread_id, num_threads); + N, + T, + H, + W, + K, + stride_t, + stride_h, + stride_w, + A_zero_point, + A, + 0, + B, + 0.0f, + 0, + C, + nullptr, + nullptr, + nullptr, + thread_id, + num_threads); } static void depthwise_3x3x3_pad_1_( - int N, int T, int H, int W, int K, - int stride_t, int stride_h, int stride_w, - int32_t A_zero_point, const uint8_t* A, - int32_t B_zero_point, const Packed3x3x3ConvMatrix& B, - float C_multiplier, int32_t C_zero_point, uint8_t* C, + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + int32_t B_zero_point, + const Packed3x3x3ConvMatrix& B, + float C_multiplier, + int32_t C_zero_point, + uint8_t* C, const int32_t* col_offsets, const int32_t* bias, - int thread_id, int num_threads) { + int thread_id, + int num_threads) { int32_t C_int32_temp[(K + 31) / 32 * 32]; depthwise_3x3x3_pad_1_( - N, T, H, W, K, - stride_t, stride_h, stride_w, - A_zero_point, A, - B_zero_point, B, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + T, + H, + W, + K, + stride_t, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } static void depthwise_3x3x3_pad_1_relu_fused_( - int N, int T, int H, int W, int K, - int stride_t, int stride_h, int stride_w, - int32_t A_zero_point, const uint8_t* A, - int32_t B_zero_point, const Packed3x3x3ConvMatrix& B, - float C_multiplier, int32_t C_zero_point, uint8_t* C, + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + int32_t B_zero_point, + const Packed3x3x3ConvMatrix& B, + float C_multiplier, + int32_t C_zero_point, + uint8_t* C, const int32_t* col_offsets, const int32_t* bias, - int thread_id, int num_threads) { + int thread_id, + int num_threads) { int32_t C_int32_temp[(K + 31) / 32 * 32]; depthwise_3x3x3_pad_1_( - N, T, H, W, K, - stride_t, stride_h, stride_w, - A_zero_point, A, - B_zero_point, B, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + T, + H, + W, + K, + stride_t, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } void depthwise_3x3x3_pad_1( - int N, int T, int H, int W, int K, - int stride_t, int stride_h, int stride_w, - int32_t A_zero_point, const uint8_t* A, - int32_t B_zero_point, const Packed3x3x3ConvMatrix& B, - float C_multiplier, int32_t C_zero_point, uint8_t* C, + int N, + int T, + int H, + int W, + int K, + int stride_t, + int stride_h, + int stride_w, + 
int32_t A_zero_point, + const uint8_t* A, + int32_t B_zero_point, + const Packed3x3x3ConvMatrix& B, + float C_multiplier, + int32_t C_zero_point, + uint8_t* C, const int32_t* col_offsets, const int32_t* bias, bool fuse_relu, - int thread_id, int num_threads) { + int thread_id, + int num_threads) { // If we inline the following two functions, I see stack overflow. if (fuse_relu) { depthwise_3x3x3_pad_1_relu_fused_( - N, T, H, W, K, stride_t, stride_h, stride_w, A_zero_point, A, - B_zero_point, B, C_multiplier, C_zero_point, C, - col_offsets, bias, thread_id, num_threads); + N, + T, + H, + W, + K, + stride_t, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C, + col_offsets, + bias, + thread_id, + num_threads); } else { - depthwise_3x3x3_pad_1_(N, T, H, W, K, stride_t, stride_h, stride_w, - A_zero_point, A, B_zero_point, B, C_multiplier, - C_zero_point, C, col_offsets, bias, - thread_id, num_threads); + depthwise_3x3x3_pad_1_( + N, + T, + H, + W, + K, + stride_t, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + B, + C_multiplier, + C_zero_point, + C, + col_offsets, + bias, + thread_id, + num_threads); } } void depthwise_3x3_per_channel_quantization_pad_1( - int N, int H, int W, int K, - int stride_h, int stride_w, - int32_t A_zero_point, const uint8_t* A, - const int32_t *B_zero_point, const Packed3x3ConvMatrix& Bp, - const float *C_multiplier, int32_t C_zero_point, uint8_t* C, + int N, + int H, + int W, + int K, + int stride_h, + int stride_w, + int32_t A_zero_point, + const uint8_t* A, + const int32_t* B_zero_point, + const Packed3x3ConvMatrix& Bp, + const float* C_multiplier, + int32_t C_zero_point, + uint8_t* C, const int32_t* col_offsets, const int32_t* bias, - int thread_id, int num_threads) { + int thread_id, + int num_threads) { int32_t C_int32_temp[(K + 31) / 32 * 32]; if (7 == H && 7 == W && 1 == stride_h && 1 == stride_w) { depthwise_3x3_per_channel_quantization_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } else if (14 == H && 14 == W && 2 == stride_h && 2 == stride_w) { depthwise_3x3_per_channel_quantization_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } else if (1 == stride_h && 1 == stride_w) { depthwise_3x3_per_channel_quantization_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } else if (2 == stride_h && 2 == stride_w) { depthwise_3x3_per_channel_quantization_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + 
stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } else { depthwise_3x3_per_channel_quantization_pad_1_( - N, H, W, K, - stride_h, stride_w, - A_zero_point, A, - B_zero_point, Bp, - C_multiplier, C_zero_point, C_int32_temp, C, - col_offsets, bias, - thread_id, num_threads); + N, + H, + W, + K, + stride_h, + stride_w, + A_zero_point, + A, + B_zero_point, + Bp, + C_multiplier, + C_zero_point, + C_int32_temp, + C, + col_offsets, + bias, + thread_id, + num_threads); } } diff --git a/src/GenerateKernelU8S8S32ACC16.cc b/src/GenerateKernelU8S8S32ACC16.cc index 2ffe3ab..451592a 100644 --- a/src/GenerateKernelU8S8S32ACC16.cc +++ b/src/GenerateKernelU8S8S32ACC16.cc @@ -94,7 +94,7 @@ void CodeGenBase::storeCRegs< asmjit::X86Ymm extractDest256 = x86::ymm15; for (int i = 0; i < rowRegs; ++i) { - a->imul(C_Offset, ldcReg, i * sizeof(int32_t)); + a->imul(C_Offset, ldcReg, static_cast(i * sizeof(int32_t))); for (int j = 0; j < colRegs; ++j) { for (int idx = 0; idx < 2; ++idx) { a->vextracti128( @@ -214,17 +214,19 @@ CodeGenBase::getOrCreate( a->mov(kIdx, 0); a->bind(Loopk); // k is incremented by row_interleave - a->add(kIdx, row_interleave); + a->add(kIdx, static_cast(row_interleave)); genComputeBlock( a, buffer_A, buffer_B, B_pf, rowRegs, colRegs, kBlock); // update buffer_A address for next k iteration - a->add(buffer_A, row_interleave * sizeof(uint8_t)); + a->add( + buffer_A, static_cast(row_interleave * sizeof(uint8_t))); // update buffer_B address for next k iteration - a->add(buffer_B, VLEN_ * colRegs * sizeof(int8_t)); - // a->add(B_pf, VLEN_ * colRegs * sizeof(int8_t)); + a->add( + buffer_B, static_cast(VLEN_ * colRegs * sizeof(int8_t))); + // a->add(B_pf, static_cast(VLEN_ * colRegs * sizeof(int8_t))); a->cmp(kIdx, kSize); a->jl(Loopk); @@ -234,9 +236,11 @@ CodeGenBase::getOrCreate( // increment A for next block a->sub(buffer_A, kSize); - a->add(buffer_A, (rowRegs)*kBlock * sizeof(uint8_t)); + a->add( + buffer_A, static_cast((rowRegs)*kBlock * sizeof(uint8_t))); // increment C for next block - a->imul(C_Offset, ldcReg, rowRegs * sizeof(int32_t)); + a->imul( + C_Offset, ldcReg, static_cast(rowRegs * sizeof(int32_t))); a->add(CBase, C_Offset); // reset B a->mov(buffer_B, buffer_B_saved); @@ -258,16 +262,18 @@ CodeGenBase::getOrCreate( a->bind(LoopkRem); // k is incremented by row_interleave - a->add(kIdx, row_interleave); + a->add(kIdx, static_cast(row_interleave)); genComputeBlock( a, buffer_A, buffer_B, B_pf, rowRegs, colRegs, kBlock); // update buffer_A address for next k iteration - a->add(buffer_A, row_interleave * sizeof(uint8_t)); + a->add( + buffer_A, static_cast(row_interleave * sizeof(uint8_t))); // update buffer_B address for next k iteration - a->add(buffer_B, VLEN_ * colRegs * sizeof(int8_t)); + a->add( + buffer_B, static_cast(VLEN_ * colRegs * sizeof(int8_t))); // a->add(B_pf, VLEN_ * colRegs * sizeof(int8_t)); a->cmp(kIdx, kSize); diff --git a/src/GenerateKernelU8S8S32ACC16_avx512.cc b/src/GenerateKernelU8S8S32ACC16_avx512.cc index e613cf1..cab43ed 100644 --- a/src/GenerateKernelU8S8S32ACC16_avx512.cc +++ b/src/GenerateKernelU8S8S32ACC16_avx512.cc @@ -94,7 +94,7 @@ void CodeGenBase::storeCRegs< asmjit::X86Zmm extractDest512 = x86::zmm31; for (int i = 0; i < rowRegs; ++i) { - a->imul(C_Offset, ldcReg, i * sizeof(int32_t)); + a->imul(C_Offset, ldcReg, static_cast(i * sizeof(int32_t))); for (int j = 0; j < colRegs; ++j) { for (int idx = 0; idx < 2; ++idx) { 
a->vextracti32x8( @@ -215,17 +215,19 @@ CodeGenBase::getOrCreate( a->mov(kIdx, 0); a->bind(Loopk); // k is incremented by row_interleave - a->add(kIdx, row_interleave); + a->add(kIdx, static_cast(row_interleave)); genComputeBlock( a, buffer_A, buffer_B, B_pf, rowRegs, colRegs, kBlock); // update buffer_A address for next k iteration - a->add(buffer_A, row_interleave * sizeof(uint8_t)); + a->add( + buffer_A, static_cast(row_interleave * sizeof(uint8_t))); // update buffer_B address for next k iteration - a->add(buffer_B, VLEN_ * colRegs * sizeof(int8_t)); - // a->add(B_pf, VLEN_ * colRegs * sizeof(int8_t)); + a->add( + buffer_B, static_cast(VLEN_ * colRegs * sizeof(int8_t))); + // a->add(B_pf, static_cast(VLEN_ * colRegs * sizeof(int8_t))); a->cmp(kIdx, kSize); a->jl(Loopk); @@ -236,9 +238,11 @@ CodeGenBase::getOrCreate( // increment A for next block a->sub(buffer_A, kSize); - a->add(buffer_A, (rowRegs)*kBlock * sizeof(uint8_t)); + a->add( + buffer_A, static_cast((rowRegs)*kBlock * sizeof(uint8_t))); // increment C for next block - a->imul(C_Offset, ldcReg, rowRegs * sizeof(int32_t)); + a->imul( + C_Offset, ldcReg, static_cast(rowRegs * sizeof(int32_t))); a->add(CBase, C_Offset); // reset B a->mov(buffer_B, buffer_B_saved); @@ -260,17 +264,19 @@ CodeGenBase::getOrCreate( a->bind(LoopkRem); // k is incremented by row_interleave - a->add(kIdx, row_interleave); + a->add(kIdx, static_cast(row_interleave)); genComputeBlock( a, buffer_A, buffer_B, B_pf, rowRegs, colRegs, kBlock); // update buffer_A address for next k iteration - a->add(buffer_A, row_interleave * sizeof(uint8_t)); + a->add( + buffer_A, static_cast(row_interleave * sizeof(uint8_t))); // update buffer_B address for next k iteration - a->add(buffer_B, VLEN_ * colRegs * sizeof(int8_t)); - // a->add(B_pf, VLEN_ * colRegs * sizeof(int8_t)); + a->add( + buffer_B, static_cast(VLEN_ * colRegs * sizeof(int8_t))); + // a->add(B_pf, static_cast(VLEN_ * colRegs * sizeof(int8_t))); a->cmp(kIdx, kSize); a->jl(LoopkRem); diff --git a/src/GenerateKernelU8S8S32ACC32.cc b/src/GenerateKernelU8S8S32ACC32.cc index dc8c6d3..9529f5d 100644 --- a/src/GenerateKernelU8S8S32ACC32.cc +++ b/src/GenerateKernelU8S8S32ACC32.cc @@ -201,7 +201,7 @@ CodeGenBase::getOrCreate( // and so on a->vpcmpeqw(oneReg, oneReg, oneReg); a->vpsrlw(oneReg, oneReg, 15); - a->imul(ldcReg, ldcReg, sizeof(int32_t)); + a->imul(ldcReg, ldcReg, static_cast(sizeof(int32_t))); a->mov(C_Offset, 0); int colRegs = nc * row_interleave * sizeof(int8_t) / VLEN_; @@ -226,17 +226,19 @@ CodeGenBase::getOrCreate( a->bind(Loopk); // k is incremented by row_interleave - a->add(kIdx, row_interleave); + a->add(kIdx, static_cast(row_interleave)); genComputeBlock( a, buffer_A, buffer_B, B_pf, rowRegs, colRegs, kBlock, colRegs); // update buffer_A address for next k iteration - a->add(buffer_A, row_interleave * sizeof(uint8_t)); + a->add( + buffer_A, static_cast(row_interleave * sizeof(uint8_t))); // update buffer_B address for next k iteration - a->add(buffer_B, VLEN_ * colRegs * sizeof(int8_t)); - a->add(B_pf, VLEN_ * colRegs * sizeof(int8_t)); + a->add( + buffer_B, static_cast(VLEN_ * colRegs * sizeof(int8_t))); + a->add(B_pf, static_cast(VLEN_ * colRegs * sizeof(int8_t))); // a->add(B_pf, 32*sizeof(float)); @@ -249,10 +251,11 @@ CodeGenBase::getOrCreate( // increment A for next block a->sub(buffer_A, kSize); - a->add(buffer_A, (rowRegs)*kBlock * sizeof(uint8_t)); + a->add( + buffer_A, static_cast((rowRegs)*kBlock * sizeof(uint8_t))); // increment C for next block - a->imul(C_Offset, ldcReg, rowRegs); + 
a->imul(C_Offset, ldcReg, static_cast(rowRegs)); a->add(CBase, C_Offset); a->mov(C_Offset, 0); @@ -275,17 +278,19 @@ CodeGenBase::getOrCreate( a->bind(LoopkRem); // k is incremented by row_interleave - a->add(kIdx, row_interleave); + a->add(kIdx, static_cast(row_interleave)); genComputeBlock( a, buffer_A, buffer_B, B_pf, rowRegs, colRegs, kBlock, colRegs); // update buffer_A address for next k iteration - a->add(buffer_A, row_interleave * sizeof(uint8_t)); + a->add( + buffer_A, static_cast(row_interleave * sizeof(uint8_t))); // update buffer_B address for next k iteration - a->add(buffer_B, VLEN_ * colRegs * sizeof(int8_t)); - a->add(B_pf, VLEN_ * colRegs * sizeof(int8_t)); + a->add( + buffer_B, static_cast(VLEN_ * colRegs * sizeof(int8_t))); + a->add(B_pf, static_cast(VLEN_ * colRegs * sizeof(int8_t))); a->cmp(kIdx, kSize); a->jl(LoopkRem); diff --git a/src/GenerateKernelU8S8S32ACC32_avx512.cc b/src/GenerateKernelU8S8S32ACC32_avx512.cc index 5cd5684..251a8b8 100644 --- a/src/GenerateKernelU8S8S32ACC32_avx512.cc +++ b/src/GenerateKernelU8S8S32ACC32_avx512.cc @@ -203,7 +203,7 @@ CodeGenBase::getOrCreate( // a->vpcmpeqw(oneReg, oneReg, oneReg); a->vpternlogd(oneReg, oneReg, oneReg, 0xff); a->vpsrlw(oneReg, oneReg, 15); - a->imul(ldcReg, ldcReg, sizeof(int32_t)); + a->imul(ldcReg, ldcReg, static_cast(sizeof(int32_t))); a->mov(C_Offset, 0); int colRegs = nc * row_interleave * sizeof(int8_t) / VLEN_; @@ -228,19 +228,21 @@ CodeGenBase::getOrCreate( a->bind(Loopk); // k is incremented by row_interleave - a->add(kIdx, row_interleave); + a->add(kIdx, static_cast(row_interleave)); genComputeBlock( a, buffer_A, buffer_B, B_pf, rowRegs, colRegs, kBlock, colRegs); // update buffer_A address for next k iteration - a->add(buffer_A, row_interleave * sizeof(uint8_t)); + a->add( + buffer_A, static_cast(row_interleave * sizeof(uint8_t))); // update buffer_B address for next k iteration - a->add(buffer_B, VLEN_ * colRegs * sizeof(int8_t)); - a->add(B_pf, VLEN_ * colRegs * sizeof(int8_t)); + a->add( + buffer_B, static_cast(VLEN_ * colRegs * sizeof(int8_t))); + a->add(B_pf, static_cast(VLEN_ * colRegs * sizeof(int8_t))); - // a->add(B_pf, 32*sizeof(float)); + // a->add(B_pf, static_cast(32*sizeof(float))); a->cmp(kIdx, kSize); a->jl(Loopk); @@ -251,10 +253,11 @@ CodeGenBase::getOrCreate( // increment A for next block a->sub(buffer_A, kSize); - a->add(buffer_A, (rowRegs)*kBlock * sizeof(uint8_t)); + a->add( + buffer_A, static_cast((rowRegs)*kBlock * sizeof(uint8_t))); // increment C for next block - a->imul(C_Offset, ldcReg, rowRegs); + a->imul(C_Offset, ldcReg, static_cast(rowRegs)); a->add(CBase, C_Offset); a->mov(C_Offset, 0); @@ -277,17 +280,19 @@ CodeGenBase::getOrCreate( a->bind(LoopkRem); // k is incremented by row_interleave - a->add(kIdx, row_interleave); + a->add(kIdx, static_cast(row_interleave)); genComputeBlock( a, buffer_A, buffer_B, B_pf, rowRegs, colRegs, kBlock, colRegs); // update buffer_A address for next k iteration - a->add(buffer_A, row_interleave * sizeof(uint8_t)); + a->add( + buffer_A, static_cast(row_interleave * sizeof(uint8_t))); // update buffer_B address for next k iteration - a->add(buffer_B, VLEN_ * colRegs * sizeof(int8_t)); - a->add(B_pf, VLEN_ * colRegs * sizeof(int8_t)); + a->add( + buffer_B, static_cast(VLEN_ * colRegs * sizeof(int8_t))); + a->add(B_pf, static_cast(VLEN_ * colRegs * sizeof(int8_t))); a->cmp(kIdx, kSize); a->jl(LoopkRem); diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc index 543d99b..8f260ba 100644 --- a/src/PackAMatrix.cc +++ b/src/PackAMatrix.cc @@ 
-21,7 +21,7 @@ PackAMatrix::PackAMatrix( int32_t ld, inpType* pmat, int32_t groups, - accT zero_pt) + std::int32_t zero_pt) : PackMatrix, T, accT>(nRow, nCol, pmat, zero_pt), trans_(trans), smat_(smat), @@ -44,8 +44,8 @@ PackAMatrix::PackAMatrix( assert(0 && "unsupported architecure"); } if (!pmat) { - BaseType::buf_ = - (T*)aligned_alloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T)); + BaseType::buf_ = (T*)fbgemmAlignedAlloc( + 64, BaseType::brow_ * BaseType::bcol_ * sizeof(T)); } } diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc index a007685..e067a3e 100644 --- a/src/PackAWithIm2Col.cc +++ b/src/PackAWithIm2Col.cc @@ -49,7 +49,8 @@ PackAWithIm2Col::PackAWithIm2Col( } else { BaseType::bufAllocatedHere_ = true; BaseType::buf_ = static_cast( - aligned_alloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T))); + fbgemmAlignedAlloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T))); + //aligned_alloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T))); } if (row_offset) { rowOffsetAllocatedHere = false; @@ -57,7 +58,7 @@ PackAWithIm2Col::PackAWithIm2Col( } else { rowOffsetAllocatedHere = true; row_offset_ = static_cast( - aligned_alloc(64, BaseType::brow_ * sizeof(int32_t))); + fbgemmAlignedAlloc(64, BaseType::brow_ * sizeof(int32_t))); } } diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc index 30d94f8..7b9ba41 100644 --- a/src/PackBMatrix.cc +++ b/src/PackBMatrix.cc @@ -21,7 +21,7 @@ PackBMatrix::PackBMatrix( int32_t ld, inpType* pmat, int32_t groups, - accT zero_pt) + std::int32_t zero_pt) : PackMatrix, T, accT>(nRow, nCol, pmat, zero_pt), trans_(trans), smat_(smat), @@ -46,7 +46,7 @@ PackBMatrix::PackBMatrix( BaseType::packedBlock(block); if (!pmat) { BaseType::bufAllocatedHere_ = true; - BaseType::buf_ = (T*)aligned_alloc( + BaseType::buf_ = (T*)fbgemmAlignedAlloc( 64, BaseType::blockRows() * BaseType::brow_ * BaseType::blockCols() * BaseType::bcol_ * sizeof(T)); diff --git a/src/PackWithQuantRowOffset.cc b/src/PackWithQuantRowOffset.cc index 74eaade..5f60faa 100644 --- a/src/PackWithQuantRowOffset.cc +++ b/src/PackWithQuantRowOffset.cc @@ -60,13 +60,13 @@ PackAWithQuantRowOffset::PackAWithQuantRowOffset( BaseType::buf_ = pmat; } else { BaseType::bufAllocatedHere_ = true; - BaseType::buf_ = - (T*)aligned_alloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T)); + BaseType::buf_ = (T*)fbgemmAlignedAlloc( + 64, BaseType::brow_ * BaseType::bcol_ * sizeof(T)); } if (!row_offset_) { rowOffsetAllocatedHere = true; row_offset_ = reinterpret_cast( - aligned_alloc(64, BaseType::brow_ * sizeof(accT))); + fbgemmAlignedAlloc(64, BaseType::brow_ * sizeof(accT))); } } @@ -109,12 +109,40 @@ void PackAWithQuantRowOffset::pack(const block_type_t& block) { constexpr int VLEN = 8; __m256 inverse_scale_v = _mm256_set1_ps(1.0f / scale_); __m256i shuffle_mask_v = _mm256_set_epi8( - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x0c, 0x08, 0x04, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x0c, 0x08, 0x04, 0x00); - __m256i permute_mask_v = _mm256_set_epi32( - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00); + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0c, + 0x08, + 0x04, + 0x00, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0xff, + 0x0c, + 0x08, + 0x04, + 0x00); + __m256i permute_mask_v = + _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00); #endif for (int i = 0; i < block.row_size; ++i) { diff --git 
a/src/PackWithRowOffset.cc b/src/PackWithRowOffset.cc index 8722723..fa1f2b0 100644 --- a/src/PackWithRowOffset.cc +++ b/src/PackWithRowOffset.cc @@ -4,12 +4,12 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ +#include #include #include #include #include #include -#include #include "fbgemm/Fbgemm.h" namespace fbgemm2 { @@ -50,20 +50,20 @@ PackAWithRowOffset::PackAWithRowOffset( row_interleave_B_ = PackingTraits::ROW_INTERLEAVE; } else { - //TODO: Have default slower path + // TODO: Have default slower path assert(0 && "unknown architecure"); } if (pmat) { BaseType::buf_ = pmat; } else { BaseType::bufAllocatedHere_ = true; - BaseType::buf_ = - (T*)aligned_alloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T)); + BaseType::buf_ = (T*)fbgemmAlignedAlloc( + 64, BaseType::brow_ * BaseType::bcol_ * sizeof(T)); } if (!row_offset_) { rowOffsetAllocatedHere = true; - row_offset_ = static_cast(aligned_alloc(64, - BaseType::brow_ * sizeof(int32_t))); + row_offset_ = static_cast( + fbgemmAlignedAlloc(64, BaseType::brow_ * sizeof(int32_t))); } } @@ -89,8 +89,8 @@ void PackAWithRowOffset::pack(const block_type_t& block) { int32_t* row_offset_buf = getRowOffsetBuffer(); if (tr) { for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { - int32_t row_sum = row_offset_acc ? - row_offset_buf[i - block.row_start] : 0; + int32_t row_sum = + row_offset_acc ? row_offset_buf[i - block.row_start] : 0; for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { T val = smat_[i + ld_ * j]; row_sum += val; @@ -101,7 +101,8 @@ void PackAWithRowOffset::pack(const block_type_t& block) { // zero fill // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill. for (int j = block.col_start + block.col_size; - j < block_p.col_start + block_p.col_size; ++j) { + j < block_p.col_start + block_p.col_size; + ++j) { out[(i - block.row_start) * BaseType::blockColSize() + (j - block.col_start)] = 0; } @@ -117,8 +118,8 @@ void PackAWithRowOffset::pack(const block_type_t& block) { for (int j = block.col_size; j < block_p.col_size; ++j) { out[buf_idx * BaseType::blockColSize() + j] = 0; } - int32_t row_sum = row_offset_acc ? - row_offset_buf[i - block.row_start] : 0; + int32_t row_sum = + row_offset_acc ? 
row_offset_buf[i - block.row_start] : 0;
       __m256i sum_v = _mm256_setzero_si256();
       __m256i one_epi16_v = _mm256_set1_epi16(1);
       __m256i one_epi8_v = _mm256_set1_epi8(1);
@@ -137,8 +138,10 @@ void PackAWithRowOffset<T, accT>::pack(const block_type_t& block) {
            ++j) {
         row_sum += smat_[i * ld_ + j];
       }
-      alignas(64) std::array<int32_t, 8> temp;
-      _mm256_store_si256(reinterpret_cast<__m256i*>(temp.data()), sum_v);
+      // alignas(64) std::array<int32_t, 8> temp;
+      alignas(64) std::int32_t temp[8];
+      //_mm256_store_si256(reinterpret_cast<__m256i*>(temp.data()), sum_v);
+      _mm256_store_si256(reinterpret_cast<__m256i*>(temp), sum_v);
       for (int k = 0; k < 8; ++k) {
         row_sum += temp[k];
       }
@@ -190,13 +193,13 @@ void PackAWithRowOffset<T, accT>::printPackedMatrix(std::string name) {
 
 template <typename T, typename accT>
 int PackAWithRowOffset<T, accT>::rowOffsetBufferSize() {
-  if(cpuinfo_initialize()){
+  if (cpuinfo_initialize()) {
     if (cpuinfo_has_x86_avx512f()) {
       return PackingTraits<T, accT, inst_set_t::avx512>::MCB;
     } else if (cpuinfo_has_x86_avx2()) {
       return PackingTraits<T, accT, inst_set_t::avx2>::MCB;
     } else {
-      //TODO: Have default slower path
+      // TODO: Have default slower path
       assert(0 && "unsupported architecture");
       return -1;
     }
diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc
index 10e581f..6bf2d65 100644
--- a/src/RefImplementations.cc
+++ b/src/RefImplementations.cc
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 
 using namespace std;
 
@@ -45,7 +46,7 @@ void requantize_u8acc32_ref(
       out[i * ld + j] = std::max(
           fuse_relu ? static_cast(C_zero_point) : 0l,
-          std::min(255l, rounded));
+          std::min(static_cast(255l), rounded));
     }
   }
 }
 
-- 
cgit v1.2.3
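
A note on the final RefImplementations.cc hunk: std::min is declared as template <class T> const T& min(const T&, const T&), so both arguments must deduce to the same type. Mixing the long literal 255l with a 64-bit value whose type is spelled differently (for example int64_t, which is long long under macOS/clang) makes deduction fail, and casting one side, as the hunk does, resolves it. The sketch below is illustrative only: it assumes the clamped value is an int64_t, and clamp_to_u8 with its 0/255 bounds merely mirrors the visible code; nothing else is taken from the patch.

#include <algorithm>
#include <cstdint>

// Clamp a requantized accumulator to the uint8_t output range.
// Without the explicit cast, std::min(255l, rounded) would compare a long
// against an int64_t; where those are distinct types (long vs. long long),
// template argument deduction for std::min fails to compile.
static std::uint8_t clamp_to_u8(std::int64_t rounded) {
  return static_cast<std::uint8_t>(
      std::max(static_cast<std::int64_t>(0l),
               std::min(static_cast<std::int64_t>(255l), rounded)));
}

For instance, clamp_to_u8(300) returns 255 and clamp_to_u8(-5) returns 0, matching the saturation behavior the reference requantization needs.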