diff options
-rw-r--r-- | include/fbgemm/Fbgemm.h | 12 | ||||
-rw-r--r-- | include/fbgemm/FbgemmFP16.h | 2 | ||||
-rw-r--r-- | include/fbgemm/Types.h | 6 | ||||
-rw-r--r-- | src/FbgemmI8Depthwise2DAvx2-inl.h | 34 | ||||
-rw-r--r-- | src/FbgemmI8Depthwise3x3Avx2.cc | 2 | ||||
-rw-r--r-- | src/FbgemmI8DepthwiseAvx2-inl.h | 14 | ||||
-rw-r--r-- | src/FbgemmI8DepthwisePerChannelQuantAvx2.cc | 2 | ||||
-rw-r--r-- | src/PackBMatrix.cc | 48 | ||||
-rw-r--r-- | src/QuantUtilsAvx2.cc | 3 | ||||
-rw-r--r-- | src/TransposeUtilsAvx2.h | 3 |
10 files changed, 94 insertions, 32 deletions
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index c5f9563..8f9f6cd 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -413,6 +413,18 @@ class FBGEMM_API PackBMatrix final const BlockingFactors* params = nullptr); /** + * This constructor accepts pre-packed matrix as an input. + * And, it skips the actual packing procedure. + */ + PackBMatrix(matrix_op_t trans, + std::int32_t nRow, + std::int32_t nCol, + inpType* prepackedmat, + std::int32_t ld, + int groups = 1, + const BlockingFactors* params = nullptr); + + /** * Weight matrices are usually constant so worth pre-packing. */ bool isPrePacked() const { diff --git a/include/fbgemm/FbgemmFP16.h b/include/fbgemm/FbgemmFP16.h index ed56e31..7d1b031 100644 --- a/include/fbgemm/FbgemmFP16.h +++ b/include/fbgemm/FbgemmFP16.h @@ -223,7 +223,7 @@ class PackedGemmMatrixFP16 { } int matSize() const { - return size_; + return (int)size_; } int numRows() const { return nrow_; diff --git a/include/fbgemm/Types.h b/include/fbgemm/Types.h index 370a29a..fda5262 100644 --- a/include/fbgemm/Types.h +++ b/include/fbgemm/Types.h @@ -36,11 +36,11 @@ static inline float16 cpu_float2half_rn(float f) { // Get rid of +Inf/-Inf, +0/-0. if (u > 0x477fefff) { - ret = sign | 0x7c00U; + ret = (fbgemm::float16)(sign | 0x7c00U); return ret; } if (u < 0x33000001) { - ret = (sign | 0x0000); + ret = (fbgemm::float16)(sign | 0x0000); return ret; } @@ -70,7 +70,7 @@ static inline float16 cpu_float2half_rn(float f) { } } - ret = (sign | (exponent << 10) | mantissa); + ret = (fbgemm::float16)(sign | (exponent << 10) | mantissa); return ret; } diff --git a/src/FbgemmI8Depthwise2DAvx2-inl.h b/src/FbgemmI8Depthwise2DAvx2-inl.h index 7488a3c..a07dcd7 100644 --- a/src/FbgemmI8Depthwise2DAvx2-inl.h +++ b/src/FbgemmI8Depthwise2DAvx2-inl.h @@ -318,7 +318,7 @@ static ALWAYS_INLINE void inner_prod_2d_packed_( std::int32_t* C, int remainder, std::int32_t* row_offsets) { - if (S == 3) { + //if constexpr (S == 3) { inner_prod_3x3_packed_<SUM_A, REMAINDER, PER_CHANNEL_QUANTIZATION>( H, W, @@ -332,22 +332,22 @@ static ALWAYS_INLINE void inner_prod_2d_packed_( C, remainder, row_offsets); - } else { - assert(S == 5); - inner_prod_5x5_packed_<SUM_A, REMAINDER, PER_CHANNEL_QUANTIZATION>( - H, - W, - K, - h_in, - w_in, - A, - A_zero_point, - Bp, - B_zero_point, - C, - remainder, - row_offsets); - } + //} else { + // assert(S == 5); + // inner_prod_5x5_packed_<SUM_A, REMAINDER, PER_CHANNEL_QUANTIZATION>( + // H, + // W, + // K, + // h_in, + // w_in, + // A, + // A_zero_point, + // Bp, + // B_zero_point, + // C, + // remainder, + // row_offsets); + //} } template < diff --git a/src/FbgemmI8Depthwise3x3Avx2.cc b/src/FbgemmI8Depthwise3x3Avx2.cc index b2ad049..523af73 100644 --- a/src/FbgemmI8Depthwise3x3Avx2.cc +++ b/src/FbgemmI8Depthwise3x3Avx2.cc @@ -4,7 +4,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ -#define FBGEMM_EXPORTS +//#define FBGEMM_EXPORTS #include "fbgemm/FbgemmI8DepthwiseAvx2.h" #include <stdexcept> // for logic_error diff --git a/src/FbgemmI8DepthwiseAvx2-inl.h b/src/FbgemmI8DepthwiseAvx2-inl.h index 18dda3b..7128916 100644 --- a/src/FbgemmI8DepthwiseAvx2-inl.h +++ b/src/FbgemmI8DepthwiseAvx2-inl.h @@ -230,7 +230,7 @@ static ALWAYS_INLINE void inner_prod_packed_( __m256i a_sum_temp[2] = {0, 0}; int k = 0; - if (K >= 4) { + //if (K >= 4) { madd_epi16x4_packed<SUM_A>( a_v[0], a_v[1], @@ -261,12 +261,12 @@ static ALWAYS_INLINE void inner_prod_packed_( c[2] = _mm256_add_epi32(c[2], c_temp[2]); c[3] = _mm256_add_epi32(c[3], c_temp[3]); } - } else { - c[0] = _mm256_setzero_si256(); - c[1] = _mm256_setzero_si256(); - c[2] = _mm256_setzero_si256(); - c[3] = _mm256_setzero_si256(); - } + //} else { + // c[0] = _mm256_setzero_si256(); + // c[1] = _mm256_setzero_si256(); + // c[2] = _mm256_setzero_si256(); + // c[3] = _mm256_setzero_si256(); + //} if (K - k == 3) { madd_epi16x3_packed<SUM_A>( diff --git a/src/FbgemmI8DepthwisePerChannelQuantAvx2.cc b/src/FbgemmI8DepthwisePerChannelQuantAvx2.cc index ec26287..883558b 100644 --- a/src/FbgemmI8DepthwisePerChannelQuantAvx2.cc +++ b/src/FbgemmI8DepthwisePerChannelQuantAvx2.cc @@ -4,7 +4,7 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ -#define FBGEMM_EXPORTS +//#define FBGEMM_EXPORTS #include "fbgemm/FbgemmI8DepthwiseAvx2.h" #include <stdexcept> // for logic_error diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc index c271c4c..dc8ce5c 100644 --- a/src/PackBMatrix.cc +++ b/src/PackBMatrix.cc @@ -239,6 +239,54 @@ PackBMatrix<T, accT>::PackBMatrix( } template <typename T, typename accT> +PackBMatrix<T, accT>::PackBMatrix(matrix_op_t trans, + int32_t nRow, + int32_t nCol, + inpType* prepackedmat, + int32_t ld, + int groups, + const BlockingFactors* params) + : PackMatrix<PackBMatrix<T, accT>, T, accT>(nRow, nCol, prepackedmat, groups, params), + trans_(trans), + smat_(nullptr), + ld_(ld) { + if(!cpuinfo_initialize()) { + throw std::runtime_error("Failed to initialize cpuinfo!"); + } + if(params) { + if(fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) { + BaseType::brow_ = params->KCB; + BaseType::bcol_ = params->NCB; + row_interleave_ = params->ROW_INTERLEAVE; + } else { + // TODO: Have default slower path + assert(0 && "unsupported architecure"); + } + } else { + if(fbgemmHasAvx512Support()) { + BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; + BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::NCB; + row_interleave_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; + } else if(fbgemmHasAvx2Support()) { + BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; + BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::NCB; + row_interleave_ = PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE; + } else { + // Error + assert(0 && "unknown architecure"); + } + } + if(BaseType::numRows() % groups != 0) { + throw std::runtime_error("groups = " + std::to_string(groups) + + " does not divide numRows = " + std::to_string(BaseType::numRows())); + } + + // blocking for one group + block_type_t block{0, BaseType::numRows() / BaseType::numGroups(), 0, BaseType::numCols()}; + BaseType::packedBlock(block); +} + +template <typename T, typename accT> void PackBMatrix<T, accT>::pack_unpack_( const block_type_t& block, T* unpack_buf, diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc index 8e36855..a1e6858 100644 --- a/src/QuantUtilsAvx2.cc +++ b/src/QuantUtilsAvx2.cc @@ -163,7 +163,8 @@ void FindMinMax(const float* a, float* min, float* max, int len) { //////////////////////////////////////////////////////////////////////////////// // Requantization (with floats) -#ifdef __AVX2__ +//#ifdef __AVX2__ +#if 1 void RequantizeAvx2( const int32_t* src, uint8_t* dst, diff --git a/src/TransposeUtilsAvx2.h b/src/TransposeUtilsAvx2.h index a405745..9925eb6 100644 --- a/src/TransposeUtilsAvx2.h +++ b/src/TransposeUtilsAvx2.h @@ -13,7 +13,8 @@ namespace fbgemm { namespace internal { -#ifdef __AVX2__ +//#ifdef __AVX2__ +#if 1 // NOTE: Make sure every function defined in here has static linkage because // this header file is included by UtilsAvx512.cc compiled with -mavx512f option |