diff options
author | Kenneth Heafield <github@kheafield.com> | 2020-04-22 02:03:21 +0300 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2020-04-22 02:03:21 +0300 |
commit | 6cf421955cb88e887ad6e74b6f4ac048c7e3b563 (patch) | |
tree | e03cd9ea4a566c35770a2f2b4dfc3744758b0d71 | |
parent | be4e5b24fb5e4143420033f663c3910729551f77 (diff) | |
parent | 20954a69a895f20d5fa583dedeb41e657e615338 (diff) |
Merge remote-tracking branch 'origin/master' into static
-rw-r--r-- | avx2_gemm.h | 2 | ||||
-rw-r--r-- | avx512_gemm.h | 2 | ||||
-rw-r--r-- | avx512vnni_gemm.h | 57 | ||||
-rw-r--r-- | intgemm.cc | 2 | ||||
-rw-r--r-- | intgemm.h | 10 | ||||
-rw-r--r-- | intrinsics.h | 9 | ||||
-rw-r--r-- | multiply.h | 25 | ||||
-rw-r--r-- | sse2_gemm.h | 2 | ||||
-rw-r--r-- | test/quantize_test.cc | 98 |
9 files changed, 123 insertions, 84 deletions
diff --git a/avx2_gemm.h b/avx2_gemm.h index c1c4616..68eb37e 100644 --- a/avx2_gemm.h +++ b/avx2_gemm.h @@ -192,7 +192,7 @@ class QuantizeTile8 { // Technically only requires AVX INTGEMM_MAXABSOLUTE(__m256, INTGEMM_AVX2) -INTGEMM_GETQUANTIZERSTD(__m256, INTGEMM_AVX2) +INTGEMM_VECTORMEANSTD(__m256, INTGEMM_AVX2) } // namespace diff --git a/avx512_gemm.h b/avx512_gemm.h index c6a473e..a0087b3 100644 --- a/avx512_gemm.h +++ b/avx512_gemm.h @@ -159,7 +159,7 @@ class QuantizeTile8 { /* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */ INTGEMM_MAXABSOLUTE(__m512, INTGEMM_AVX512BW) -INTGEMM_GETQUANTIZERSTD(__m512, INTGEMM_AVX512BW) +INTGEMM_VECTORMEANSTD(__m512, INTGEMM_AVX512BW) } // namespace diff --git a/avx512vnni_gemm.h b/avx512vnni_gemm.h index 6eb3be4..22c5c4e 100644 --- a/avx512vnni_gemm.h +++ b/avx512vnni_gemm.h @@ -8,6 +8,15 @@ namespace intgemm { +// Workaround extra vmovdqa64 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94663 +INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) { +#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) + asm ("vpdpbusds %2, %1, %0" : "+x"(c) : "x"(a), "mx"(b)); +#else + c = _mm512_dpbusds_epi32(c, a, b); +#endif +} + struct AVX512VNNI_8bit : public AVX512_8bit { template <typename Callback> INTGEMM_AVX512VNNI static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { @@ -54,14 +63,14 @@ struct AVX512VNNI_8bit : public AVX512_8bit { b5 = _mm512_mask_sub_epi8(b5, neg_mask, zeros, b5); b6 = _mm512_mask_sub_epi8(b6, neg_mask, zeros, b6); b7 = _mm512_mask_sub_epi8(b7, neg_mask, zeros, b7); - sum0 = _mm512_dpbusds_epi32(sum0, a_positive, b0); - sum1 = _mm512_dpbusds_epi32(sum1, a_positive, b1); - sum2 = _mm512_dpbusds_epi32(sum2, a_positive, b2); - sum3 = _mm512_dpbusds_epi32(sum3, a_positive, b3); - sum4 = _mm512_dpbusds_epi32(sum4, a_positive, b4); - sum5 = _mm512_dpbusds_epi32(sum5, a_positive, b5); - sum6 = _mm512_dpbusds_epi32(sum6, a_positive, b6); - sum7 = _mm512_dpbusds_epi32(sum7, a_positive, b7); + VNNI8(sum0, a_positive, b0); + VNNI8(sum1, a_positive, b1); + VNNI8(sum2, a_positive, b2); + VNNI8(sum3, a_positive, b3); + VNNI8(sum4, a_positive, b4); + VNNI8(sum5, a_positive, b5); + VNNI8(sum6, a_positive, b6); + VNNI8(sum7, a_positive, b7); } Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); @@ -96,14 +105,14 @@ struct AVX512VNNI_8bit : public AVX512_8bit { for (; A_live != A_end; ++A_live, B_live += 8) { Register a = *A_live; //MultiplyAdd - sum0 = _mm512_dpbusds_epi32(sum0, a, *B_live); - sum1 = _mm512_dpbusds_epi32(sum1, a, *(B_live + 1)); - sum2 = _mm512_dpbusds_epi32(sum2, a, *(B_live + 2)); - sum3 = _mm512_dpbusds_epi32(sum3, a, *(B_live + 3)); - sum4 = _mm512_dpbusds_epi32(sum4, a, *(B_live + 4)); - sum5 = _mm512_dpbusds_epi32(sum5, a, *(B_live + 5)); - sum6 = _mm512_dpbusds_epi32(sum6, a, *(B_live + 6)); - sum7 = _mm512_dpbusds_epi32(sum7, a, *(B_live + 7)); + VNNI8(sum0, a, *B_live); + VNNI8(sum1, a, *(B_live + 1)); + VNNI8(sum2, a, *(B_live + 2)); + VNNI8(sum3, a, *(B_live + 3)); + VNNI8(sum4, a, *(B_live + 4)); + VNNI8(sum5, a, *(B_live + 5)); + VNNI8(sum6, a, *(B_live + 6)); + VNNI8(sum7, a, *(B_live + 7)); } Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); @@ -134,14 +143,14 @@ struct AVX512VNNI_8bit : public AVX512_8bit { Register sum0 = zeros, sum1 = zeros, sum2 = zeros, sum3 = zeros, sum4 = zeros, sum5 = zeros, sum6 = zeros, sum7 = zeros; for (; B_live != B_end; B_live += 8) { // Retrieve the conveniently consecutive values of B. - sum0 = _mm512_dpbusds_epi32(sum0, a, *B_live); - sum1 = _mm512_dpbusds_epi32(sum1, a, *(B_live + 1)); - sum2 = _mm512_dpbusds_epi32(sum2, a, *(B_live + 2)); - sum3 = _mm512_dpbusds_epi32(sum3, a, *(B_live + 3)); - sum4 = _mm512_dpbusds_epi32(sum4, a, *(B_live + 4)); - sum5 = _mm512_dpbusds_epi32(sum5, a, *(B_live + 5)); - sum6 = _mm512_dpbusds_epi32(sum6, a, *(B_live + 6)); - sum7 = _mm512_dpbusds_epi32(sum7, a, *(B_live + 7)); + VNNI8(sum0, a, *B_live); + VNNI8(sum1, a, *(B_live + 1)); + VNNI8(sum2, a, *(B_live + 2)); + VNNI8(sum3, a, *(B_live + 3)); + VNNI8(sum4, a, *(B_live + 4)); + VNNI8(sum5, a, *(B_live + 5)); + VNNI8(sum6, a, *(B_live + 6)); + VNNI8(sum7, a, *(B_live + 7)); } Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); @@ -40,7 +40,7 @@ const CPUType kCPU = ChooseCPU(CPUType::AVX512VNNI, CPUType::AVX512BW, CPUType:: float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(avx512f::MaxAbsolute, avx512f::MaxAbsolute, avx2::MaxAbsolute, sse2::MaxAbsolute, sse2::MaxAbsolute, Unsupported_MaxAbsolute); -MeanStd (*GetQuantizerStd)(const float *begin, const float *end) = ChooseCPU(avx512f::GetQuantizerStd, avx512f::GetQuantizerStd, avx2::GetQuantizerStd, sse2::GetQuantizerStd, sse2::GetQuantizerStd, sse2::GetQuantizerStd); +MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(avx512f::VectorMeanStd, avx512f::VectorMeanStd, avx2::VectorMeanStd, sse2::VectorMeanStd, sse2::VectorMeanStd, sse2::VectorMeanStd); constexpr const char *const Unsupported_16bit::kName; constexpr const char *const Unsupported_8bit::kName; @@ -128,7 +128,7 @@ namespace avx512f { static inline float MaxAbsolute(const float * /*begin*/, const float * /*end*/) { throw UnsupportedCPU(); } -static inline MeanStd MaxAbsolute(const float * /*begin*/, const float * /*end*/) { +static inline MeanStd EuclideanNorm(const float * /*begin*/, const float * /*end*/, bool) { throw UnsupportedCPU(); } } //namespace @@ -420,7 +420,13 @@ extern const CPUType kCPU; extern float (*MaxAbsolute)(const float *begin, const float *end); // Get a Quantization value that is equant to the mean of the data +N standard deviations. Use 2 by default -extern MeanStd (*GetQuantizerStd)(const float *begin, const float *end); +extern MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool); + +/* Returns the Mean and the Standard deviation of a vector. + * If "absolute" is set to true, it computes the mean and the standard deviation of the absolute values of the vector */ +static inline MeanStd GetVectorMeanStd(const float * begin, const float * end, bool absolute=false) { + return VectorMeanStd(begin, end, absolute); +} } // namespace intgemm diff --git a/intrinsics.h b/intrinsics.h index 98b0961..f733457 100644 --- a/intrinsics.h +++ b/intrinsics.h @@ -57,6 +57,9 @@ INTGEMM_SSE2 static inline __m128 add_ps(__m128 a, __m128 b) { INTGEMM_SSE2 static inline __m128 and_ps(__m128 first, __m128 second) { return _mm_and_ps(first, second); } +INTGEMM_SSE2 static inline __m128 andnot_ps(__m128 a, __m128 b) { + return _mm_andnot_ps(a, b); +} INTGEMM_SSE2 static inline __m128i and_si(__m128i a, __m128i b) { return _mm_and_si128(a, b); } @@ -234,6 +237,9 @@ INTGEMM_AVX2 static inline __m256 add_ps(__m256 a, __m256 b) { INTGEMM_AVX2 static inline __m256 and_ps(__m256 first, __m256 second) { return _mm256_and_ps(first, second); } +INTGEMM_AVX2 static inline __m256 andnot_ps(__m256 a, __m256 b) { + return _mm256_andnot_ps(a, b); +} INTGEMM_AVX2 static inline __m256i and_si(__m256i a, __m256i b) { return _mm256_and_si256(a, b); } @@ -414,6 +420,9 @@ INTGEMM_AVX512BW static inline __m512 add_ps(__m512 a, __m512 b) { INTGEMM_AVX512DQ static inline __m512 and_ps(__m512 first, __m512 second) { return _mm512_and_ps(first, second); } +INTGEMM_AVX512DQ static inline __m512 andnot_ps(__m512 a, __m512 b) { + return _mm512_andnot_ps(a, b); +} INTGEMM_AVX512BW static inline __m512i and_si(__m512i a, __m512i b) { return _mm512_and_si512(a, b); } @@ -635,7 +635,7 @@ template <class Callback, class Backend, class Integer = typename Backend::Integ #pragma omp parallel Backend::template Multiply<Callback>(A, B, A_rows, width, B_cols, callback); } -template <class Callback, class Backend> static inline void OMPParallelWrap8Shift(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { +template <class Callback, class Backend> static inline void OMPParallelWrap8Shift(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { #pragma omp parallel Backend::template Multiply8Shift<Callback>(A, B, A_rows, width, B_cols, callback); } @@ -665,9 +665,9 @@ target static inline float MaxAbsolute(const float *begin_float, const float *en return ret; \ } \ -#define INTGEMM_GETQUANTIZERSTD(Register, target) \ -target static inline MeanStd GetQuantizerStd(const float *begin_float, const float *end_float) { \ - /* Finds a quantizer value that is a certain number of standard deviations of the mean */ \ +#define INTGEMM_VECTORMEANSTD(Register, target) \ +target static inline MeanStd VectorMeanStd(const float *begin_float, const float *end_float, bool absolute) { \ + /* Computes the euclidean norm and returns the mean and the standard deviation. Optionally it can be the mean and standard deviation in absolute terms. */ \ assert(end_float > begin_float); \ assert((end_float - begin_float) % (sizeof(Register) / sizeof(float)) == 0); \ size_t num_items = end_float - begin_float; \ @@ -675,9 +675,20 @@ target static inline MeanStd GetQuantizerStd(const float *begin_float, const flo const Register *end = reinterpret_cast<const Register*>(end_float); \ Register squares = set1_ps<Register>(0); \ Register sums = set1_ps<Register>(0); \ - for (; begin != end; begin++) { \ - squares = add_ps(squares, mul_ps(*begin, *begin)); \ - sums = add_ps(sums, *begin); \ + if (absolute) { \ + const Register mask = set1_ps<Register>(-0.f); \ + for (; begin != end; begin++) { \ + Register vec = *begin; \ + vec = andnot_ps(mask, vec); \ + squares = add_ps(squares, mul_ps(vec, vec)); \ + sums = add_ps(sums, vec); \ + } \ + } else { \ + for (; begin != end; begin++) { \ + Register vec = *begin; \ + squares = add_ps(squares, mul_ps(vec, vec)); \ + sums = add_ps(sums, vec); \ + } \ } \ float squares_sum = horizontalSum(squares); \ float normal_sums = horizontalSum(sums); \ diff --git a/sse2_gemm.h b/sse2_gemm.h index 91221d9..8b8f1c2 100644 --- a/sse2_gemm.h +++ b/sse2_gemm.h @@ -53,7 +53,7 @@ class QuantizeTile16 { INTGEMM_MAXABSOLUTE(__m128, INTGEMM_SSE2) -INTGEMM_GETQUANTIZERSTD(__m128, INTGEMM_SSE2) +INTGEMM_VECTORMEANSTD(__m128, INTGEMM_SSE2) } //namespace // This should be pure INTGEMM_SSE2 (and below). diff --git a/test/quantize_test.cc b/test/quantize_test.cc index 4e3d424..d2046f6 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -30,10 +30,14 @@ void QuantizeRef(const float *input, int8_t *output, float quant_mult, std::size } } -MeanStd QuantizerStddRef(AlignedVector<float>& vals, int num_items) { +MeanStd VectorMeanStd(AlignedVector<float>& vals, int num_items, bool absolute) { float normal_sums = 0; float squares_sum = 0; - std::for_each(vals.begin(), vals.end(), [&] (float n) {normal_sums+=n;}); + if (absolute) { + std::for_each(vals.begin(), vals.end(), [&] (float n) {normal_sums+=abs(n);}); + } else { + std::for_each(vals.begin(), vals.end(), [&] (float n) {normal_sums+=n;}); + } std::for_each(vals.begin(), vals.end(), [&] (float n) {squares_sum+=n*n;}); MeanStd ret; @@ -42,8 +46,8 @@ MeanStd QuantizerStddRef(AlignedVector<float>& vals, int num_items) { return ret; } -template <MeanStd (*Backend) (const float *, const float *)> -void testQuantizerStd(int num_items) { +template <MeanStd (*Backend) (const float *, const float *, bool)> +void testVectorMeanStd(int num_items, bool absolute=false) { std::mt19937 gen; std::uniform_real_distribution<float> dist(-1.0f, 1.0f); AlignedVector<float> inputVec(num_items); @@ -52,15 +56,15 @@ void testQuantizerStd(int num_items) { it = dist(gen); } - MeanStd reference = QuantizerStddRef(inputVec, num_items); - MeanStd fast = Backend(inputVec.begin(), inputVec.end()); + MeanStd reference = VectorMeanStd(inputVec, num_items, absolute); + MeanStd fast = Backend(inputVec.begin(), inputVec.end(), absolute); float meanDifference = fabs(reference.mean - fast.mean); float stdDifference = fabs(reference.stddev - fast.stddev); float eps = 0.00002; //Accumulating horizontal sums can lead to errors. - CHECK_MESSAGE(meanDifference <= eps, "Reference mean: " << reference.mean << " actual: " << fast.mean);// /*Backend::kName << */" Mismatch:\n" << "Reference: " << reference << " Fast: " << fast << std::endl); - CHECK_MESSAGE(stdDifference <= eps, "Reference stddev: " << reference.stddev << " actual: " << fast.stddev); + CHECK_MESSAGE(meanDifference <= eps, "Items: " << num_items << " Absolute: " << absolute << " Reference mean: " << reference.mean << " actual: " << fast.mean); + CHECK_MESSAGE(stdDifference <= eps, "Items: " << num_items << " Absolute: " << absolute << " Reference mean: " << reference.stddev << " actual: " << fast.stddev); } @@ -128,53 +132,53 @@ TEST_CASE ("Quantize AVX2", "[quantize]") { } #endif -TEST_CASE("QuantizeStd SSSE3", "[quantizerSTD]") { +TEST_CASE("QuantizeStd SSSE3", "[VectorMeanStd]") { if (kCPU < CPUType::SSSE3) return; - testQuantizerStd<sse2::GetQuantizerStd>(64); - testQuantizerStd<sse2::GetQuantizerStd>(64); - testQuantizerStd<sse2::GetQuantizerStd>(256); - testQuantizerStd<sse2::GetQuantizerStd>(256); - testQuantizerStd<sse2::GetQuantizerStd>(2048); - testQuantizerStd<sse2::GetQuantizerStd>(2048); - testQuantizerStd<sse2::GetQuantizerStd>(65536); - testQuantizerStd<sse2::GetQuantizerStd>(65536); - testQuantizerStd<sse2::GetQuantizerStd>(81920); - testQuantizerStd<sse2::GetQuantizerStd>(81920); - testQuantizerStd<sse2::GetQuantizerStd>(120832); - testQuantizerStd<sse2::GetQuantizerStd>(120832); + testVectorMeanStd<sse2::VectorMeanStd>(64); + testVectorMeanStd<sse2::VectorMeanStd>(64, true); + testVectorMeanStd<sse2::VectorMeanStd>(256); + testVectorMeanStd<sse2::VectorMeanStd>(256, true); + testVectorMeanStd<sse2::VectorMeanStd>(2048); + testVectorMeanStd<sse2::VectorMeanStd>(2048, true); + testVectorMeanStd<sse2::VectorMeanStd>(65536); + testVectorMeanStd<sse2::VectorMeanStd>(65536, true); + testVectorMeanStd<sse2::VectorMeanStd>(81920); + testVectorMeanStd<sse2::VectorMeanStd>(81920, true); + testVectorMeanStd<sse2::VectorMeanStd>(120832); + testVectorMeanStd<sse2::VectorMeanStd>(120832, true); } -TEST_CASE("QuantizeStd AVX2", "[quantizerSTD]") { +TEST_CASE("QuantizeStd AVX2", "[VectorMeanStd]") { if (kCPU < CPUType::AVX2) return; - testQuantizerStd<avx2::GetQuantizerStd>(64); - testQuantizerStd<avx2::GetQuantizerStd>(64); - testQuantizerStd<avx2::GetQuantizerStd>(256); - testQuantizerStd<avx2::GetQuantizerStd>(256); - testQuantizerStd<avx2::GetQuantizerStd>(2048); - testQuantizerStd<avx2::GetQuantizerStd>(2048); - testQuantizerStd<avx2::GetQuantizerStd>(65536); - testQuantizerStd<avx2::GetQuantizerStd>(65536); - testQuantizerStd<avx2::GetQuantizerStd>(81920); - testQuantizerStd<avx2::GetQuantizerStd>(81920); - testQuantizerStd<avx2::GetQuantizerStd>(120832); - testQuantizerStd<avx2::GetQuantizerStd>(120832); + testVectorMeanStd<avx2::VectorMeanStd>(64); + testVectorMeanStd<avx2::VectorMeanStd>(64, true); + testVectorMeanStd<avx2::VectorMeanStd>(256); + testVectorMeanStd<avx2::VectorMeanStd>(256, true); + testVectorMeanStd<avx2::VectorMeanStd>(2048); + testVectorMeanStd<avx2::VectorMeanStd>(2048, true); + testVectorMeanStd<avx2::VectorMeanStd>(65536); + testVectorMeanStd<avx2::VectorMeanStd>(65536, true); + testVectorMeanStd<avx2::VectorMeanStd>(81920); + testVectorMeanStd<avx2::VectorMeanStd>(81920, true); + testVectorMeanStd<avx2::VectorMeanStd>(120832); + testVectorMeanStd<avx2::VectorMeanStd>(120832, true); } #ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW -TEST_CASE("QuantizeStd AVX512", "[quantizerSTD]") { +TEST_CASE("QuantizeStd AVX512", "[VectorMeanStd]") { if (kCPU < CPUType::AVX512BW) return; - testQuantizerStd<avx512f::GetQuantizerStd>(64); - testQuantizerStd<avx512f::GetQuantizerStd>(64); - testQuantizerStd<avx512f::GetQuantizerStd>(256); - testQuantizerStd<avx512f::GetQuantizerStd>(256); - testQuantizerStd<avx512f::GetQuantizerStd>(2048); - testQuantizerStd<avx512f::GetQuantizerStd>(2048); - testQuantizerStd<avx512f::GetQuantizerStd>(65536); - testQuantizerStd<avx512f::GetQuantizerStd>(65536); - testQuantizerStd<avx512f::GetQuantizerStd>(81920); - testQuantizerStd<avx512f::GetQuantizerStd>(81920); - testQuantizerStd<avx512f::GetQuantizerStd>(120832); - testQuantizerStd<avx512f::GetQuantizerStd>(120832); + testVectorMeanStd<avx512f::VectorMeanStd>(64); + testVectorMeanStd<avx512f::VectorMeanStd>(64, true); + testVectorMeanStd<avx512f::VectorMeanStd>(256); + testVectorMeanStd<avx512f::VectorMeanStd>(256, true); + testVectorMeanStd<avx512f::VectorMeanStd>(2048); + testVectorMeanStd<avx512f::VectorMeanStd>(2048, true); + testVectorMeanStd<avx512f::VectorMeanStd>(65536); + testVectorMeanStd<avx512f::VectorMeanStd>(65536, true); + testVectorMeanStd<avx512f::VectorMeanStd>(81920); + testVectorMeanStd<avx512f::VectorMeanStd>(81920, true); + testVectorMeanStd<avx512f::VectorMeanStd>(120832); + testVectorMeanStd<avx512f::VectorMeanStd>(120832, true); } #endif |