Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/google/ruy.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: T.J. Alumbaugh <talumbau@google.com> 2020-09-26 23:57:09 +0300
committer: Copybara-Service <copybara-worker@google.com> 2020-09-26 23:57:30 +0300
commit: 14569d28154b3f999cd8f4c29b8d6627874682bb (patch)
tree: a3f0c97e85ad822cbb049a60be6cc064bb6645fa
parent: fad2140a711369b19c2655d67f1588f97b583dec (diff)
Additional optimizations for AVX 8bit quantized kernel.
PiperOrigin-RevId: 333938400
-rw-r--r-- ruy/kernel_avx.cc 30
1 file changed, 24 insertions, 6 deletions
diff --git a/ruy/kernel_avx.cc b/ruy/kernel_avx.cc
index f6fc472..21e8826 100644
--- a/ruy/kernel_avx.cc
+++ b/ruy/kernel_avx.cc
@@ -655,12 +655,30 @@ void Kernel8bitAvxImpl(const KernelParams8bit<8, 8>& params) {
auto process_column = [=](__m256i& rhs_dup_lo, __m256i& rhs_dup_hi,
__m256i& accum) {
- accum = intrin_utils::mm256_add_epi32<path>(
- accum,
- intrin_utils::mm256_madd_epi16<path>(lhs_16_bit_low, rhs_dup_lo));
- accum = intrin_utils::mm256_add_epi32<path>(
- accum, intrin_utils::mm256_madd_epi16<path>(lhs_16_bit_high,
- rhs_dup_hi));
+ // Perform mul-adds on low and high components of accum separately.
+ __m128i accum_lo = _mm256_extractf128_si256(accum, 0);
+ __m128i accum_hi = _mm256_extractf128_si256(accum, 1);
+
+ __m128i lhs_lo_0 = _mm256_extractf128_si256(lhs_16_bit_low, 0);
+ __m128i lhs_lo_1 = _mm256_extractf128_si256(lhs_16_bit_low, 1);
+ __m128i rhs_dup_lo_0 = _mm256_extractf128_si256(rhs_dup_lo, 0);
+ __m128i rhs_dup_lo_1 = _mm256_extractf128_si256(rhs_dup_lo, 1);
+ __m128i lo_0 = _mm_madd_epi16(lhs_lo_0, rhs_dup_lo_0);
+ __m128i lo_1 = _mm_madd_epi16(lhs_lo_1, rhs_dup_lo_1);
+
+ accum_lo = _mm_add_epi32(accum_lo, lo_0);
+ accum_hi = _mm_add_epi32(accum_hi, lo_1);
+
+ __m128i lhs_hi_0 = _mm256_extractf128_si256(lhs_16_bit_high, 0);
+ __m128i lhs_hi_1 = _mm256_extractf128_si256(lhs_16_bit_high, 1);
+ __m128i rhs_dup_hi_0 = _mm256_extractf128_si256(rhs_dup_hi, 0);
+ __m128i rhs_dup_hi_1 = _mm256_extractf128_si256(rhs_dup_hi, 1);
+ __m128i hi_0 = _mm_madd_epi16(lhs_hi_0, rhs_dup_hi_0);
+ __m128i hi_1 = _mm_madd_epi16(lhs_hi_1, rhs_dup_hi_1);
+
+ accum_lo = _mm_add_epi32(accum_lo, hi_0);
+ accum_hi = _mm_add_epi32(accum_hi, hi_1);
+ accum = _mm256_set_m128i(accum_hi, accum_lo);
};
__m256i tmp0, tmp1, tmp2, tmp3;
__m128i lo0, lo1, hi0, hi1;