Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/google/ruy.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBenoit Jacob <benoitjacob@google.com>2020-07-21 21:43:09 +0300
committerCopybara-Service <copybara-worker@google.com>2020-07-21 21:43:31 +0300
commit550655fc32d1ad7dea6e1555099696ff6e767aab (patch)
tree102720a15067d89d1b4d0658ff269ef0712f023c /ruy/kernel_avx512.cc
parentec99c704a19d38ea502e81c0a9f5b82026471cef (diff)
Use lambdas to shorten Kernel8bitAvx512's source code, and to split the resulting non-opt binary code into smaller functions. This makes no difference in opt builds, but for non-opt builds this reduces the stack frame of this function from 60k down to 24k. This avoids stack overflows in some toolchains.
PiperOrigin-RevId: 322406964
Diffstat (limited to 'ruy/kernel_avx512.cc')
-rw-r--r--ruy/kernel_avx512.cc683
1 files changed, 54 insertions, 629 deletions
diff --git a/ruy/kernel_avx512.cc b/ruy/kernel_avx512.cc
index 72506f7..3d36516 100644
--- a/ruy/kernel_avx512.cc
+++ b/ruy/kernel_avx512.cc
@@ -358,246 +358,33 @@ void Kernel8bitAvx512(const KernelParams8bit<16, 16>& params) {
const __m512i lhs_16_bit_high = _mm512_cvtepi8_epi16(
_mm512_cvtepi32_epi16(_mm512_srli_epi32(lhs_data, 16)));
- // Process column 0.
- {
- __m512i accum_v = accum_data_v0;
- constexpr int index = 0;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_v0 = accum_v;
- }
- // Process column 1.
- {
- __m512i accum_v = accum_data_v1;
- constexpr int index = 2;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_v1 = accum_v;
- }
- // Process column 2.
- {
- __m512i accum_v = accum_data_v2;
- constexpr int index = 4;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_v2 = accum_v;
- }
- // Process column 3.
- {
- __m512i accum_v = accum_data_v3;
- constexpr int index = 6;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_v3 = accum_v;
- }
- // Process column 4.
- {
- __m512i accum_v = accum_data_v4;
- constexpr int index = 8;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_v4 = accum_v;
- }
- // Process column 5.
- {
- __m512i accum_v = accum_data_v5;
- constexpr int index = 10;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_v5 = accum_v;
- }
- // Process column 6.
- {
- __m512i accum_v = accum_data_v6;
- constexpr int index = 12;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_v6 = accum_v;
- }
- // Process column 7.
- {
- __m512i accum_v = accum_data_v7;
- constexpr int index = 14;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_v7 = accum_v;
- }
- // Process column 8.
- {
- __m512i accum_v = accum_data_v8;
- constexpr int index = 16;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_v8 = accum_v;
- }
- // Process column 9.
- {
- __m512i accum_v = accum_data_v9;
- constexpr int index = 18;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_v9 = accum_v;
- }
- // Process column 10.
- {
- __m512i accum_v = accum_data_va;
- constexpr int index = 20;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_va = accum_v;
- }
- // Process column 11.
- {
- __m512i accum_v = accum_data_vb;
- constexpr int index = 22;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_vb = accum_v;
- }
- // Process column 12.
- {
- __m512i accum_v = accum_data_vc;
- constexpr int index = 24;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
+ auto process_column = [=](int col, __m512i& accum) {
+ const __m512i rhs_16_bit_dup_low =
+ _mm512_set1_epi32(rhs_data[2 * col]);
const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_vc = accum_v;
- }
- // Process column 13.
- {
- __m512i accum_v = accum_data_vd;
- constexpr int index = 26;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_vd = accum_v;
- }
- // Process column 14.
- {
- __m512i accum_v = accum_data_ve;
- constexpr int index = 28;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_ve = accum_v;
- }
- // Process column 15.
- {
- __m512i accum_v = accum_data_vf;
- constexpr int index = 30;
-
- const __m512i rhs_16_bit_dup_low = _mm512_set1_epi32(rhs_data[index]);
- const __m512i rhs_16_bit_dup_high =
- _mm512_set1_epi32(rhs_data[index + 1]);
-
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
- accum_v = _mm512_add_epi32(
- accum_v, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
- accum_data_vf = accum_v;
- }
+ _mm512_set1_epi32(rhs_data[2 * col + 1]);
+
+ accum = _mm512_add_epi32(
+ accum, _mm512_madd_epi16(lhs_16_bit_low, rhs_16_bit_dup_low));
+ accum = _mm512_add_epi32(
+ accum, _mm512_madd_epi16(lhs_16_bit_high, rhs_16_bit_dup_high));
+ };
+ process_column(0, accum_data_v0);
+ process_column(1, accum_data_v1);
+ process_column(2, accum_data_v2);
+ process_column(3, accum_data_v3);
+ process_column(4, accum_data_v4);
+ process_column(5, accum_data_v5);
+ process_column(6, accum_data_v6);
+ process_column(7, accum_data_v7);
+ process_column(8, accum_data_v8);
+ process_column(9, accum_data_v9);
+ process_column(10, accum_data_va);
+ process_column(11, accum_data_vb);
+ process_column(12, accum_data_vc);
+ process_column(13, accum_data_vd);
+ process_column(14, accum_data_ve);
+ process_column(15, accum_data_vf);
lhs_ptr += 16 * 4;
rhs_ptr += 16 * 4;
@@ -658,18 +445,15 @@ void Kernel8bitAvx512(const KernelParams8bit<16, 16>& params) {
&accum_data_vc, &accum_data_vd, &accum_data_ve, &accum_data_vf);
}
- // Shift and round column 0.
- {
- accum_data_v0 = _mm512_sllv_epi32(accum_data_v0, left_shift);
+ auto apply_multiplier = [=](__m512i& accum) {
+ accum = _mm512_sllv_epi32(accum, left_shift);
// Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v0, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v0, 1)),
- m_64bit_high);
+ __m512i scaled_v_low = _mm512_mul_epi32(
+ _mm512_cvtepi32_epi64(_mm512_extracti32x8_epi32(accum, 0)),
+ m_64bit_low);
+ __m512i scaled_v_high = _mm512_mul_epi32(
+ _mm512_cvtepi32_epi64(_mm512_extracti32x8_epi32(accum, 1)),
+ m_64bit_high);
scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
@@ -678,386 +462,27 @@ void Kernel8bitAvx512(const KernelParams8bit<16, 16>& params) {
scaled_v_high =
_mm512_srav_epi64(scaled_v_high, final_right_shift_high);
- accum_data_v0 =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_v0 = _mm512_inserti32x8(
- accum_data_v0, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 1.
- {
- accum_data_v1 = _mm512_sllv_epi32(accum_data_v1, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v1, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v1, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
-
- accum_data_v1 =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_v1 = _mm512_inserti32x8(
- accum_data_v1, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 2.
- {
- accum_data_v2 = _mm512_sllv_epi32(accum_data_v2, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v2, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v2, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
-
- accum_data_v2 =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_v2 = _mm512_inserti32x8(
- accum_data_v2, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 3.
- {
- accum_data_v3 = _mm512_sllv_epi32(accum_data_v3, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v3, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v3, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
-
- accum_data_v3 =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_v3 = _mm512_inserti32x8(
- accum_data_v3, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 4.
- {
- accum_data_v4 = _mm512_sllv_epi32(accum_data_v4, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v4, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v4, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
+ accum = _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
+ accum = _mm512_inserti32x8(accum,
+ _mm512_cvtepi64_epi32(scaled_v_high), 1);
+ };
+ apply_multiplier(accum_data_v0);
+ apply_multiplier(accum_data_v1);
+ apply_multiplier(accum_data_v2);
+ apply_multiplier(accum_data_v3);
+ apply_multiplier(accum_data_v4);
+ apply_multiplier(accum_data_v5);
+ apply_multiplier(accum_data_v6);
+ apply_multiplier(accum_data_v7);
+ apply_multiplier(accum_data_v8);
+ apply_multiplier(accum_data_v9);
+ apply_multiplier(accum_data_va);
+ apply_multiplier(accum_data_vb);
+ apply_multiplier(accum_data_vc);
+ apply_multiplier(accum_data_vd);
+ apply_multiplier(accum_data_ve);
+ apply_multiplier(accum_data_vf);
- accum_data_v4 =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_v4 = _mm512_inserti32x8(
- accum_data_v4, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 5.
- {
- accum_data_v5 = _mm512_sllv_epi32(accum_data_v5, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v5, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v5, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
-
- accum_data_v5 =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_v5 = _mm512_inserti32x8(
- accum_data_v5, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 6.
- {
- accum_data_v6 = _mm512_sllv_epi32(accum_data_v6, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v6, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v6, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
-
- accum_data_v6 =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_v6 = _mm512_inserti32x8(
- accum_data_v6, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 7.
- {
- accum_data_v7 = _mm512_sllv_epi32(accum_data_v7, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v7, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v7, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
-
- accum_data_v7 =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_v7 = _mm512_inserti32x8(
- accum_data_v7, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 8.
- {
- accum_data_v8 = _mm512_sllv_epi32(accum_data_v8, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v8, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v8, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
-
- accum_data_v8 =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_v8 = _mm512_inserti32x8(
- accum_data_v8, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 9.
- {
- accum_data_v9 = _mm512_sllv_epi32(accum_data_v9, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v9, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_v9, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
-
- accum_data_v9 =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_v9 = _mm512_inserti32x8(
- accum_data_v9, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 10.
- {
- accum_data_va = _mm512_sllv_epi32(accum_data_va, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_va, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_va, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
-
- accum_data_va =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_va = _mm512_inserti32x8(
- accum_data_va, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 11.
- {
- accum_data_vb = _mm512_sllv_epi32(accum_data_vb, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_vb, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_vb, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
-
- accum_data_vb =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_vb = _mm512_inserti32x8(
- accum_data_vb, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 12.
- {
- accum_data_vc = _mm512_sllv_epi32(accum_data_vc, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_vc, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_vc, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
-
- accum_data_vc =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_vc = _mm512_inserti32x8(
- accum_data_vc, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 13.
- {
- accum_data_vd = _mm512_sllv_epi32(accum_data_vd, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_vd, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_vd, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
-
- accum_data_vd =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_vd = _mm512_inserti32x8(
- accum_data_vd, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 14.
- {
- accum_data_ve = _mm512_sllv_epi32(accum_data_ve, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_ve, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_ve, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
-
- accum_data_ve =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_ve = _mm512_inserti32x8(
- accum_data_ve, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
- // Shift and round column 15.
- {
- accum_data_vf = _mm512_sllv_epi32(accum_data_vf, left_shift);
- // Apply the fixed-point part of the multiplier.
- __m512i scaled_v_low =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_vf, 0)),
- m_64bit_low);
- __m512i scaled_v_high =
- _mm512_mul_epi32(_mm512_cvtepi32_epi64(
- _mm512_extracti32x8_epi32(accum_data_vf, 1)),
- m_64bit_high);
-
- scaled_v_low = _mm512_add_epi64(scaled_v_low, offset_vector_low);
- scaled_v_high = _mm512_add_epi64(scaled_v_high, offset_vector_high);
-
- scaled_v_low = _mm512_srav_epi64(scaled_v_low, final_right_shift_low);
- scaled_v_high =
- _mm512_srav_epi64(scaled_v_high, final_right_shift_high);
-
- accum_data_vf =
- _mm512_castsi256_si512(_mm512_cvtepi64_epi32(scaled_v_low));
- accum_data_vf = _mm512_inserti32x8(
- accum_data_vf, _mm512_cvtepi64_epi32(scaled_v_high), 1);
- }
if (transpose_around_multiplier) {
// See above comment: here we transpose again to undo the
// transposition of the 16x16 block of accumulators used to implement