Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/intgemm/intgemm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intgemm/multiply.h')
-rw-r--r--intgemm/multiply.h11
1 files changed, 8 insertions, 3 deletions
diff --git a/intgemm/multiply.h b/intgemm/multiply.h
index e201e09..84c0655 100644
--- a/intgemm/multiply.h
+++ b/intgemm/multiply.h
@@ -13,6 +13,7 @@ INTGEMM_SSE2 static inline dvector_t<CPUType::SSE2, int> PermuteSummer(__m128i p
return { pack0123, pack4567 };
}
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
// This instruction generates 1s 2s 3s 4s 5f 6f 7f 8f
__m256i rev = _mm256_permute2f128_si256(pack0123, pack4567, 0x21);
@@ -20,7 +21,7 @@ INTGEMM_AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4
__m256i blended = _mm256_blend_epi32(pack0123, pack4567, 0xf0);
return _mm256_add_epi32(rev, blended);
}
-
+#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
@@ -99,7 +100,9 @@ target inline Register Pack0123(Register sum0, Register sum1, Register sum2, Reg
} \
INTGEMM_PACK0123(INTGEMM_SSE2, __m128i)
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_PACK0123(INTGEMM_AVX2, __m256i)
+#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_PACK0123(INTGEMM_AVX512BW, __m512i)
@@ -111,10 +114,12 @@ INTGEMM_SSE2 static inline void RunCallback(Callback& callback_impl, dvector_t<C
callback_impl(total.second, callbacks::OutputBufferInfo(row_idx, col_idx + 4, rows, cols));
}
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template <typename Callback>
INTGEMM_AVX2 static inline void RunCallback(Callback& callback_impl, vector_t<CPUType::AVX2, int> total, Index row_idx, Index col_idx, Index rows, Index cols) {
callback_impl(total, callbacks::OutputBufferInfo(row_idx, col_idx, rows, cols));
}
+#endif
// 16-bit multiplier for INTGEMM_SSE2, INTGEMM_AVX2, and AVX512.
// C = A * B * unquant_mult
@@ -374,7 +379,7 @@ template <typename Callback> target static void Multiply(const int16_t *A, const
* 256-bit. We had to wait for INTGEMM_AVX2 to get 256-bit versions of vpsignb and
* vpmaddubsw. That's why this code is generic over 128-bit or 256-bit.
*/
-
+#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_AVX2 inline static void InnerINTGEMM_AVX2(
__m256i a, const __m256i *b,
__m256i &sum0, __m256i &sum1, __m256i &sum2, __m256i &sum3,
@@ -514,7 +519,7 @@ INTGEMM_AVX2 inline static void InnerINTGEMM_AVX2(
sum7 = adds_epi16(sum7, maddubs_epi16(a_positive, sign_epi8(b[7], a)));
#endif
}
-
+#endif
// For INTGEMM_SSSE3 without AVX
INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3(