Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/intgemm/intgemm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'multiply.h')
-rw-r--r--multiply.h66
1 files changed, 46 insertions, 20 deletions
diff --git a/multiply.h b/multiply.h
index 0aa86aa..e2aa0fb 100644
--- a/multiply.h
+++ b/multiply.h
@@ -59,20 +59,38 @@ static inline INTGEMM_AVX512F float MaxFloat32(__m512 a) {
#endif
+// Quantize function used for SSSE3 and AVX2.
+#define INTGEMM_QUANTIZE(target, Register, name) \
+target static void Quantize(const float *const input, int8_t *const output, float quant_mult, Index size) { \
+ assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
+ assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
+ name::QuantizeTile8 q(quant_mult); \
+ const std::size_t kBatch = sizeof(Register); \
+ const std::size_t fast_end = size & ~(kBatch - 1); \
+ _Pragma("omp parallel for") \
+ for (std::size_t i = 0; i < fast_end; i += kBatch) { \
+ *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \
+ } \
+ std::size_t overhang = size & (kBatch - 1); \
+ if (!overhang) return; \
+ /* Each does size(Register) / 32 == kBatch / 4 floats at a time.
+ * If we're allowed to read one of them, then we can read the whole register. */ \
+ const float *inputs[4]; \
+ std::size_t i; \
+ for (i = 0; i < (overhang + (kBatch / 4) - 1) / (kBatch / 4); ++i) { \
+ inputs[i] = &input[fast_end + i * (kBatch / 4)]; \
+ } \
+ /* These will be clipped off. */ \
+ for (; i < 4; ++i) { \
+ inputs[i] = &input[fast_end]; \
+ } \
+ Register result = q.Tile(inputs[0], inputs[1], inputs[2], inputs[3]); \
+ std::memcpy(output, &result, overhang); \
+}
+
/* Take 4 registers with 32-bit values to be horizontally added. Reduce them
* to one register with 32-bit values in the pattern 1 2 3 4 1 2 3 4, leaving
* the final addition (which crosses 128-bit lanes) to the caller.
-template <class Register> inline Register Pack0123(Register sum0, Register sum1, Register sum2, Register sum3) {
- // 1 2 1 2 1 2 1 2
- Interleave32(sum0, sum1);
- Register pack01 = add_epi32(sum0, sum1);
- // 3 4 3 4 3 4 3 4
- Interleave32(sum2, sum3);
- Register pack23 = add_epi32(sum2, sum3);
- Interleave64(pack01, pack23);
- // 1 2 3 4 1 2 3 4
- return add_epi32(pack01, pack23);
-}
*/
#define INTGEMM_PACK0123(target, Register) \
target inline Register Pack0123(Register sum0, Register sum1, Register sum2, Register sum3) { \
@@ -562,20 +580,28 @@ INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3(
} \
#define INTGEMM_MAXABSOLUTE(Register, target) \
-target static float MaxAbsolute(const float *begin_float, const float *end_float) { \
+target static inline float MaxAbsolute(const float *begin_float, const float *end_float) { \
assert(end_float > begin_float); \
- assert((end_float - begin_float) % (sizeof(Register) / sizeof(float)) == 0); \
+ assert(reinterpret_cast<uintptr_t>(begin_float) % sizeof(Register) == 0); \
const Register *begin = reinterpret_cast<const Register*>(begin_float); \
- const Register *end = reinterpret_cast<const Register*>(end_float); \
- union {float f; int32_t i;} float_convert; \
- float_convert.i = 0x7fffffff; \
- Register and_me = set1_ps<Register>(float_convert.f); \
- Register highest = and_ps(and_me, *begin); \
- for (++begin; begin != end; ++begin) { \
+ const float *end_reg = end_float - (reinterpret_cast<uintptr_t>(end_float) % sizeof(Register)) / sizeof(float); \
+ const Register *end = reinterpret_cast<const Register*>(end_reg); \
+ union {float f; int32_t i;} and_convert, float_convert; \
+ and_convert.i = 0x7fffffff; \
+ Register and_me = set1_ps<Register>(and_convert.f); \
+ Register highest = setzero_ps<Register>(); \
+ for (; begin < end; ++begin) { \
Register reg = and_ps(and_me, *begin); \
highest = max_ps(highest, reg); \
} \
- return MaxFloat32(highest); \
+ float ret = MaxFloat32(highest); \
+ /* Overhang: this would be more efficient if done in a single SIMD operation with some zeroing */ \
+ for (const float *i = end_reg; i < end_float; ++i) { \
+ float_convert.f = *i; \
+ float_convert.i &= and_convert.i; \
+ ret = std::max(ret, float_convert.f); \
+ } \
+ return ret; \
} \
} // namespace intgemm