diff options
Diffstat (limited to 'multiply.h')
-rw-r--r-- | multiply.h | 66 |
1 files changed, 46 insertions, 20 deletions
@@ -59,20 +59,38 @@ static inline INTGEMM_AVX512F float MaxFloat32(__m512 a) { #endif +// Quantize function used for SSSE3 and AVX2. +#define INTGEMM_QUANTIZE(target, Register, name) \ +target static void Quantize(const float *const input, int8_t *const output, float quant_mult, Index size) { \ + assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \ + assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \ + name::QuantizeTile8 q(quant_mult); \ + const std::size_t kBatch = sizeof(Register); \ + const std::size_t fast_end = size & ~(kBatch - 1); \ + _Pragma("omp parallel for") \ + for (std::size_t i = 0; i < fast_end; i += kBatch) { \ + *reinterpret_cast<Register*>(output + i) = q.Consecutive(input + i); \ + } \ + std::size_t overhang = size & (kBatch - 1); \ + if (!overhang) return; \ + /* Each does size(Register) / 32 == kBatch / 4 floats at a time. + * If we're allowed to read one of them, then we can read the whole register. */ \ + const float *inputs[4]; \ + std::size_t i; \ + for (i = 0; i < (overhang + (kBatch / 4) - 1) / (kBatch / 4); ++i) { \ + inputs[i] = &input[fast_end + i * (kBatch / 4)]; \ + } \ + /* These will be clipped off. */ \ + for (; i < 4; ++i) { \ + inputs[i] = &input[fast_end]; \ + } \ + Register result = q.Tile(inputs[0], inputs[1], inputs[2], inputs[3]); \ + std::memcpy(output, &result, overhang); \ +} + /* Take 4 registers with 32-bit values to be horizontally added. Reduce them * to one register with 32-bit values in the pattern 1 2 3 4 1 2 3 4, leaving * the final addition (which crosses 128-bit lanes) to the caller. -template <class Register> inline Register Pack0123(Register sum0, Register sum1, Register sum2, Register sum3) { - // 1 2 1 2 1 2 1 2 - Interleave32(sum0, sum1); - Register pack01 = add_epi32(sum0, sum1); - // 3 4 3 4 3 4 3 4 - Interleave32(sum2, sum3); - Register pack23 = add_epi32(sum2, sum3); - Interleave64(pack01, pack23); - // 1 2 3 4 1 2 3 4 - return add_epi32(pack01, pack23); -} */ #define INTGEMM_PACK0123(target, Register) \ target inline Register Pack0123(Register sum0, Register sum1, Register sum2, Register sum3) { \ @@ -562,20 +580,28 @@ INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3( } \ #define INTGEMM_MAXABSOLUTE(Register, target) \ -target static float MaxAbsolute(const float *begin_float, const float *end_float) { \ +target static inline float MaxAbsolute(const float *begin_float, const float *end_float) { \ assert(end_float > begin_float); \ - assert((end_float - begin_float) % (sizeof(Register) / sizeof(float)) == 0); \ + assert(reinterpret_cast<uintptr_t>(begin_float) % sizeof(Register) == 0); \ const Register *begin = reinterpret_cast<const Register*>(begin_float); \ - const Register *end = reinterpret_cast<const Register*>(end_float); \ - union {float f; int32_t i;} float_convert; \ - float_convert.i = 0x7fffffff; \ - Register and_me = set1_ps<Register>(float_convert.f); \ - Register highest = and_ps(and_me, *begin); \ - for (++begin; begin != end; ++begin) { \ + const float *end_reg = end_float - (reinterpret_cast<uintptr_t>(end_float) % sizeof(Register)) / sizeof(float); \ + const Register *end = reinterpret_cast<const Register*>(end_reg); \ + union {float f; int32_t i;} and_convert, float_convert; \ + and_convert.i = 0x7fffffff; \ + Register and_me = set1_ps<Register>(and_convert.f); \ + Register highest = setzero_ps<Register>(); \ + for (; begin < end; ++begin) { \ Register reg = and_ps(and_me, *begin); \ highest = max_ps(highest, reg); \ } \ - return MaxFloat32(highest); \ + float ret = MaxFloat32(highest); \ + /* Overhang: this would be more efficient if done in a single SIMD operation with some zeroing */ \ + for (const float *i = end_reg; i < end_float; ++i) { \ + float_convert.f = *i; \ + float_convert.i &= and_convert.i; \ + ret = std::max(ret, float_convert.f); \ + } \ + return ret; \ } \ } // namespace intgemm |