From a3a31149281df195e7a7a316f95845b9ef8e1b34 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 19 Apr 2020 13:30:26 +0100 Subject: Sum16To32 using variadic templates --- tile/multiply.inl | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/tile/multiply.inl b/tile/multiply.inl index 3bec91f..f1344d7 100644 --- a/tile/multiply.inl +++ b/tile/multiply.inl @@ -19,16 +19,10 @@ namespace intgemm { namespace INTGEMM_ARCH { // Upcast 16 to 32 if needed. -template struct SumTo32Body; -template <> struct SumTo32Body { - template INTGEMM_TARGET static inline void body(Register *regs) { - Register &reg = regs[Iterator::template I<0>()]; - reg = madd_epi16(reg, set1_epi16(1)); - } -}; -template <> struct SumTo32Body { - template INTGEMM_TARGET static inline void body(Register *) {} -}; +template INTGEMM_TARGET static inline void Sum16To32(Register *regs, int16_t, index_sequence) { + unordered_unfurl((regs[i] = madd_epi16(regs[i], set1_epi16(1)))...); +} +template INTGEMM_TARGET static inline void Sum16To32(Register *, int32_t, index_sequence) {} /* Multiply assuming the matrix sizes are a multiple of the kernel size. */ template INTGEMM_TARGET __attribute__((flatten)) static inline void MultiplyNoOverhang(AccessT access, const Tile shape) { @@ -54,8 +48,7 @@ template INTGEMM_TARGET __attribute__((flatten)) s Kernel::Run(reg_access.AAdd(0, inner).BAdd(inner, 0)); } - // If 16-bit, upcast to 32-bit while horizontally adding. - StaticLoop, MakeStaticLoopIterator>(c_regs); + Sum16To32(c_regs, typename Kernel::Packed::C(), make_index_sequence()); // Horizontally add 32-bit values. Reduce32(c_regs); col_row.CAccessor().template Write(c_regs); -- cgit v1.2.3