diff options
Diffstat (limited to 'src/PackWithRowOffset.cc')
-rw-r--r-- | src/PackWithRowOffset.cc | 211 |
1 files changed, 211 insertions, 0 deletions
diff --git a/src/PackWithRowOffset.cc b/src/PackWithRowOffset.cc new file mode 100644 index 0000000..8722723 --- /dev/null +++ b/src/PackWithRowOffset.cc @@ -0,0 +1,211 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include <cassert> +#include <cstring> +#include <iomanip> +#include <iostream> +#include <stdexcept> +#include <cpuinfo.h> +#include "fbgemm/Fbgemm.h" + +namespace fbgemm2 { + +template <typename T, typename accT> +PackAWithRowOffset<T, accT>::PackAWithRowOffset( + matrix_op_t trans, + uint32_t nRow, + uint32_t nCol, + const T* smat, + uint32_t ld, + inpType* pmat, + uint32_t groups, + int32_t zero_pt, + int32_t* row_offset) + : PackMatrix<PackAWithRowOffset<T, accT>, T, accT>( + nRow, + nCol, + pmat, + zero_pt), + trans_(trans), + smat_(smat), + ld_(ld), + G_(groups), + row_offset_(row_offset) { + assert(G_ == 1 && "Groups != 1 not supported yet"); + + rowOffsetAllocatedHere = false; + + if (cpuinfo_has_x86_avx512f()) { + BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; + BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; + row_interleave_B_ = + PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE; + } else if (cpuinfo_has_x86_avx2()) { + BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB; + BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB; + row_interleave_B_ = + PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE; + } else { + //TODO: Have default slower path + assert(0 && "unknown architecure"); + } + if (pmat) { + BaseType::buf_ = pmat; + } else { + BaseType::bufAllocatedHere_ = true; + BaseType::buf_ = + (T*)aligned_alloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T)); + } + if (!row_offset_) { + rowOffsetAllocatedHere = true; + row_offset_ = static_cast<int32_t*>(aligned_alloc(64, + BaseType::brow_ * sizeof(int32_t))); + } +} + +template <typename T, typename accT> +void PackAWithRowOffset<T, accT>::pack(const block_type_t& block) { + assert(block.row_start % BaseType::blockRowSize() == 0); + assert(block.col_start % BaseType::blockColSize() == 0); + assert(block.row_size <= BaseType::blockRowSize()); + assert(block.col_size <= BaseType::blockColSize()); + + block_type_t block_p = {block.row_start, + block.row_size, + block.col_start, + (block.col_size + row_interleave_B_ - 1) / + row_interleave_B_ * row_interleave_B_}; + assert(block_p.col_size <= BaseType::blockColSize()); + BaseType::packedBlock(block_p); + + T* out = BaseType::getBuf(); + bool tr = (trans_ == matrix_op_t::Transpose); + // accumulate into row offset? + bool row_offset_acc = (block.col_start != 0); + int32_t* row_offset_buf = getRowOffsetBuffer(); + if (tr) { + for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { + int32_t row_sum = row_offset_acc ? + row_offset_buf[i - block.row_start] : 0; + for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { + T val = smat_[i + ld_ * j]; + row_sum += val; + out[(i - block.row_start) * BaseType::blockColSize() + + (j - block.col_start)] = val; + } + row_offset_buf[i - block.row_start] = row_sum; + // zero fill + // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill. + for (int j = block.col_start + block.col_size; + j < block_p.col_start + block_p.col_size; ++j) { + out[(i - block.row_start) * BaseType::blockColSize() + + (j - block.col_start)] = 0; + } + } + } else { + for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { + int buf_idx = i - block.row_start; + memcpy( + out + buf_idx * BaseType::blockColSize(), + smat_ + i * ld_ + block.col_start, + block.col_size * sizeof(T)); + // zero fill + for (int j = block.col_size; j < block_p.col_size; ++j) { + out[buf_idx * BaseType::blockColSize() + j] = 0; + } + int32_t row_sum = row_offset_acc ? + row_offset_buf[i - block.row_start] : 0; + __m256i sum_v = _mm256_setzero_si256(); + __m256i one_epi16_v = _mm256_set1_epi16(1); + __m256i one_epi8_v = _mm256_set1_epi8(1); + for (int j = block.col_start; + j < block.col_start + block.col_size / 32 * 32; + j += 32) { + __m256i src_v = _mm256_loadu_si256( + reinterpret_cast<__m256i const*>(smat_ + i * ld_ + j)); + sum_v = _mm256_add_epi32( + sum_v, + _mm256_madd_epi16( + _mm256_maddubs_epi16(src_v, one_epi8_v), one_epi16_v)); + } + for (int j = block.col_start + block.col_size / 32 * 32; + j < block.col_start + block.col_size; + ++j) { + row_sum += smat_[i * ld_ + j]; + } + alignas(64) std::array<int32_t, 8> temp; + _mm256_store_si256(reinterpret_cast<__m256i*>(temp.data()), sum_v); + for (int k = 0; k < 8; ++k) { + row_sum += temp[k]; + } + row_offset_buf[i - block.row_start] = row_sum; + } + } +} + +template <typename T, typename accT> +int32_t PackAWithRowOffset<T, accT>::addr(int32_t r, int32_t c) const { + int32_t block_row_id = r / BaseType::blockRowSize(); + int32_t brow_offset = (block_row_id * BaseType::blockCols()) * + (BaseType::blockRowSize() * BaseType::blockColSize()); + + int32_t block_col_id = c / BaseType::blockColSize(); + int32_t bcol_offset = + block_col_id * BaseType::blockRowSize() * BaseType::blockColSize(); + int32_t block_offset = brow_offset + bcol_offset; + int32_t inblock_offset = + (r % BaseType::blockRowSize()) * BaseType::blockColSize() + + (c % BaseType::blockColSize()); + + int32_t index = block_offset + inblock_offset; + + return index; +} + +template <typename T, typename accT> +void PackAWithRowOffset<T, accT>::printPackedMatrix(std::string name) { + std::cout << name << ":" + << "[" << BaseType::numPackedRows() << ", " + << BaseType::numPackedCols() << "]" << std::endl; + + T* out = BaseType::getBuf(); + for (auto r = 0; r < BaseType::numPackedRows(); ++r) { + for (auto c = 0; c < BaseType::numPackedCols(); ++c) { + T val = out[addr(r, c)]; + if (std::is_integral<T>::value) { + // cast to int64 because cout doesn't print int8_t type directly + std::cout << std::setw(5) << static_cast<int64_t>(val) << " "; + } else { + std::cout << std::setw(5) << val << " "; + } + } + std::cout << std::endl; + } + std::cout << std::endl; +} + +template <typename T, typename accT> +int PackAWithRowOffset<T, accT>::rowOffsetBufferSize() { + if(cpuinfo_initialize()){ + if (cpuinfo_has_x86_avx512f()) { + return PackingTraits<T, accT, inst_set_t::avx512>::MCB; + } else if (cpuinfo_has_x86_avx2()) { + return PackingTraits<T, accT, inst_set_t::avx2>::MCB; + } else { + //TODO: Have default slower path + assert(0 && "unsupported architecture"); + return -1; + } + } else { + throw std::runtime_error("Failed to initialize cpuinfo!"); + } +} + +template class PackAWithRowOffset<uint8_t, int32_t>; +template class PackAWithRowOffset<uint8_t, int16_t>; + +} // namespace fbgemm2 |