Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'src/PackWithRowOffset.cc')
-rw-r--r--src/PackWithRowOffset.cc211
1 files changed, 211 insertions, 0 deletions
diff --git a/src/PackWithRowOffset.cc b/src/PackWithRowOffset.cc
new file mode 100644
index 0000000..8722723
--- /dev/null
+++ b/src/PackWithRowOffset.cc
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <cassert>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <stdexcept>
+#include <cpuinfo.h>
+#include "fbgemm/Fbgemm.h"
+
+namespace fbgemm2 {
+
+template <typename T, typename accT>
+PackAWithRowOffset<T, accT>::PackAWithRowOffset(
+ matrix_op_t trans,
+ uint32_t nRow,
+ uint32_t nCol,
+ const T* smat,
+ uint32_t ld,
+ inpType* pmat,
+ uint32_t groups,
+ int32_t zero_pt,
+ int32_t* row_offset)
+ : PackMatrix<PackAWithRowOffset<T, accT>, T, accT>(
+ nRow,
+ nCol,
+ pmat,
+ zero_pt),
+ trans_(trans),
+ smat_(smat),
+ ld_(ld),
+ G_(groups),
+ row_offset_(row_offset) {
+ assert(G_ == 1 && "Groups != 1 not supported yet");
+
+ rowOffsetAllocatedHere = false;
+
+ if (cpuinfo_has_x86_avx512f()) {
+ BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
+ BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
+ row_interleave_B_ =
+ PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
+ } else if (cpuinfo_has_x86_avx2()) {
+ BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::MCB;
+ BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
+ row_interleave_B_ =
+ PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE;
+ } else {
+ //TODO: Have default slower path
+ assert(0 && "unknown architecure");
+ }
+ if (pmat) {
+ BaseType::buf_ = pmat;
+ } else {
+ BaseType::bufAllocatedHere_ = true;
+ BaseType::buf_ =
+ (T*)aligned_alloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T));
+ }
+ if (!row_offset_) {
+ rowOffsetAllocatedHere = true;
+ row_offset_ = static_cast<int32_t*>(aligned_alloc(64,
+ BaseType::brow_ * sizeof(int32_t)));
+ }
+}
+
+template <typename T, typename accT>
+void PackAWithRowOffset<T, accT>::pack(const block_type_t& block) {
+ assert(block.row_start % BaseType::blockRowSize() == 0);
+ assert(block.col_start % BaseType::blockColSize() == 0);
+ assert(block.row_size <= BaseType::blockRowSize());
+ assert(block.col_size <= BaseType::blockColSize());
+
+ block_type_t block_p = {block.row_start,
+ block.row_size,
+ block.col_start,
+ (block.col_size + row_interleave_B_ - 1) /
+ row_interleave_B_ * row_interleave_B_};
+ assert(block_p.col_size <= BaseType::blockColSize());
+ BaseType::packedBlock(block_p);
+
+ T* out = BaseType::getBuf();
+ bool tr = (trans_ == matrix_op_t::Transpose);
+ // accumulate into row offset?
+ bool row_offset_acc = (block.col_start != 0);
+ int32_t* row_offset_buf = getRowOffsetBuffer();
+ if (tr) {
+ for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+ int32_t row_sum = row_offset_acc ?
+ row_offset_buf[i - block.row_start] : 0;
+ for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
+ T val = smat_[i + ld_ * j];
+ row_sum += val;
+ out[(i - block.row_start) * BaseType::blockColSize() +
+ (j - block.col_start)] = val;
+ }
+ row_offset_buf[i - block.row_start] = row_sum;
+ // zero fill
+ // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill.
+ for (int j = block.col_start + block.col_size;
+ j < block_p.col_start + block_p.col_size; ++j) {
+ out[(i - block.row_start) * BaseType::blockColSize() +
+ (j - block.col_start)] = 0;
+ }
+ }
+ } else {
+ for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+ int buf_idx = i - block.row_start;
+ memcpy(
+ out + buf_idx * BaseType::blockColSize(),
+ smat_ + i * ld_ + block.col_start,
+ block.col_size * sizeof(T));
+ // zero fill
+ for (int j = block.col_size; j < block_p.col_size; ++j) {
+ out[buf_idx * BaseType::blockColSize() + j] = 0;
+ }
+ int32_t row_sum = row_offset_acc ?
+ row_offset_buf[i - block.row_start] : 0;
+ __m256i sum_v = _mm256_setzero_si256();
+ __m256i one_epi16_v = _mm256_set1_epi16(1);
+ __m256i one_epi8_v = _mm256_set1_epi8(1);
+ for (int j = block.col_start;
+ j < block.col_start + block.col_size / 32 * 32;
+ j += 32) {
+ __m256i src_v = _mm256_loadu_si256(
+ reinterpret_cast<__m256i const*>(smat_ + i * ld_ + j));
+ sum_v = _mm256_add_epi32(
+ sum_v,
+ _mm256_madd_epi16(
+ _mm256_maddubs_epi16(src_v, one_epi8_v), one_epi16_v));
+ }
+ for (int j = block.col_start + block.col_size / 32 * 32;
+ j < block.col_start + block.col_size;
+ ++j) {
+ row_sum += smat_[i * ld_ + j];
+ }
+ alignas(64) std::array<int32_t, 8> temp;
+ _mm256_store_si256(reinterpret_cast<__m256i*>(temp.data()), sum_v);
+ for (int k = 0; k < 8; ++k) {
+ row_sum += temp[k];
+ }
+ row_offset_buf[i - block.row_start] = row_sum;
+ }
+ }
+}
+
+template <typename T, typename accT>
+int32_t PackAWithRowOffset<T, accT>::addr(int32_t r, int32_t c) const {
+ int32_t block_row_id = r / BaseType::blockRowSize();
+ int32_t brow_offset = (block_row_id * BaseType::blockCols()) *
+ (BaseType::blockRowSize() * BaseType::blockColSize());
+
+ int32_t block_col_id = c / BaseType::blockColSize();
+ int32_t bcol_offset =
+ block_col_id * BaseType::blockRowSize() * BaseType::blockColSize();
+ int32_t block_offset = brow_offset + bcol_offset;
+ int32_t inblock_offset =
+ (r % BaseType::blockRowSize()) * BaseType::blockColSize() +
+ (c % BaseType::blockColSize());
+
+ int32_t index = block_offset + inblock_offset;
+
+ return index;
+}
+
+template <typename T, typename accT>
+void PackAWithRowOffset<T, accT>::printPackedMatrix(std::string name) {
+ std::cout << name << ":"
+ << "[" << BaseType::numPackedRows() << ", "
+ << BaseType::numPackedCols() << "]" << std::endl;
+
+ T* out = BaseType::getBuf();
+ for (auto r = 0; r < BaseType::numPackedRows(); ++r) {
+ for (auto c = 0; c < BaseType::numPackedCols(); ++c) {
+ T val = out[addr(r, c)];
+ if (std::is_integral<T>::value) {
+ // cast to int64 because cout doesn't print int8_t type directly
+ std::cout << std::setw(5) << static_cast<int64_t>(val) << " ";
+ } else {
+ std::cout << std::setw(5) << val << " ";
+ }
+ }
+ std::cout << std::endl;
+ }
+ std::cout << std::endl;
+}
+
+template <typename T, typename accT>
+int PackAWithRowOffset<T, accT>::rowOffsetBufferSize() {
+ if(cpuinfo_initialize()){
+ if (cpuinfo_has_x86_avx512f()) {
+ return PackingTraits<T, accT, inst_set_t::avx512>::MCB;
+ } else if (cpuinfo_has_x86_avx2()) {
+ return PackingTraits<T, accT, inst_set_t::avx2>::MCB;
+ } else {
+ //TODO: Have default slower path
+ assert(0 && "unsupported architecture");
+ return -1;
+ }
+ } else {
+ throw std::runtime_error("Failed to initialize cpuinfo!");
+ }
+}
+
+template class PackAWithRowOffset<uint8_t, int32_t>;
+template class PackAWithRowOffset<uint8_t, int16_t>;
+
+} // namespace fbgemm2