github.com/marian-nmt/FBGEMM.git
 bench/Im2ColFusedRequantizeAcc16Benchmark.cc |   6
 bench/Im2ColFusedRequantizeAcc32Benchmark.cc |   6
 include/fbgemm/Fbgemm.h                      |  53
 include/fbgemm/OutputProcessing-inl.h        |   5
 src/ExecuteKernelU8S8.cc                     |  55
 src/Fbgemm.cc                                |  73
 src/PackAMatrix.cc                           |  21
 src/PackAWithIm2Col.cc                       |   6
 src/PackBMatrix.cc                           |  70
 src/PackMatrix.cc                            |   3
 src/PackWithQuantRowOffset.cc                |  15
 src/PackWithRowOffset.cc                     |  15
 src/RefImplementations.cc                    |  34
 src/RefImplementations.h                     |   7
 test/Im2ColFusedRequantizeTest.cc            |  24
 test/PackedRequantizeAcc16Test.cc            | 957
 test/PackedRequantizeTest.cc                 | 948
 test/QuantizationHelpers.cc                  |  25
 test/QuantizationHelpers.h                   |  10
 test/TestUtils.cc                            |  29
 test/TestUtils.h                             |  19
 21 files changed, 1298 insertions(+), 1083 deletions(-)
diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
index 48b744a..8827b4c 100644
--- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
@@ -189,7 +189,11 @@ void performance_test() {
PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, int16_t> packA(
- conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data());
+ conv_p,
+ Aint8.data(),
+ nullptr,
+ Aint8_zero_point,
+ row_offset_buf.data());
PackBMatrix<int8_t, int16_t> packedB(
matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
index 9201e52..b87f7d7 100644
--- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
@@ -191,7 +191,11 @@ void performance_test() {
PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, int32_t> packA(
- conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data());
+ conv_p,
+ Aint8.data(),
+ nullptr,
+ Aint8_zero_point,
+ row_offset_buf.data());
PackBMatrix<int8_t, int32_t> packedB(
matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h
index bd1c86e..d05e099 100644
--- a/include/fbgemm/Fbgemm.h
+++ b/include/fbgemm/Fbgemm.h
@@ -20,6 +20,10 @@
#include "Types.h"
#include "Utils.h"
+// Turning on this option will print out the time breakdown of each stage (e.g.,
+// input packing, the main GEMM kernel, and each output processing step).
+// Please note that currently this option won't report accurate timing if
+// multiple threads are used.
// #define FBGEMM_MEASURE_TIME_BREAKDOWN
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
@@ -80,13 +84,22 @@ class PackMatrix {
* overhead of internal memory allocation every time a PackMatrix
* is constructed. The client code can query how big pmat should
* be with the packedBufferSize() function.
+ * @param groups when groups > 1, we compute groups GEMMs, each of which
+ * multiplies an A.rows by A.cols/A.groups matrix with a
+ * B.rows/B.groups by B.cols matrix (in conventional BLAS
+ * terminology this is a batched GEMM, but we use the name group
+ * to follow deep learning terminology). The result matrix has
+ * dimension A.rows by B.cols*B.groups.
+ * A.groups must be the same as B.groups, A.groups must divide
+ * A.cols, and B.groups must divide B.rows and C.cols.
* @param zero_pt the quantized value that maps to 0.0f floating-point number.
*/
PackMatrix(
std::int32_t rows,
std::int32_t cols,
inpType* pmat,
- std::int32_t zero_pt);
+ int groups = 1,
+ std::int32_t zero_pt = 0);
/**
* @return true usually when the matrix is constant matrix (e.g., weight
@@ -228,6 +241,10 @@ class PackMatrix {
return last_bcol_;
}
+ int numGroups() const {
+ return G_;
+ }
+
/**
* @return True if the last column block has fewer columns than the block
* size.
@@ -275,6 +292,7 @@ class PackMatrix {
private:
std::int32_t nrows_, ncols_;
+ int G_;
std::int32_t zero_pt_;
block_type_t packedBlock_; ///< The block in the source matrix just packed
std::int32_t last_brow_, last_bcol_;
@@ -295,9 +313,6 @@ class PackAMatrix final : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
PackAMatrix() = delete; // no default constructor
- /**
- * TODO: currently only groups == 1 supported
- */
PackAMatrix(
matrix_op_t trans,
std::int32_t nRow,
@@ -305,7 +320,7 @@ class PackAMatrix final : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
const inpType* smat,
std::int32_t ld,
inpType* pmat = nullptr,
- std::int32_t groups = 1,
+ int groups = 1,
std::int32_t zero_pt = 0);
/**
@@ -351,7 +366,6 @@ class PackAMatrix final : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
matrix_op_t trans_;
const T* smat_;
std::int32_t ld_;
- std::int32_t G_;
std::int32_t row_interleave_B_;
};
@@ -370,9 +384,6 @@ class PackBMatrix final : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
PackBMatrix() = delete; // no default constructor
- /**
- * TODO: Currently only groups == 1 supported.
- */
PackBMatrix(
matrix_op_t trans,
std::int32_t nRow,
@@ -380,7 +391,7 @@ class PackBMatrix final : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
const inpType* smat,
std::int32_t ld,
inpType* pmat = nullptr,
- std::int32_t groups = 1,
+ int groups = 1,
std::int32_t zero_pt = 0);
/**
@@ -437,7 +448,6 @@ class PackBMatrix final : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
matrix_op_t trans_;
const T* smat_;
std::int32_t ld_;
- std::int32_t G_;
std::int32_t row_interleave_;
};
@@ -533,9 +543,6 @@ class PackAWithRowOffset final
using accType = accT;
PackAWithRowOffset() = delete; // no default constructor
- /**
- * TODO: Currently only groups == 1 supported
- */
PackAWithRowOffset(
matrix_op_t trans,
std::uint32_t nRow,
@@ -543,7 +550,7 @@ class PackAWithRowOffset final
const T* smat,
std::uint32_t ld,
inpType* pmat = nullptr,
- std::uint32_t groups = 1,
+ int groups = 1,
std::int32_t zero_pt = 0,
std::int32_t* row_offset = nullptr);
@@ -600,7 +607,6 @@ class PackAWithRowOffset final
matrix_op_t trans_;
const T* smat_;
std::uint32_t ld_;
- std::uint32_t G_;
std::int32_t* row_offset_;
bool rowOffsetAllocatedHere;
std::int32_t row_interleave_B_;
@@ -621,9 +627,6 @@ class PackAWithQuantRowOffset final
using accType = accT;
PackAWithQuantRowOffset() = delete; // no default constructor
- /**
- * TODO: Currently only groups == 1 supported
- */
PackAWithQuantRowOffset(
matrix_op_t trans,
std::int32_t nRow,
@@ -633,7 +636,7 @@ class PackAWithQuantRowOffset final
inpType* pmat = nullptr,
float scale = 1.0f,
std::int32_t zero_pt = 0,
- std::int32_t groups = 1,
+ int groups = 1,
std::int32_t* row_offset = nullptr);
/**
@@ -690,7 +693,6 @@ class PackAWithQuantRowOffset final
const float* smat_;
std::int32_t ld_;
float scale_;
- std::int32_t G_;
std::int32_t* row_offset_;
bool rowOffsetAllocatedHere;
std::int32_t row_interleave_B_;
@@ -811,7 +813,8 @@ class ReluOutput {
* processing pipeline.
*
* SPMDM (SParse Matrix times Dense Matrix) inplace on the 32-bit input buffer
- * (inp). After modifying the input buffer, pass it to the next op
+ * (inp). After modifying the input buffer, pass it to the next op.
+ * When groups > 1, each group is a numRows() x (numCols()/groups) matrix.
*/
template <
typename outT = std::int32_t,
@@ -825,8 +828,9 @@ class DoSpmdmOnInpBuffer {
nextOPType& nextop,
const std::uint8_t* A,
int lda,
- const CompressedSparseColumn& B_csc)
- : nextop_(nextop), A_(A), lda_(lda), B_csc_(B_csc) {}
+ const CompressedSparseColumn& B_csc,
+ int groups = 1)
+ : nextop_(nextop), A_(A), lda_(lda), B_csc_(B_csc), groups_(groups) {}
template <inst_set_t instSet>
inline int f(
@@ -841,6 +845,7 @@ class DoSpmdmOnInpBuffer {
const std::uint8_t* A_;
const int lda_;
const CompressedSparseColumn& B_csc_;
+ const int groups_;
};
/**
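For reference, here is a minimal usage sketch of the new groups argument (not part of the patch); it follows the call pattern used in the updated tests further down, and the shapes m, k, n, the quantization parameters, and the buffers (Aint8, Bint8, col_offsets, Cint8, Cint32_buffer) are placeholders, with k assumed divisible by groups:

    #include "fbgemm/Fbgemm.h"
    using namespace fbgemm; // namespace as used elsewhere in this patch

    // A is m x k (uint8_t); B stacks the per-group (k/groups) x n blocks into a
    // k x n matrix (int8_t); the result C is m x (groups * n).
    std::vector<std::int32_t> row_offset_buf(
        PackAWithRowOffset<std::uint8_t, std::int16_t>::rowOffsetBufferSize());

    PackAWithRowOffset<std::uint8_t, std::int16_t> packA(
        matrix_op_t::NoTranspose, m, k, Aint8.data(), k,
        nullptr, groups, Aint8_zero_point, row_offset_buf.data());

    PackBMatrix<std::int8_t, std::int16_t> packB(
        matrix_op_t::NoTranspose, k, n, Bint8.data(), n,
        nullptr, groups, Bint8_zero_point);

    DoNothing<> doNothing{};
    ReQuantizeOutput<false> outputProc(
        doNothing, C_multiplier, C_zero_pt, Aint8_zero_point, Bint8_zero_point,
        packA.getRowOffsetBuffer(), col_offsets.data(), nullptr);

    // ldc is groups * n because each group writes its own n-column slice of C.
    fbgemmPacked(packA, packB, Cint8.data(), Cint32_buffer.data(),
                 groups * n, outputProc, /*thread_id=*/0, /*num_threads=*/1);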
diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h
index 13f614a..71d8046 100644
--- a/include/fbgemm/OutputProcessing-inl.h
+++ b/include/fbgemm/OutputProcessing-inl.h
@@ -28,7 +28,10 @@ template <typename outT, typename inT, typename nextOPType>
template<inst_set_t instSet>
inline int DoSpmdmOnInpBuffer<outT, inT, nextOPType>::f(outT* out, inT* inp,
const block_type_t& block, int ld_out, int ld_in) const {
- B_csc_.SpMDM(block, A_, lda_, true, inp, ld_in);
+ assert(B_csc_.NumOfCols() % groups_ == 0);
+ int n_per_group = B_csc_.NumOfCols() / groups_;
+ int g = block.col_start / n_per_group;
+ B_csc_.SpMDM(block, A_ + g * B_csc_.NumOfRows(), lda_, true, inp, ld_in);
return nextop_.template f<instSet>(out, inp, block, ld_out, ld_in);
}
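A small worked example of the group selection above (illustrative numbers, not from the patch):

    // groups_ = 4 and B_csc_.NumOfCols() = 64 give n_per_group = 64 / 4 = 16.
    // An output block with block.col_start = 35 then belongs to group
    //   g = 35 / 16 = 2,
    // so SpMDM reads the dense operand at A_ + 2 * B_csc_.NumOfRows(), i.e. the
    // rows of A that correspond to the third group of B's rows.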
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index b3f8c15..c2079b1 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -8,7 +8,6 @@
#include <cpuinfo.h>
#include <chrono>
-
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
double kernel_time = 0.0;
double postprocessing_time = 0.0;
@@ -84,8 +83,10 @@ void ExecuteKernel<
int32_t packed_rows_A = packedA_.numPackedRows();
int32_t row_start_A = packedA_.packedRowStart();
- bool lastKBlock = packedB_.isThisLastKBlock(kBlock);
- bool accum = kBlock > 0;
+ int group = kBlock / packedB_.blockRows();
+ int NDim = packedB_.numCols();
+ bool lastKBlock = packedB_.isThisLastKBlock(kBlock % packedB_.blockRows());
+ bool accum = (kBlock % packedB_.blockRows()) > 0;
typename BaseType::jit_micro_kernel_fp fn;
@@ -120,7 +121,6 @@ void ExecuteKernel<
#endif
for (int jb = 0; jb < bColBlocks; ++jb) {
-
bBuf = packedB_.getBuf(jb, kBlock);
// prefetch addr of the next packed block of B matrix
bBuf_pf = packedB_.getBuf(jb == bColBlocks - 1 ? jb : jb + 1, kBlock);
@@ -128,12 +128,14 @@ void ExecuteKernel<
// Reuse the first rowblock of C_buffer_ unless when C_buffer_ is same as
// matC_ (inplace output processing)
int32_t* C_buffer_row_start = C_buffer_ +
- ((C_buffer_ == reinterpret_cast<int32_t*>(matC_)) ? row_start_A * ldc_
- : 0);
+ ((C_buffer_ == reinterpret_cast<int32_t*>(matC_))
+ ? row_start_A * ldc_ + NDim * group
+ : 0);
int32_t* C_buffer_start = C_buffer_row_start + jb * nbSize_;
int32_t leadingDim = ldc_;
if (packedB_.isThereColRemainder() && (jb == bColBlocks - 1)) {
- // In case we will access memory past C_buffer_, we use C_tile_ instead.
+ // In case we will access memory past C_buffer_, we use the C_tile_ scratchpad
+ // instead.
C_buffer_start = C_tile_;
leadingDim = nbSize_;
}
@@ -146,14 +148,15 @@ void ExecuteKernel<
leadingDim);
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- t_end = std::chrono::high_resolution_clock::now();
- dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
- .count();
- kernel_time += (dt);
- t_start = std::chrono::high_resolution_clock::now();
+ t_end = std::chrono::high_resolution_clock::now();
+ dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
+ .count();
+ kernel_time += (dt);
+ t_start = std::chrono::high_resolution_clock::now();
#endif
- // Output processing is done only once per rowblock
+ // Output processing is done only once per rowblock to amortize overhead
+ // and for better spatial locality.
if (lastKBlock && jb == bColBlocks - 1) {
// When C_tile_ is used for the last column block, we need a separate
// handling for the last column block.
@@ -166,14 +169,14 @@ void ExecuteKernel<
outputProcess_.template f<inst_set_t::avx2>(
matC_,
C_buffer_row_start,
- {row_start_A, packed_rows_A, 0, nSize},
+ {row_start_A, packed_rows_A, NDim * group, nSize},
ldc_,
ldc_);
} else if (cpuinfo_has_x86_avx2()) {
outputProcess_.template f<inst_set_t::avx2>(
matC_,
C_buffer_row_start,
- {row_start_A, packed_rows_A, 0, nSize},
+ {row_start_A, packed_rows_A, NDim * group, nSize},
ldc_,
ldc_);
} else {
@@ -183,20 +186,28 @@ void ExecuteKernel<
}
if (C_buffer_start == C_tile_) {
+ // When the C_tile_ scratchpad was used to avoid accessing memory past
+ // C_buffer_.
if (cpuinfo_has_x86_avx512f()) {
// TODO: avx512 path
// Currently use avx2 code
outputProcess_.template f<inst_set_t::avx2>(
matC_,
C_tile_,
- {row_start_A, packed_rows_A, jb * nbSize_, packedB_.lastBcol()},
+ {row_start_A,
+ packed_rows_A,
+ NDim * group + jb * nbSize_,
+ packedB_.lastBcol()},
ldc_,
leadingDim);
} else if (cpuinfo_has_x86_avx2()) {
outputProcess_.template f<inst_set_t::avx2>(
matC_,
C_tile_,
- {row_start_A, packed_rows_A, jb * nbSize_, packedB_.lastBcol()},
+ {row_start_A,
+ packed_rows_A,
+ NDim * group + jb * nbSize_,
+ packedB_.lastBcol()},
ldc_,
leadingDim);
} else {
@@ -207,11 +218,11 @@ void ExecuteKernel<
} // output processing
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- t_end = std::chrono::high_resolution_clock::now();
- dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
- .count();
- postprocessing_time += (dt);
- t_start = std::chrono::high_resolution_clock::now();
+ t_end = std::chrono::high_resolution_clock::now();
+ dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
+ .count();
+ postprocessing_time += (dt);
+ t_start = std::chrono::high_resolution_clock::now();
#endif
} // for each j block
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
index f8f0d34..8376077 100644
--- a/src/Fbgemm.cc
+++ b/src/Fbgemm.cc
@@ -76,23 +76,31 @@ void fbgemmPacked(
throw std::runtime_error("Failed to initialize cpuinfo!");
}
+ if (!packB.isPrePacked()) {
+ throw std::runtime_error("B matrix must be prepacked");
+ }
+ if (packA.numGroups() != packB.numGroups()) {
+ throw std::runtime_error(
+ "A.groups = " + std::to_string(packA.numGroups()) + " and B.groups = " +
+ std::to_string(packB.numGroups()) + " are not the same");
+ }
+
int MDim = packA.numRows();
int KDim = packB.numRows();
+ int KDimPerGroup = packB.numRows() / packB.numGroups();
+ int NDim = packB.numCols();
int mBlocks = (MDim + MCB - 1) / MCB;
- int kBlocks = (KDim + KCB - 1) / KCB;
+ int kBlocks = (KDimPerGroup + KCB - 1) / KCB;
// remainders
int _mc = MDim % MCB;
- int _kc = KDim % KCB;
+ int _kc = KDimPerGroup % KCB;
int kc, mc;
block_type_t blockA{0, 0, 0, 0};
- // B must be prepacked
- assert(packB.isPrePacked() && "B matrix must be prepacked");
-
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
std::chrono::time_point<std::chrono::high_resolution_clock> t_very_start,
t_start, t_end;
@@ -101,37 +109,48 @@ void fbgemmPacked(
t_very_start = std::chrono::high_resolution_clock::now();
#endif
- ExecuteKernel<packingAMatrix, packingBMatrix, cT, processOutputType>
- exeKernelObj(packA, packB, 0, C, C_buffer, ldc, outProcess);
// ToDo: thread based work division
- for (int i = 0; i < mBlocks; ++i) {
- mc = (i != mBlocks - 1 || _mc == 0) ? MCB : _mc;
- for (int k = 0; k < kBlocks; ++k) {
- kc = (k != kBlocks - 1 || _kc == 0) ? KCB : _kc;
- // pack A matrix
- blockA = {i * MCB, mc, k * KCB, kc};
-
- packA.pack(blockA);
+ for (int g = 0; g < packA.numGroups(); ++g) {
+ ExecuteKernel<packingAMatrix, packingBMatrix, cT, processOutputType>
+ exeKernelObj(
+ packA,
+ packB,
+ 0,
+ C,
+ C_buffer,
+ ldc,
+ outProcess);
+ for (int i = 0; i < mBlocks; ++i) {
+ mc = (i != mBlocks - 1 || _mc == 0) ? MCB : _mc;
+ for (int k = 0; k < kBlocks; ++k) {
+ kc = (k != kBlocks - 1 || _kc == 0) ? KCB : _kc;
+ // pack A matrix
+ blockA = {i * MCB, mc, g * KDimPerGroup + k * KCB, kc};
+
+ packA.pack(blockA);
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- t_end = std::chrono::high_resolution_clock::now();
- dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
- .count();
- packing_time += (dt);
- t_start = std::chrono::high_resolution_clock::now();
+ t_end = std::chrono::high_resolution_clock::now();
+ dt = std::chrono::duration_cast<std::chrono::nanoseconds>(
+ t_end - t_start)
+ .count();
+ packing_time += (dt);
+ t_start = std::chrono::high_resolution_clock::now();
#endif
- exeKernelObj.execute(k);
+ exeKernelObj.execute(g * kBlocks + k);
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- t_end = std::chrono::high_resolution_clock::now();
- dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
- .count();
- computing_time += (dt);
- t_start = std::chrono::high_resolution_clock::now();
+ t_end = std::chrono::high_resolution_clock::now();
+ dt = std::chrono::duration_cast<std::chrono::nanoseconds>(
+ t_end - t_start)
+ .count();
+ computing_time += (dt);
+ t_start = std::chrono::high_resolution_clock::now();
#endif
+ }
}
- }
+ } // for each group
#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
t_end = std::chrono::high_resolution_clock::now();
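A worked example of the new per-group block indexing (illustrative numbers, not from the patch; it assumes the packed k-block size equals KCB, as set in the PackBMatrix constructor):

    // groups = 2, KDimPerGroup = 512, KCB = 256  =>  kBlocks = 2 per group.
    // fbgemmPacked() issues:
    //   group 0: exeKernelObj.execute(0); exeKernelObj.execute(1);
    //   group 1: exeKernelObj.execute(2); exeKernelObj.execute(3);
    // and ExecuteKernel::execute() recovers, e.g. for kBlock = 3:
    //   group = kBlock / packedB_.blockRows() = 3 / 2 = 1
    //   accum = (kBlock % packedB_.blockRows()) > 0, i.e. 3 % 2 = 1, so accumulate
    //   output columns for this call start at NDim * group = packedB_.numCols() * 1.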
diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc
index cd991ca..988a27b 100644
--- a/src/PackAMatrix.cc
+++ b/src/PackAMatrix.cc
@@ -20,15 +20,17 @@ PackAMatrix<T, accT>::PackAMatrix(
const T* smat,
int32_t ld,
inpType* pmat,
- int32_t groups,
+ int groups,
std::int32_t zero_pt)
- : PackMatrix<PackAMatrix<T, accT>, T, accT>(nRow, nCol, pmat, zero_pt),
+ : PackMatrix<PackAMatrix<T, accT>, T, accT>(
+ nRow,
+ nCol,
+ pmat,
+ groups,
+ zero_pt),
trans_(trans),
smat_(smat),
- ld_(ld),
- G_(groups) {
- assert(G_ == 1 && "Groups != 1 not supported yet");
-
+ ld_(ld) {
if (cpuinfo_has_x86_avx512f()) {
BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
@@ -43,6 +45,11 @@ PackAMatrix<T, accT>::PackAMatrix(
// TODO: Have default slower path
assert(0 && "unsupported architecure");
}
+ if (BaseType::numCols() % groups != 0) {
+ throw std::runtime_error(
+ "groups = " + std::to_string(groups) +
+ " does not divide numCols = " + std::to_string(BaseType::numCols()));
+ }
if (!pmat) {
BaseType::buf_ = (T*)fbgemmAlignedAlloc(
64, BaseType::brow_ * BaseType::bcol_ * sizeof(T));
@@ -63,7 +70,7 @@ void PackAMatrix<T, accT>::pack(const block_type_t& block) {
if (tr) {
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
- T val = smat_[i + ld_ * j];
+ T val = smat_[i + j * ld_];
out[addr(i, j) - addr(block.row_start, block.col_start)] = val;
}
// zero fill
diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc
index 71efced..7ca7059 100644
--- a/src/PackAWithIm2Col.cc
+++ b/src/PackAWithIm2Col.cc
@@ -36,6 +36,7 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
std::multiplies<int>()) *
conv_p.IC,
pmat,
+ conv_p.G,
zero_pt),
conv_p_(conv_p),
sdata_(sdata) {
@@ -55,6 +56,11 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
// TODO: Have default slower path
assert(0 && "unsupported architecure");
}
+ if (BaseType::numCols() % conv_p.G != 0) {
+ throw std::runtime_error(
+ "groups = " + std::to_string(conv_p.G) +
+ " does not divide numCols = " + std::to_string(BaseType::numCols()));
+ }
if (pmat) {
BaseType::buf_ = pmat;
} else {
diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc
index 485afb1..1bb7d4b 100644
--- a/src/PackBMatrix.cc
+++ b/src/PackBMatrix.cc
@@ -20,15 +20,17 @@ PackBMatrix<T, accT>::PackBMatrix(
const T* smat,
int32_t ld,
inpType* pmat,
- int32_t groups,
+ int groups,
std::int32_t zero_pt)
- : PackMatrix<PackBMatrix<T, accT>, T, accT>(nRow, nCol, pmat, zero_pt),
+ : PackMatrix<PackBMatrix<T, accT>, T, accT>(
+ nRow,
+ nCol,
+ pmat,
+ groups,
+ zero_pt),
trans_(trans),
smat_(smat),
- ld_(ld),
- G_(groups) {
- assert(G_ == 1 && "Groups != 1 not supported yet");
-
+ ld_(ld) {
if (cpuinfo_has_x86_avx512f()) {
BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::NCB;
@@ -42,14 +44,22 @@ PackBMatrix<T, accT>::PackBMatrix(
// Error
assert(0 && "unknown architecure");
}
- block_type_t block{0, BaseType::numRows(), 0, BaseType::numCols()};
+ if (BaseType::numRows() % groups != 0) {
+ throw std::runtime_error(
+ "groups = " + std::to_string(groups) +
+ " does not divide numRows = " + std::to_string(BaseType::numRows()));
+ }
+
+ // blocking for one group
+ block_type_t block{
+ 0, BaseType::numRows() / BaseType::numGroups(), 0, BaseType::numCols()};
BaseType::packedBlock(block);
if (!pmat) {
BaseType::bufAllocatedHere_ = true;
BaseType::buf_ = (T*)fbgemmAlignedAlloc(
64,
- BaseType::blockRows() * BaseType::brow_ * BaseType::blockCols() *
- BaseType::bcol_ * sizeof(T));
+ BaseType::numGroups() * BaseType::blockRows() * BaseType::brow_ *
+ BaseType::blockCols() * BaseType::bcol_ * sizeof(T));
}
pack(block);
}
@@ -59,26 +69,28 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) {
assert((BaseType::blockRowSize() % row_interleave_) == 0);
BaseType::packedBlock(block);
- T* out = BaseType::getBuf();
bool tr = (trans_ == matrix_op_t::Transpose);
- for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
- for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
- T val = tr ? smat_[i + ld_ * j] : smat_[i * ld_ + j];
- out[addr(i, j) - addr(block.row_start, block.col_start)] =
- tconv(val, out[addr(i, j)]);
+ for (int g = 0; g < this->numGroups(); ++g) {
+ T* out = BaseType::getBuf() +
+ g * this->packedBufferSize(block.row_size, block.col_size);
+ for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+ for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
+ T val = tr ? smat_[g * block.row_size + i + ld_ * j]
+ : smat_[(g * block.row_size + i) * ld_ + j];
+ out[addr(i, j)] = tconv(val, out[addr(i, j)]);
+ }
}
- }
- // fill the remaining with zero.
- // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill.
- for (int i = block.row_start + block.row_size;
- i < (block.row_start + block.row_size + row_interleave_ - 1) /
- row_interleave_ * row_interleave_;
- ++i) {
- for (int j = block.col_start; j < block.col_start + block.col_size; j++) {
- out[addr(i, j) - addr(block.row_start, block.col_start)] =
- tconv(0, out[addr(i, j)]);
+ // fill the remaining with zero.
+ // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill.
+ for (int i = block.row_start + block.row_size;
+ i < (block.row_start + block.row_size + row_interleave_ - 1) /
+ row_interleave_ * row_interleave_;
+ ++i) {
+ for (int j = block.col_start; j < block.col_start + block.col_size; j++) {
+ out[addr(i, j)] = tconv(0, out[addr(i, j)]);
+ }
}
- }
+ } // for each group
}
template <typename T, typename accT>
@@ -109,7 +121,8 @@ void PackBMatrix<T, accT>::printPackedMatrix(std::string name) {
<< "[" << BaseType::blockRowSize() << ", "
<< BaseType::blockColSize() << "]" << std::endl;
- T* out = BaseType::getBuf();
+ T* out = BaseType::getBuf() +
+ this->packedBufferSize(this->numPackedRows(), this->numPackedCols());
for (auto nr = 0; nr < BaseType::blockRows(); ++nr) {
auto rows = (nr == BaseType::blockRows() - 1) ? BaseType::lastBrow()
: BaseType::blockRowSize();
@@ -150,7 +163,8 @@ bool PackBMatrix<T, accT>::metaEquals(const PackBMatrix<T, accT>& that) const {
BaseType::numPackedRows() != that.numPackedRows() ||
BaseType::numPackedCols() != that.numPackedCols() ||
BaseType::zeroPoint() != that.zeroPoint() || trans_ != that.trans_ ||
- G_ != that.G_ || row_interleave_ != that.row_interleave_) {
+ BaseType::numGroups() != that.numGroups() ||
+ row_interleave_ != that.row_interleave_) {
return false;
}
diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc
index b8b8388..a577057 100644
--- a/src/PackMatrix.cc
+++ b/src/PackMatrix.cc
@@ -18,8 +18,9 @@ PackMatrix<PT, inpType, accType>::PackMatrix(
int32_t rows,
int32_t cols,
inpType* buf,
+ int groups,
int32_t zero_pt)
- : buf_(buf), nrows_(rows), ncols_(cols), zero_pt_(zero_pt) {
+ : buf_(buf), nrows_(rows), ncols_(cols), G_(groups), zero_pt_(zero_pt) {
bufAllocatedHere_ = false;
if (!cpuinfo_initialize()) {
throw std::runtime_error("Failed to initialize cpuinfo!");
diff --git a/src/PackWithQuantRowOffset.cc b/src/PackWithQuantRowOffset.cc
index a3d7e0f..2bd4b15 100644
--- a/src/PackWithQuantRowOffset.cc
+++ b/src/PackWithQuantRowOffset.cc
@@ -25,21 +25,19 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset(
inpType* pmat,
float scale,
int32_t zero_pt,
- int32_t groups,
+ int groups,
int32_t* row_offset)
: PackMatrix<PackAWithQuantRowOffset<T, accT>, T, accT>(
nRow,
nCol,
pmat,
+ groups,
zero_pt),
trans_(trans),
smat_(smat),
ld_(ld),
scale_(scale),
- G_(groups),
row_offset_(row_offset) {
- assert(G_ == 1 && "Groups != 1 not supported yet");
-
rowOffsetAllocatedHere = false;
if (cpuinfo_has_x86_avx512f()) {
@@ -56,6 +54,11 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset(
// TODO: Have default slower path
assert(0 && "unknown architecure");
}
+ if (BaseType::numCols() % groups != 0) {
+ throw std::runtime_error(
+ "groups = " + std::to_string(groups) +
+ " does not divide numCols = " + std::to_string(BaseType::numCols()));
+ }
if (pmat) {
BaseType::buf_ = pmat;
} else {
@@ -73,7 +76,6 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset(
template <typename T, typename accT>
void PackAWithQuantRowOffset<T, accT>::pack(const block_type_t& block) {
assert(block.row_start % BaseType::blockRowSize() == 0);
- assert(block.col_start % BaseType::blockColSize() == 0);
assert(block.row_size <= BaseType::blockRowSize());
assert(block.col_size <= BaseType::blockColSize());
@@ -88,7 +90,8 @@ void PackAWithQuantRowOffset<T, accT>::pack(const block_type_t& block) {
T* out = BaseType::getBuf();
bool tr = (trans_ == matrix_op_t::Transpose);
// accumulate into row offset?
- bool row_offset_acc = (block.col_start != 0);
+ bool row_offset_acc =
+ (block.col_start % (this->numCols() / this->numGroups())) != 0;
int32_t* row_offset_buf = getRowOffsetBuffer();
float smat_transposed[block.row_size * block.col_size];
diff --git a/src/PackWithRowOffset.cc b/src/PackWithRowOffset.cc
index dec3f70..1935161 100644
--- a/src/PackWithRowOffset.cc
+++ b/src/PackWithRowOffset.cc
@@ -22,21 +22,19 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset(
const T* smat,
uint32_t ld,
inpType* pmat,
- uint32_t groups,
+ int groups,
int32_t zero_pt,
int32_t* row_offset)
: PackMatrix<PackAWithRowOffset<T, accT>, T, accT>(
nRow,
nCol,
pmat,
+ groups,
zero_pt),
trans_(trans),
smat_(smat),
ld_(ld),
- G_(groups),
row_offset_(row_offset) {
- assert(G_ == 1 && "Groups != 1 not supported yet");
-
rowOffsetAllocatedHere = false;
if (cpuinfo_has_x86_avx512f()) {
@@ -53,6 +51,11 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset(
// TODO: Have default slower path
assert(0 && "unknown architecure");
}
+ if (BaseType::numCols() % groups != 0) {
+ throw std::runtime_error(
+ "groups = " + std::to_string(groups) +
+ " does not divide numCols = " + std::to_string(BaseType::numCols()));
+ }
if (pmat) {
BaseType::buf_ = pmat;
} else {
@@ -70,7 +73,6 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset(
template <typename T, typename accT>
void PackAWithRowOffset<T, accT>::pack(const block_type_t& block) {
assert(block.row_start % BaseType::blockRowSize() == 0);
- assert(block.col_start % BaseType::blockColSize() == 0);
assert(block.row_size <= BaseType::blockRowSize());
assert(block.col_size <= BaseType::blockColSize());
@@ -85,7 +87,8 @@ void PackAWithRowOffset<T, accT>::pack(const block_type_t& block) {
T* out = BaseType::getBuf();
bool tr = (trans_ == matrix_op_t::Transpose);
// accumulate into row offset?
- bool row_offset_acc = (block.col_start != 0);
+ bool row_offset_acc =
+ (block.col_start % (this->numCols() / this->numGroups())) != 0;
int32_t* row_offset_buf = getRowOffsetBuffer();
if (tr) {
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc
index dc41c27..45eb9f9 100644
--- a/src/RefImplementations.cc
+++ b/src/RefImplementations.cc
@@ -93,8 +93,8 @@ void matmul_u8i8acc32_ref(
const uint8_t* Aint8,
const int8_t* Bint8,
int32_t* Cint32) {
- for (int j = 0; j < N; ++j) {
- for (int i = 0; i < M; ++i) {
+ for (int i = 0; i < M; ++i) {
+ for (int j = 0; j < N; ++j) {
int32_t sum = 0;
for (int k = 0; k < K; ++k) {
sum += static_cast<int32_t>(Aint8[i * lda + k]) *
@@ -116,8 +116,8 @@ void matmul_u8i8acc16_ref(
const uint8_t* Aint8,
const int8_t* Bint8,
int32_t* Cint32) {
- for (int j = 0; j < N; ++j) {
- for (int i = 0; i < M; ++i) {
+ for (int i = 0; i < M; ++i) {
+ for (int j = 0; j < N; ++j) {
int32_t sum = 0, sum_32bit = 0;
for (int k = 0; k < K; k += 2) {
int a0 = Aint8[i * lda + k];
@@ -148,8 +148,8 @@ void matmul_fp_ref(
const float* Afp32,
const float* Bfp32,
float* Cfp32) {
- for (int j = 0; j < N; ++j) {
- for (int i = 0; i < M; ++i) {
+ for (int i = 0; i < M; ++i) {
+ for (int j = 0; j < N; ++j) {
float sum = 0;
for (int k = 0; k < K; ++k) {
sum += Afp32[i * lda + k] * Bfp32[k * ldb + j];
@@ -198,8 +198,10 @@ void spmdm_ref(
fbgemm::CompressedSparseColumn& B,
bool accumulation,
int32_t* C,
- int ldc) {
+ int ldc,
+ int groups /*=1*/) {
int N = B.NumOfCols();
+ assert(N % groups == 0);
if (!accumulation) {
for (int i = 0; i < M; ++i) {
for (int j = 0; j < N; ++j) {
@@ -207,15 +209,17 @@ void spmdm_ref(
}
}
}
- for (int j = 0; j < N; ++j) {
- for (int k = B.ColPtr()[j]; k < B.ColPtr()[j + 1]; ++k) {
- int row = B.RowIdx()[k];
- int w = B.Values()[k];
- for (int i = 0; i < M; ++i) {
- C[i * ldc + j] += A[i * lda + row] * w;
+ for (int g = 0; g < groups; ++g) {
+ for (int j = g * (N / groups); j < (g + 1) * (N / groups); ++j) {
+ for (int k = B.ColPtr()[j]; k < B.ColPtr()[j + 1]; ++k) {
+ int row = g * B.NumOfRows() + B.RowIdx()[k];
+ int w = B.Values()[k];
+ for (int i = 0; i < M; ++i) {
+ C[i * ldc + j] += A[i * lda + row] * w;
+ }
}
- }
- } // for each column of B
+ } // for each column of B
+ } // for each group
}
int32_t clip_16bit(int32_t x) {
diff --git a/src/RefImplementations.h b/src/RefImplementations.h
index 9e81ce1..cec4bff 100644
--- a/src/RefImplementations.h
+++ b/src/RefImplementations.h
@@ -125,6 +125,10 @@ void col_offsets_with_zero_pt_s8acc32_ref(
/**
* @brief Reference implementation of SPMDM (sparse matrix times dense matrix).
+ *
+ * @param groups when > 1, for the gth group we multiply the
+ * A[:,g*(A.ncols/groups):(g+1)*(A.ncols/groups)] sub-matrix with the
+ * B[:,g*(B.ncols/groups):(g+1)*(B.ncols/groups)] sub-matrix.
*/
void spmdm_ref(
int M,
@@ -133,7 +137,8 @@ void spmdm_ref(
CompressedSparseColumn& B,
bool accumulation,
std::int32_t* C,
- int ldc);
+ int ldc,
+ int groups = 1);
/*
* @brief Trim a 32-bit integer to a 16-bit integer.
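The shapes implied by the new parameter, with a call sketch mirroring the SpMDM test further down (variable names M, n, k_per_group, Aint8, Cint32 are placeholders):

    // A:     M x (groups * k_per_group) dense uint8_t matrix, lda = groups * k_per_group
    // B_csc: k_per_group rows, groups * n columns; group g owns columns [g*n, (g+1)*n)
    // C:     M x (groups * n) int32_t accumulator, ldc = groups * n
    fbgemm::CompressedSparseColumn B_csc(k_per_group, groups * n);
    // ... populate B_csc's ColPtr()/RowIdx()/Values() ...
    spmdm_ref(M, Aint8.data(), groups * k_per_group, B_csc,
              /*accumulation=*/true, Cint32.data(), groups * n, groups);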
diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc
index 391b993..bab3e72 100644
--- a/test/Im2ColFusedRequantizeTest.cc
+++ b/test/Im2ColFusedRequantizeTest.cc
@@ -135,7 +135,11 @@ TEST(FBGemmIm2colTest, Acc32Test) {
PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, int32_t> packA(
- conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data());
+ conv_p,
+ Aint8.data(),
+ nullptr,
+ Aint8_zero_point,
+ row_offset_buf.data());
PackBMatrix<int8_t, int32_t> packedB(
matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
@@ -254,7 +258,11 @@ TEST(FBGemmIm2colTest, Acc16Test) {
PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, int16_t> packA(
- conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data());
+ conv_p,
+ Aint8.data(),
+ nullptr,
+ Aint8_zero_point,
+ row_offset_buf.data());
PackBMatrix<int8_t, int16_t> packedB(
matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
@@ -452,7 +460,11 @@ TEST(FBGemmIm2colTest, 3DAcc32Test) {
PackAWithIm2Col<uint8_t, int32_t, 3>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, int32_t, 3> packA(
- conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data());
+ conv_p,
+ Aint8.data(),
+ nullptr,
+ Aint8_zero_point,
+ row_offset_buf.data());
PackBMatrix<int8_t, int32_t> packedB(
matrix_op_t::NoTranspose,
@@ -588,7 +600,11 @@ TEST(FBGemmIm2colTest, 3DAcc16Test) {
PackAWithIm2Col<uint8_t, int16_t, 3>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, int16_t, 3> packA(
- conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data());
+ conv_p,
+ Aint8.data(),
+ nullptr,
+ Aint8_zero_point,
+ row_offset_buf.data());
PackBMatrix<int8_t, int16_t> packedB(
matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc
index 82ae96f..7f0a7c7 100644
--- a/test/PackedRequantizeAcc16Test.cc
+++ b/test/PackedRequantizeAcc16Test.cc
@@ -82,141 +82,169 @@ TEST_P(fbgemmu8s8acc16test, Test) {
tie(atrans, btrans, test_ld) = GetParam();
for (auto shape : shapes) {
- int m = shape[0];
- int n = shape[1];
- int k = shape[2];
-
- aligned_vector<uint8_t> Aint8(m * k, 0);
- aligned_vector<int8_t> Bint8(k * n, 0);
- aligned_vector<int8_t> Bint8_ref(k * n, 0);
- aligned_vector<int32_t> Cint32_local(m * n, 0);
- aligned_vector<int32_t> Cint32_buffer(m * n, 0);
- aligned_vector<int32_t> Cint32_fb(m * n, 0);
- aligned_vector<uint8_t> Cint8_fb(m * n, 0);
- aligned_vector<uint8_t> Cint8_local(m * n, 0);
-
- randFill(Aint8, 0, 255);
- int32_t Aint8_zero_point = 43;
-
- randFill(Bint8_ref, -128, 127);
-
- for (auto i = 0; i < Bint8.size(); ++i) {
- Bint8[i] = Bint8_ref[i];
- }
-
- if (btrans == matrix_op_t::Transpose) {
- transpose_matrix(Bint8.data(), k, n);
- }
-
- int32_t Bint8_zero_point = -30;
- // To test lda != k , we just reduce k by half and use the original k
- // as lda.
- int k_adjusted = k;
- int n_adjusted = n;
- if (test_ld) {
- assert(
- atrans == matrix_op_t::NoTranspose && "This case is not handled yet");
- k_adjusted = std::max(k / 2, 1);
- if (btrans == matrix_op_t::NoTranspose) {
- n_adjusted = std::max(n / 2, 1);
+ for (int groups : {1, 3, 4}) {
+ int m = shape[0];
+ int n = shape[1];
+ int k = shape[2];
+ if (k % groups != 0) {
+ continue;
}
- }
-
- // computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n_adjusted);
- col_offsets_with_zero_pt_s8acc32_ref(
- k_adjusted,
- n_adjusted,
- n,
- Bint8_ref.data(),
- Bint8_zero_point,
- col_offsets.data());
-
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
-
- float C_multiplier = 0.1234;
- int32_t C_zero_pt = 5;
-
- int brow = 256;
- matmul_u8i8acc16_ref(
- m,
- n_adjusted,
- k_adjusted,
- k,
- n,
- n,
- brow,
- Aint8.data(),
- Bint8_ref.data(),
- Cint32_local.data());
-
- row_offsets_u8acc32_ref(m, k_adjusted, k, Aint8.data(), row_offsets.data());
-
- requantize_u8acc32_ref(
- m,
- n_adjusted,
- n,
- Cint32_local.data(),
- Cint8_local.data(),
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- row_offsets.data(),
- col_offsets.data(),
- nullptr);
-
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
- PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
-
- PackAWithRowOffset<uint8_t, int16_t> packAN(
- matrix_op_t::NoTranspose,
- m,
- k_adjusted,
- Aint8.data(),
- k,
- nullptr,
- 1,
- Aint8_zero_point,
- row_offset_buf.data());
-
- PackBMatrix<int8_t, int16_t> packedBN(
- btrans,
- k_adjusted,
- n_adjusted,
- Bint8.data(),
- (btrans == matrix_op_t::Transpose) ? k : n,
- nullptr,
- 1,
- Bint8_zero_point);
-
- DoNothing<> doNothingObj{};
- ReQuantizeOutput<false> outputProcObj(
- doNothingObj,
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- nullptr);
-
- fbgemmPacked(
- packAN,
- packedBN,
- Cint8_fb.data(),
- Cint32_buffer.data(),
- n,
- outputProcObj,
- 0,
- 1);
-
- compare_validate_buffers(
- Cint8_local.data(), Cint8_fb.data(), m, n, n, static_cast<uint8_t>(0));
- }
+ int k_per_group = k / groups;
+
+ aligned_vector<uint8_t> Aint8(m * k, 0);
+
+ aligned_vector<int8_t> Bint8(k * n, 0);
+ aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+
+ aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
+
+ randFill(Aint8, 0, 255);
+ int32_t Aint8_zero_point = 43;
+
+ randFill(Bint8_ref, -128, 127);
+ Bint8 = Bint8_ref;
+
+ if (btrans == matrix_op_t::Transpose) {
+ aligned_vector<int8_t> Bint8_temp(Bint8.size());
+ for (int g = 0; g < groups; ++g) {
+ transpose_matrix(
+ k_per_group,
+ n,
+ Bint8.data() + g * k_per_group * n,
+ n,
+ Bint8_temp.data() + g * k_per_group,
+ groups * k_per_group);
+ }
+ Bint8 = Bint8_temp;
+ }
+
+ int32_t Bint8_zero_point = -30;
+ // To test a leading dimension different from the number of columns used,
+ // we reduce n by half and keep the original n as the leading dimension.
+ int n_adjusted = n;
+ if (test_ld) {
+ assert(
+ atrans == matrix_op_t::NoTranspose &&
+ "This case is not handled yet");
+ if (btrans == matrix_op_t::NoTranspose) {
+ n_adjusted = std::max(n / 2, 1);
+ }
+ }
+
+ // computing column offset
+ vector<int32_t> col_offsets;
+ col_offsets.resize(groups * n_adjusted);
+ for (int g = 0; g < groups; ++g) {
+ col_offsets_with_zero_pt_s8acc32_ref(
+ k_per_group,
+ n_adjusted,
+ n,
+ Bint8_ref.data() + g * k_per_group * n,
+ Bint8_zero_point,
+ col_offsets.data() + g * n_adjusted);
+ }
+
+ vector<int32_t> row_offsets;
+ row_offsets.resize(m);
+
+ float C_multiplier = 0.1234;
+ int32_t C_zero_pt = 5;
+
+ int brow = 256;
+ for (int g = 0; g < groups; ++g) {
+ matmul_u8i8acc16_ref(
+ m,
+ n_adjusted,
+ k_per_group,
+ k,
+ n,
+ groups * n,
+ brow,
+ Aint8.data() + g * k_per_group,
+ Bint8_ref.data() + g * k_per_group * n,
+ Cint32_ref.data() + g * n_adjusted);
+
+ row_offsets_u8acc32_ref(
+ m,
+ k_per_group,
+ k,
+ Aint8.data() + g * k_per_group,
+ row_offsets.data());
+
+ requantize_u8acc32_ref(
+ m,
+ n_adjusted,
+ groups * n,
+ Cint32_ref.data() + g * n_adjusted,
+ Cint8_ref.data() + g * n_adjusted,
+ C_multiplier,
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point,
+ row_offsets.data(),
+ col_offsets.data() + g * n_adjusted,
+ nullptr);
+ }
+
+ vector<int32_t> row_offset_buf;
+ row_offset_buf.resize(
+ PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
+
+ PackAWithRowOffset<uint8_t, int16_t> packAN(
+ matrix_op_t::NoTranspose,
+ m,
+ k,
+ Aint8.data(),
+ k,
+ nullptr,
+ groups,
+ Aint8_zero_point,
+ row_offset_buf.data());
+
+ PackBMatrix<int8_t, int16_t> packedBN(
+ btrans,
+ k,
+ n_adjusted,
+ Bint8.data(),
+ (btrans == matrix_op_t::Transpose) ? k : n,
+ nullptr,
+ groups,
+ Bint8_zero_point);
+
+ DoNothing<> doNothingObj{};
+ ReQuantizeOutput<false> outputProcObj(
+ doNothingObj,
+ C_multiplier,
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point,
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr);
+
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ 0,
+ 1);
+
+ compare_validate_buffers(
+ Cint8_ref.data(),
+ Cint8_fb.data(),
+ m,
+ groups * n_adjusted,
+ groups * n,
+ static_cast<uint8_t>(0));
+ } // for each groups
+ } // for each shape
}
/**
@@ -230,180 +258,244 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
tie(atrans, btrans, test_ld) = GetParam();
for (auto shape : shapes) {
- int m = shape[0];
- int n = shape[1];
- int k = shape[2];
-
- aligned_vector<uint8_t> Aint8(m * k, 0);
- aligned_vector<int8_t> Bint8(k * n, 0);
- aligned_vector<int8_t> Bint8_ref(k * n, 0);
- aligned_vector<int32_t> Cint32_local(m * n, 0);
- aligned_vector<int32_t> Cint32_buffer(m * n, 0);
- aligned_vector<int32_t> Cint32_fb(m * n, 0);
- aligned_vector<uint8_t> Cint8_fb(m * n, 0);
- aligned_vector<uint8_t> Cint8_local(m * n, 0);
-
- randFill(Aint8, 0, 255);
- int32_t Aint8_zero_point = 43;
-
- randFill(Bint8, -128, 127);
-
- // To test lda != k , we just reduce k by half and use the original k
- // as lda.
- int k_adjusted = k;
- int n_adjusted = n;
- if (test_ld) {
- assert(
- atrans == matrix_op_t::NoTranspose && "This case is not handled yet");
- k_adjusted = std::max(k / 2, 1);
- if (btrans == matrix_op_t::NoTranspose) {
- n_adjusted = std::max(n / 2, 1);
- }
- }
-
- int32_t Bint8_zero_point = -30;
- // computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n_adjusted);
- col_offsets_with_zero_pt_s8acc32_ref(
- k_adjusted,
- n_adjusted,
- n,
- Bint8.data(),
- Bint8_zero_point,
- col_offsets.data());
-
- CompressedSparseColumn B_csc(k_adjusted, n_adjusted);
- float density = 0.001f;
- // deterministic random number
- default_random_engine eng;
- binomial_distribution<> per_col_nnz_dist(k_adjusted, density);
- uniform_int_distribution<> value_dist(
- numeric_limits<int8_t>::min() / 2, numeric_limits<int8_t>::max() / 2);
-
- vector<int> row_indices(k_adjusted);
- int total_nnz = 0;
- for (int j = 0; j < n_adjusted; ++j) {
- B_csc.ColPtr()[j] = total_nnz;
-
- int nnz_of_j = per_col_nnz_dist(eng);
- total_nnz += nnz_of_j;
-
- iota(row_indices.begin(), row_indices.end(), 0);
- shuffle(row_indices.begin(), row_indices.end(), eng);
- sort(row_indices.begin(), row_indices.begin() + nnz_of_j);
-
- for (int kidx = 0; kidx < nnz_of_j; ++kidx) {
- B_csc.RowIdx().push_back(row_indices[kidx]);
- // put the current B value
- B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]);
- // make current B value zero
- Bint8[row_indices[kidx] * n + j] = 0;
- }
- }
- B_csc.ColPtr()[n_adjusted] = total_nnz;
-
- for (auto i = 0; i < Bint8.size(); ++i) {
- Bint8_ref[i] = Bint8[i];
- }
-
- if (btrans == matrix_op_t::Transpose) {
- transpose_matrix(Bint8.data(), k, n);
- }
-
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
-
- float C_multiplier = 0.1234;
- int32_t C_zero_pt = 5;
-
- int brow = 256;
- matmul_u8i8acc16_ref(
- m,
- n_adjusted,
- k_adjusted,
- k,
- n,
- n,
- brow,
- Aint8.data(),
- Bint8_ref.data(),
- Cint32_local.data());
-
- bool accumulation = true;
- spmdm_ref(m, Aint8.data(), k, B_csc, accumulation, Cint32_local.data(), n);
-
- row_offsets_u8acc32_ref(m, k_adjusted, k, Aint8.data(), row_offsets.data());
-
- requantize_u8acc32_ref(
- m,
- n_adjusted,
- n,
- Cint32_local.data(),
- Cint8_local.data(),
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- row_offsets.data(),
- col_offsets.data(),
- nullptr);
-
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
- PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
-
- PackAWithRowOffset<uint8_t, int16_t> packAN(
- matrix_op_t::NoTranspose,
- m,
- k_adjusted,
- Aint8.data(),
- k,
- nullptr,
- 1,
- Aint8_zero_point,
- row_offset_buf.data());
-
- // spmdm -> requantization -> nothing
- // construct an output processing pipeline in reverse order
- // i.e. last output operation first
- // Last operation should always be DoNothing with
- // correct input and output type.
- DoNothing<> doNothingObj{};
- // The second last operation is requantization back
- // to int8
- ReQuantizeOutput<false> reqObj(
- doNothingObj,
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- nullptr);
- // the top most (first) operation in the output processing
- // pipeline is spmdm
- // outType = final output type after fullly processing through pipeline
- // inType = initial input type at the first call to the whole pipeline
- DoSpmdmOnInpBuffer<
- ReQuantizeOutput<false>::outType,
- int32_t,
- ReQuantizeOutput<false>>
- spmdmObj(reqObj, Aint8.data(), k, B_csc);
-
- PackBMatrix<int8_t, int16_t> packedB(
- btrans,
- k_adjusted,
- n_adjusted,
- Bint8.data(),
- (btrans == matrix_op_t::Transpose) ? k : n);
-
- fbgemmPacked(
- packAN, packedB, Cint8_fb.data(), Cint32_fb.data(), n, spmdmObj, 0, 1);
-
- compare_validate_buffers(
- Cint8_local.data(), Cint8_fb.data(), m, n, n, static_cast<uint8_t>(0));
- }
+ for (int groups : {1, 3, 4}) {
+ // very small density to test the hyper-sparsity case
+ // moderate density to test the implementation using transpose
+ for (float density : {0.0001f, 0.1f}) {
+ int m = shape[0];
+ int n = shape[1];
+ int k = shape[2];
+ if (k % groups != 0) {
+ continue;
+ }
+ int k_per_group = k / groups;
+
+ aligned_vector<uint8_t> Aint8(m * k, 0);
+
+ aligned_vector<int8_t> Bint8(k * n, 0);
+ aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+
+ aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
+
+ randFill(Aint8, 0, 255);
+ int32_t Aint8_zero_point = 43;
+
+ randFill(Bint8, -128, 127);
+
+ // To test a leading dimension different from the number of columns used,
+ // we reduce n by half and keep the original n as the leading dimension.
+ int n_adjusted = n;
+ if (test_ld) {
+ assert(
+ atrans == matrix_op_t::NoTranspose &&
+ "This case is not handled yet");
+ if (btrans == matrix_op_t::NoTranspose) {
+ n_adjusted = std::max(n / 2, 1);
+ }
+ }
+
+ int32_t Bint8_zero_point = -30;
+ // computing column offset
+ vector<int32_t> col_offsets;
+ col_offsets.resize(groups * n_adjusted);
+ for (int g = 0; g < groups; ++g) {
+ col_offsets_with_zero_pt_s8acc32_ref(
+ k_per_group,
+ n_adjusted,
+ n,
+ Bint8_ref.data() + g * k_per_group * n,
+ Bint8_zero_point,
+ col_offsets.data() + g * n_adjusted);
+ }
+
+ CompressedSparseColumn B_csc(k_per_group, groups * n_adjusted);
+ // Make sure density is big enough. Otherwise, we're not really testing
+ // spmdm.
+ // deterministic random number
+ default_random_engine eng;
+ binomial_distribution<> per_col_nnz_dist(k_per_group, density);
+
+ vector<int> row_indices(k_per_group);
+ int total_nnz = 0;
+ for (int g = 0; g < groups; ++g) {
+ for (int j = 0; j < n_adjusted; ++j) {
+ B_csc.ColPtr()[g * n_adjusted + j] = total_nnz;
+
+ int nnz_of_j = per_col_nnz_dist(eng);
+ total_nnz += nnz_of_j;
+
+ iota(row_indices.begin(), row_indices.end(), 0);
+ shuffle(row_indices.begin(), row_indices.end(), eng);
+ sort(row_indices.begin(), row_indices.begin() + nnz_of_j);
+
+ for (int kidx = 0; kidx < nnz_of_j; ++kidx) {
+ int rowidx = row_indices[kidx];
+ B_csc.RowIdx().push_back(rowidx);
+ int8_t* bptr = &Bint8[(g * k_per_group + rowidx) * n + j];
+ int b_remainder = 0;
+ if (kidx % 2 == 1) {
+ // Make sure abs(b_prev + *bptr - b_remainder) <= 128
+ int b_prev = B_csc.Values().back();
+ b_remainder = std::max(b_prev + *bptr - 128, b_remainder);
+ b_remainder = std::min(b_prev + *bptr + 128, b_remainder);
+ }
+ // put a portion of the current B value that won't saturate
+ // _mm256_maddubs_epi16.
+ B_csc.Values().push_back(*bptr - b_remainder);
+ // put the remainder
+ *bptr = b_remainder;
+ }
+ }
+ }
+ B_csc.ColPtr()[groups * n_adjusted] = total_nnz;
+
+ Bint8_ref = Bint8;
+
+ if (btrans == matrix_op_t::Transpose) {
+ aligned_vector<int8_t> Bint8_temp(Bint8.size());
+ for (int g = 0; g < groups; ++g) {
+ transpose_matrix(
+ k_per_group,
+ n,
+ Bint8.data() + g * k_per_group * n,
+ n,
+ Bint8_temp.data() + g * k_per_group,
+ groups * k_per_group);
+ }
+ Bint8 = Bint8_temp;
+ }
+
+ vector<int32_t> row_offsets;
+ row_offsets.resize(m);
+
+ float C_multiplier = 0.1234;
+ int32_t C_zero_pt = 5;
+
+ int brow = 256;
+ for (int g = 0; g < groups; ++g) {
+ matmul_u8i8acc16_ref(
+ m,
+ n_adjusted,
+ k_per_group,
+ k,
+ n,
+ groups * n,
+ brow,
+ Aint8.data() + g * k_per_group,
+ Bint8_ref.data() + g * k_per_group * n,
+ Cint32_ref.data() + g * n_adjusted);
+ }
+
+ bool accumulation = true;
+ spmdm_ref(
+ m,
+ Aint8.data(),
+ k,
+ B_csc,
+ accumulation,
+ Cint32_ref.data(),
+ groups * n,
+ groups);
+
+ for (int g = 0; g < groups; ++g) {
+ row_offsets_u8acc32_ref(
+ m,
+ k_per_group,
+ k,
+ Aint8.data() + g * k_per_group,
+ row_offsets.data());
+
+ requantize_u8acc32_ref(
+ m,
+ n_adjusted,
+ groups * n,
+ Cint32_ref.data() + g * n_adjusted,
+ Cint8_ref.data() + g * n_adjusted,
+ C_multiplier,
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point,
+ row_offsets.data(),
+ col_offsets.data() + g * n_adjusted,
+ nullptr);
+ }
+
+ vector<int32_t> row_offset_buf;
+ row_offset_buf.resize(
+ PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
+
+ PackAWithRowOffset<uint8_t, int16_t> packAN(
+ matrix_op_t::NoTranspose,
+ m,
+ k,
+ Aint8.data(),
+ k,
+ nullptr,
+ groups,
+ Aint8_zero_point,
+ row_offset_buf.data());
+
+ // spmdm -> requantization -> nothing
+ // construct an output processing pipeline in reverse order
+ // i.e. last output operation first
+ // Last operation should always be DoNothing with
+ // correct input and output type.
+ DoNothing<> doNothingObj{};
+ // The second last operation is requantization back
+ // to int8
+ ReQuantizeOutput<false> reqObj(
+ doNothingObj,
+ C_multiplier,
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point,
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr);
+ // the top most (first) operation in the output processing
+ // pipeline is spmdm
+ // outType = final output type after fullly processing through pipeline
+ // inType = initial input type at the first call to the whole pipeline
+ DoSpmdmOnInpBuffer<
+ ReQuantizeOutput<false>::outType,
+ int32_t,
+ ReQuantizeOutput<false>>
+ spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);
+
+ PackBMatrix<int8_t, int16_t> packedB(
+ btrans,
+ k,
+ n_adjusted,
+ Bint8.data(),
+ (btrans == matrix_op_t::Transpose) ? k : n,
+ nullptr,
+ groups,
+ Bint8_zero_point);
+
+ fbgemmPacked(
+ packAN,
+ packedB,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ groups * n,
+ spmdmObj,
+ 0,
+ 1);
+
+ compare_validate_buffers(
+ Cint8_ref.data(),
+ Cint8_fb.data(),
+ m,
+ groups * n_adjusted,
+ groups * n,
+ static_cast<uint8_t>(0));
+ } // for each density
+ } // for each groups
+ } // for each shape
}
/**
@@ -417,119 +509,140 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
tie(atrans, btrans, test_ld) = GetParam();
for (auto shape : shapes) {
- int m = shape[0];
- int n = shape[1];
- int k = shape[2];
-
- aligned_vector<uint8_t> Aint8(m * k, 0);
- aligned_vector<int8_t> Bint8(k * n, 0);
- aligned_vector<int8_t> Bint8_ref(k * n, 0);
- aligned_vector<int32_t> Cint32_local(m * n, 0);
- aligned_vector<int32_t> Cint32_buffer(m * n, 0);
- aligned_vector<int32_t> Cint32_fb(m * n, 0);
- aligned_vector<uint8_t> Cint8_fb(m * n, 0);
- aligned_vector<uint8_t> Cint8_local(m * n, 0);
-
- randFill(Aint8, 0, 255);
- int32_t Aint8_zero_point = 43;
-
- randFill(Bint8_ref, -128, 127);
-
- for (auto i = 0; i < Bint8.size(); ++i) {
- Bint8[i] = Bint8_ref[i];
- }
-
- if (btrans == matrix_op_t::Transpose) {
- transpose_matrix(Bint8.data(), k, n);
- }
-
- int32_t Bint8_zero_point = -30;
- // To test lda != k , we just reduce k by half and use the original k
- // as lda.
- int k_adjusted = k;
- int n_adjusted = n;
- if (test_ld) {
- assert(
- atrans == matrix_op_t::NoTranspose && "This case is not handled yet");
- k_adjusted = std::max(k / 2, 1);
- if (btrans == matrix_op_t::NoTranspose) {
- n_adjusted = std::max(n / 2, 1);
+ for (int groups : {1, 3, 4}) {
+ int m = shape[0];
+ int n = shape[1];
+ int k = shape[2];
+ if (k % groups != 0) {
+ continue;
+ }
+ int k_per_group = k / groups;
+
+ aligned_vector<uint8_t> Aint8(m * k, 0);
+
+ aligned_vector<int8_t> Bint8(k * n, 0);
+ aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+
+ aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
+
+ randFill(Aint8, 0, 255);
+ int32_t Aint8_zero_point = 43;
+
+ randFill(Bint8_ref, -128, 127);
+ Bint8 = Bint8_ref;
+
+ if (btrans == matrix_op_t::Transpose) {
+ aligned_vector<int8_t> Bint8_temp(Bint8.size());
+ for (int g = 0; g < groups; ++g) {
+ transpose_matrix(
+ k_per_group,
+ n,
+ Bint8.data() + g * k_per_group * n,
+ n,
+ Bint8_temp.data() + g * k_per_group,
+ groups * k_per_group);
+ }
+ Bint8 = Bint8_temp;
}
- }
-
- // computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n_adjusted);
- col_offsets_with_zero_pt_s8acc32_ref(
- k_adjusted,
- n_adjusted,
- n,
- Bint8_ref.data(),
- Bint8_zero_point,
- col_offsets.data());
-
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
-
- int brow = 256;
- matmul_u8i8acc16_ref(
- m,
- n_adjusted,
- k_adjusted,
- k,
- n,
- n,
- brow,
- Aint8.data(),
- Bint8_ref.data(),
- Cint32_local.data());
-
- row_offsets_u8acc32_ref(m, k_adjusted, k, Aint8.data(), row_offsets.data());
-
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
- PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
-
- PackAWithRowOffset<uint8_t, int16_t> packAN(
- matrix_op_t::NoTranspose,
- m,
- k_adjusted,
- Aint8.data(),
- k,
- nullptr,
- 1,
- Aint8_zero_point,
- row_offset_buf.data());
-
- PackBMatrix<int8_t, int16_t> packedBN(
- btrans,
- k_adjusted,
- n_adjusted,
- Bint8.data(),
- (btrans == matrix_op_t::Transpose) ? k : n,
- nullptr,
- 1,
- Bint8_zero_point);
-
- // DoNothing<> doNothingObj{};
- DoNothing<int32_t, int32_t> doNothingObj{};
- memCopy<> outputProcObj(doNothingObj);
- fbgemmPacked(
- packAN,
- packedBN,
- Cint32_fb.data(),
- Cint32_buffer.data(),
- n,
- outputProcObj,
- 0,
- 1);
-
- compare_validate_buffers(
- Cint32_local.data(),
- Cint32_fb.data(),
- m,
- n,
- n,
- static_cast<int32_t>(0));
- }
+
+ int32_t Bint8_zero_point = -30;
+ // To test a leading dimension different from the number of columns used,
+ // we reduce n by half and keep the original n as the leading dimension.
+ int n_adjusted = n;
+ if (test_ld) {
+ assert(
+ atrans == matrix_op_t::NoTranspose &&
+ "This case is not handled yet");
+ if (btrans == matrix_op_t::NoTranspose) {
+ n_adjusted = std::max(n / 2, 1);
+ }
+ }
+
+ // computing column offset
+ vector<int32_t> col_offsets;
+ col_offsets.resize(groups * n_adjusted);
+ for (int g = 0; g < groups; ++g) {
+ col_offsets_with_zero_pt_s8acc32_ref(
+ k_per_group,
+ n_adjusted,
+ n,
+ Bint8_ref.data() + g * k_per_group * n,
+ Bint8_zero_point,
+ col_offsets.data() + g * n_adjusted);
+ }
+
+ vector<int32_t> row_offsets;
+ row_offsets.resize(m);
+
+ int brow = 256;
+ for (int g = 0; g < groups; ++g) {
+ matmul_u8i8acc16_ref(
+ m,
+ n_adjusted,
+ k_per_group,
+ k,
+ n,
+ groups * n,
+ brow,
+ Aint8.data() + g * k_per_group,
+ Bint8_ref.data() + g * k_per_group * n,
+ Cint32_ref.data() + g * n_adjusted);
+
+ row_offsets_u8acc32_ref(
+ m,
+ k_per_group,
+ k,
+ Aint8.data() + g * k_per_group,
+ row_offsets.data());
+ }
+
+ vector<int32_t> row_offset_buf;
+ row_offset_buf.resize(
+ PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
+
+ PackAWithRowOffset<uint8_t, int16_t> packAN(
+ matrix_op_t::NoTranspose,
+ m,
+ k,
+ Aint8.data(),
+ k,
+ nullptr,
+ groups,
+ Aint8_zero_point,
+ row_offset_buf.data());
+
+ PackBMatrix<int8_t, int16_t> packedBN(
+ btrans,
+ k,
+ n_adjusted,
+ Bint8.data(),
+ (btrans == matrix_op_t::Transpose) ? k : n,
+ nullptr,
+ groups,
+ Bint8_zero_point);
+
+ // DoNothing<> doNothingObj{};
+ DoNothing<int32_t, int32_t> doNothingObj{};
+ memCopy<> outputProcObj(doNothingObj);
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint32_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ 0,
+ 1);
+
+ compare_validate_buffers(
+ Cint32_ref.data(),
+ Cint32_fb.data(),
+ m,
+ groups * n_adjusted,
+ groups * n,
+ static_cast<int32_t>(0));
+ } // for each groups
+ } // for each shape
}
diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc
index e0c9850..7653989 100644
--- a/test/PackedRequantizeTest.cc
+++ b/test/PackedRequantizeTest.cc
@@ -83,145 +83,191 @@ TEST_P(fbgemmu8s8acc32test, Test) {
tie(atrans, btrans, test_ld) = GetParam();
for (auto shape : shapes) {
- int m = shape[0];
- int n = shape[1];
- int k = shape[2];
-
- aligned_vector<uint8_t> Aint8(m * k, 0);
-
- // nxk matrix
- aligned_vector<int8_t> Bint8(k * n, 0);
- // kxn matrix
- aligned_vector<int8_t> Bint8_ref(k * n, 0);
-
- aligned_vector<int32_t> Cint32_ref(m * n, 0.0f);
- aligned_vector<int32_t> Cint32_fb(m * n, 0);
- aligned_vector<uint8_t> Cint8_fb(m * n, 0);
- aligned_vector<int32_t> Cint32_local(m * n, 0);
- aligned_vector<int32_t> Cint32_buffer(m * n, 0);
- aligned_vector<uint8_t> Cint8_local(m * n, 0);
-
- randFill(Aint8, 0, 255);
- int32_t Aint8_zero_point = 43;
-
- randFill(Bint8_ref, -128, 127);
- avoidOverflow(m, n, k, Aint8.data(), Bint8_ref.data());
-
- for (auto i = 0; i < Bint8.size(); ++i) {
- Bint8[i] = Bint8_ref[i];
- }
-
- if (btrans == matrix_op_t::Transpose) {
- transpose_matrix(Bint8.data(), k, n);
- }
-
- int32_t Bint8_zero_point = -30;
- // To test lda != k , we just reduce k by half and use the original k
- // as lda.
- int k_adjusted = k;
- int n_adjusted = n;
- if (test_ld) {
- assert(
- atrans == matrix_op_t::NoTranspose && "This case is not handled yet");
- k_adjusted = std::max(k / 2, 1);
- if (btrans == matrix_op_t::NoTranspose) {
- n_adjusted = std::max(n / 2, 1);
- }
- }
-
- // computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n_adjusted);
- col_offsets_with_zero_pt_s8acc32_ref(
- k_adjusted,
- n_adjusted,
- n,
- Bint8_ref.data(),
- Bint8_zero_point,
- col_offsets.data());
-
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
-
- float C_multiplier = 0.1234;
- int32_t C_zero_pt = 5;
-
- matmul_u8i8acc32_ref(
- m,
- n_adjusted,
- k_adjusted,
- k,
- n,
- n,
- Aint8.data(),
- Bint8_ref.data(),
- Cint32_local.data());
-
- row_offsets_u8acc32_ref(m, k_adjusted, k, Aint8.data(), row_offsets.data());
-
- requantize_u8acc32_ref(
- m,
- n_adjusted,
- n,
- Cint32_local.data(),
- Cint8_local.data(),
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- row_offsets.data(),
- col_offsets.data(),
- nullptr);
-
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
-
- PackAWithRowOffset<uint8_t> packAN(
- matrix_op_t::NoTranspose,
- m,
- k_adjusted,
- Aint8.data(),
- k,
- nullptr,
- 1,
- Aint8_zero_point,
- row_offset_buf.data());
-
- PackBMatrix<int8_t> packedBN(
- btrans,
- k_adjusted,
- n_adjusted,
- Bint8.data(),
- (btrans == matrix_op_t::Transpose) ? k : n,
- nullptr,
- 1,
- Bint8_zero_point);
-
- DoNothing<> doNothingObj{};
- ReQuantizeOutput<false> outputProcObj(
- doNothingObj,
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- nullptr);
-
- fbgemmPacked(
- packAN,
- packedBN,
- Cint8_fb.data(),
- Cint32_buffer.data(),
- n,
- outputProcObj,
- 0,
- 1);
- // printMatrix(matrix_op_t::NoTranspose, Cint32_local.data(),
- // m, n_adjusted, n, "C local");
- compare_validate_buffers(
- Cint8_local.data(), Cint8_fb.data(), m, n, n, static_cast<uint8_t>(0));
- }
+ for (int groups : {1, 3, 4}) {
+ for (bool test_bias: {false, true}) {
+ int m = shape[0];
+ int n = shape[1];
+ int k = shape[2];
+ if (k % groups != 0) {
+ continue;
+ }
+ int k_per_group = k / groups;
+
+ // mxk matrix
+ aligned_vector<uint8_t> Aint8(m * k, 0);
+
+ // kxn matrix
+ aligned_vector<int8_t> Bint8(k * n, 0);
+ aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+
+ aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
+
+ randFill(Aint8, 0, 255);
+ int32_t Aint8_zero_point = 43;
+
+ randFill(Bint8_ref, -128, 127);
+ for (int g = 0; g < groups; ++g) {
+ avoidOverflow(
+ m,
+ n,
+ k_per_group,
+ Aint8.data() + g * k_per_group,
+ k,
+ Bint8_ref.data() + g * k_per_group * n,
+ n);
+ }
+
+ Bint8 = Bint8_ref;
+
+ // initialize bias
+ aligned_vector<int32_t> bias_int32(groups * n);
+ int32_t* bias = nullptr;
+ if (test_bias) {
+ randFill(bias_int32, -128, 127);
+ bias = bias_int32.data();
+ }
+
+ if (btrans == matrix_op_t::Transpose) {
+ aligned_vector<int8_t> Bint8_temp(Bint8.size());
+ for (int g = 0; g < groups; ++g) {
+ transpose_matrix(
+ k_per_group,
+ n,
+ Bint8.data() + g * k_per_group * n,
+ n,
+ Bint8_temp.data() + g * k_per_group,
+ groups * k_per_group);
+ }
+ Bint8 = Bint8_temp;
+ }
+
+ int32_t Bint8_zero_point = -30;
+      // To test ld != n, we just reduce n by half and keep the original n
+      // as the leading dimension.
+ int n_adjusted = n;
+ if (test_ld) {
+ assert(
+ atrans == matrix_op_t::NoTranspose &&
+ "This case is not handled yet");
+ if (btrans == matrix_op_t::NoTranspose) {
+ n_adjusted = std::max(n / 2, 1);
+ }
+ }
+
+ // computing column offset
+ vector<int32_t> col_offsets;
+ col_offsets.resize(groups * n_adjusted);
+ for (int g = 0; g < groups; ++g) {
+ col_offsets_with_zero_pt_s8acc32_ref(
+ k_per_group,
+ n_adjusted,
+ n,
+ Bint8_ref.data() + g * k_per_group * n,
+ Bint8_zero_point,
+ col_offsets.data() + g * n_adjusted);
+ }
+
+ vector<int32_t> row_offsets;
+ row_offsets.resize(m);
+
+ float C_multiplier = 0.001234;
+ int32_t C_zero_pt = 5;
+
+ for (int g = 0; g < groups; ++g) {
+ matmul_u8i8acc32_ref(
+ m,
+ n_adjusted,
+ k_per_group,
+ k,
+ n,
+ groups * n,
+ Aint8.data() + g * k_per_group,
+ Bint8_ref.data() + g * k_per_group * n,
+ Cint32_ref.data() + g * n_adjusted);
+
+ row_offsets_u8acc32_ref(
+ m,
+ k_per_group,
+ k,
+ Aint8.data() + g * k_per_group,
+ row_offsets.data());
+
+ requantize_u8acc32_ref(
+ m,
+ n_adjusted,
+ groups * n,
+ Cint32_ref.data() + g * n_adjusted,
+ Cint8_ref.data() + g * n_adjusted,
+ C_multiplier,
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point,
+ row_offsets.data(),
+ col_offsets.data() + g * n_adjusted,
+ bias ? (bias + g * n_adjusted) : nullptr);
+ }
+
+ vector<int32_t> row_offset_buf;
+ row_offset_buf.resize(
+ PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
+
+ PackAWithRowOffset<uint8_t> packAN(
+ matrix_op_t::NoTranspose,
+ m,
+ k,
+ Aint8.data(),
+ k,
+ nullptr,
+ groups,
+ Aint8_zero_point,
+ row_offset_buf.data());
+
+ PackBMatrix<int8_t> packedBN(
+ btrans,
+ k,
+ n_adjusted,
+ Bint8.data(),
+ (btrans == matrix_op_t::Transpose) ? k : n,
+ nullptr,
+ groups,
+ Bint8_zero_point);
+
+ DoNothing<> doNothingObj{};
+ ReQuantizeOutput<false> outputProcObj(
+ doNothingObj,
+ C_multiplier,
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point,
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ bias);
+
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ 0,
+ 1);
+      // printMatrix(matrix_op_t::NoTranspose, Cint32_ref.data(),
+      // m, groups * n_adjusted, groups * n, "C ref");
+ compare_validate_buffers(
+ Cint8_ref.data(),
+ Cint8_fb.data(),
+ m,
+ groups * n_adjusted,
+ groups * n,
+ static_cast<uint8_t>(0));
+ } // test_bias
+ } // for each groups
+ } // for each shape
}
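
The requantization reference above maps each 32-bit accumulator to uint8 using the multiplier, both zero points, the per-row sums of A, the per-column sums of B, and an optional bias. Below is a minimal sketch of that arithmetic with illustrative names; note that the _with_zero_pt_ column-offset helper used above appears to fold the B zero point into col_offsets, so the reference call only needs to subtract A_zp * col_offsets[j] and B_zp * row_offsets[i].

#include <algorithm>
#include <cmath>
#include <cstdint>

// Requantize one accumulator. The corrections implement (A - A_zp)*(B - B_zp):
// subtract A_zp * (column sum of B) and B_zp * (row sum of A), add back
// A_zp * B_zp * K, then scale, round, shift by the output zero point, and clamp.
inline uint8_t requantize_one(
    int32_t acc,       // sum_k A[i][k] * B[k][j]
    int32_t row_sum_A, // sum_k A[i][k]
    int32_t col_sum_B, // sum_k B[k][j]
    int K,
    int32_t A_zp, int32_t B_zp,
    float C_multiplier, int32_t C_zp,
    int32_t bias) {
  int32_t corrected =
      acc - A_zp * col_sum_B - B_zp * row_sum_A + A_zp * B_zp * K + bias;
  int32_t rounded =
      static_cast<int32_t>(std::nearbyint(corrected * C_multiplier)) + C_zp;
  return static_cast<uint8_t>(std::min(255, std::max(0, rounded)));
}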
/**
@@ -236,129 +282,163 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
tie(atrans, btrans, test_ld) = GetParam();
for (auto shape : shapes) {
- int m = shape[0];
- int n = shape[1];
- int k = shape[2];
-
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(m * k, 0);
-
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(k * n, 0);
-
- aligned_vector<float> Cfp32_ref(m * n, 0.0f);
- aligned_vector<float> Cfp32_fb(m * n, 0.0f);
-
- aligned_vector<uint8_t> Cint8_fb(m * n, 0);
- aligned_vector<int32_t> Cint32_buffer(m * n, 0);
-
- randFill(Aint8, 0, 255);
- int32_t Aint8_zero_point = 43;
- float Aint8_scale = 0.11;
- for (auto i = 0; i < Afp32.size(); ++i) {
- Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point);
- }
-
- randFill(Bint8, -128, 127);
- avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
- int32_t Bint8_zero_point = -30;
- float Bint8_scale = 0.49;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bfp32[i] = Bint8_scale * (Bint8[i] - Bint8_zero_point);
- }
-
- // To test lda != k , we just reduce k by half and use the original k
- // as lda.
- int k_adjusted = k;
- int n_adjusted = n;
- if (test_ld) {
- assert(
- atrans == matrix_op_t::NoTranspose && "This case is not handled yet");
- k_adjusted = std::max(k / 2, 1);
- if (btrans == matrix_op_t::NoTranspose) {
- n_adjusted = std::max(n / 2, 1);
+ for (int groups : {1, 3, 4}) {
+ int m = shape[0];
+ int n = shape[1];
+ int k = shape[2];
+ if (k % groups != 0) {
+ continue;
+ }
+ int k_per_group = k / groups;
+
+ aligned_vector<float> Afp32(m * k, 0.0f);
+ aligned_vector<uint8_t> Aint8(Afp32.size(), 0);
+
+ aligned_vector<float> Bfp32(k * n, 0.0f);
+ aligned_vector<int8_t> Bint8(Bfp32.size(), 0);
+
+ aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f);
+ aligned_vector<float> Cfp32_fb(Cfp32_ref.size(), 0.0f);
+
+ aligned_vector<uint8_t> Cint8_fb(Cfp32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_buffer(Cfp32_ref.size(), 0);
+
+ randFill(Aint8, 0, 255);
+ int32_t Aint8_zero_point = 43;
+ float Aint8_scale = 0.11;
+ for (auto i = 0; i < Afp32.size(); ++i) {
+ Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point);
+ }
+
+ randFill(Bint8, -128, 127);
+ for (int g = 0; g < groups; ++g) {
+ avoidOverflow(
+ m,
+ n,
+ k_per_group,
+ Aint8.data() + g * k_per_group,
+ k,
+ Bint8.data() + g * k_per_group * n,
+ n);
+ }
+ int32_t Bint8_zero_point = -30;
+ float Bint8_scale = 0.49;
+ for (auto i = 0; i < Bfp32.size(); ++i) {
+ Bfp32[i] = Bint8_scale * (Bint8[i] - Bint8_zero_point);
+ }
+
+      // To test ld != n, we just reduce n by half and keep the original n
+      // as the leading dimension.
+ int n_adjusted = n;
+ if (test_ld) {
+ assert(
+ atrans == matrix_op_t::NoTranspose &&
+ "This case is not handled yet");
+ if (btrans == matrix_op_t::NoTranspose) {
+ n_adjusted = std::max(n / 2, 1);
+ }
+ }
+
+ // computing column offset
+ vector<int32_t> col_offsets;
+ col_offsets.resize(groups * n_adjusted);
+ for (int g = 0; g < groups; ++g) {
+ col_offsets_with_zero_pt_s8acc32_ref(
+ k_per_group,
+ n_adjusted,
+ n,
+ Bint8.data() + g * k_per_group * n,
+ Bint8_zero_point,
+ col_offsets.data() + g * n_adjusted);
+ }
+
+ if (btrans == matrix_op_t::Transpose) {
+ aligned_vector<int8_t> Bint8_temp(Bint8.size());
+ for (int g = 0; g < groups; ++g) {
+ transpose_matrix(
+ k_per_group,
+ n,
+ Bint8.data() + g * k_per_group * n,
+ n,
+ Bint8_temp.data() + g * k_per_group,
+ groups * k_per_group);
+ }
+ Bint8 = Bint8_temp;
+ }
+
+ for (int g = 0; g < groups; ++g) {
+ matmul_fp_ref(
+ m,
+ n_adjusted,
+ k_per_group,
+ k,
+ n,
+ groups * n,
+ Afp32.data() + g * k_per_group,
+ Bfp32.data() + g * k_per_group * n,
+ Cfp32_ref.data() + g * n_adjusted);
}
- }
-
- // computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n_adjusted);
- col_offsets_with_zero_pt_s8acc32_ref(
- k_adjusted,
- n_adjusted,
- n,
- Bint8.data(),
- Bint8_zero_point,
- col_offsets.data());
-
- if (btrans == matrix_op_t::Transpose) {
- transpose_matrix(Bint8.data(), k, n);
- }
-
- matmul_fp_ref(
- m,
- n_adjusted,
- k_adjusted,
- k,
- n,
- n,
- Afp32.data(),
- Bfp32.data(),
- Cfp32_ref.data());
-
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
- PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
-
- PackAWithQuantRowOffset<uint8_t> packAN(
- matrix_op_t::NoTranspose,
- m,
- k_adjusted,
- Afp32.data(),
- k,
- nullptr, /*buffer for packed matrix*/
- Aint8_scale,
- Aint8_zero_point,
- 1, /*groups*/
- row_offset_buf.data());
-
- PackBMatrix<int8_t> packedBN(
- btrans,
- k_adjusted,
- n_adjusted,
- Bint8.data(),
- (btrans == matrix_op_t::Transpose) ? k : n,
- nullptr,
- 1,
- Bint8_zero_point);
-
- DoNothing<float, float> doNothingObj{};
- ReQuantizeForFloat<false> outputProcObj(
- doNothingObj,
- Aint8_scale,
- Bint8_scale,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- nullptr);
-
- fbgemmPacked(
- packAN,
- packedBN,
- Cfp32_fb.data(),
- (int32_t*)Cfp32_fb.data(),
- n,
- outputProcObj,
- 0,
- 1);
-
- float maximum = *max_element(Cfp32_ref.begin(), Cfp32_ref.end());
- float minimum = *min_element(Cfp32_ref.begin(), Cfp32_ref.end());
- float atol = (maximum - minimum) / 255 / 1.9;
-
- compare_validate_buffers(Cfp32_ref.data(), Cfp32_fb.data(), m, n, n, atol);
- }
+
+ vector<int32_t> row_offset_buf;
+ row_offset_buf.resize(
+ PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
+
+ PackAWithQuantRowOffset<uint8_t> packAN(
+ matrix_op_t::NoTranspose,
+ m,
+ k,
+ Afp32.data(),
+ k,
+ nullptr, /*buffer for packed matrix*/
+ Aint8_scale,
+ Aint8_zero_point,
+ groups,
+ row_offset_buf.data());
+
+ PackBMatrix<int8_t> packedBN(
+ btrans,
+ k,
+ n_adjusted,
+ Bint8.data(),
+ (btrans == matrix_op_t::Transpose) ? k : n,
+ nullptr,
+ groups,
+ Bint8_zero_point);
+
+ DoNothing<float, float> doNothingObj{};
+ ReQuantizeForFloat<false> outputProcObj(
+ doNothingObj,
+ Aint8_scale,
+ Bint8_scale,
+ Aint8_zero_point,
+ Bint8_zero_point,
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr);
+
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cfp32_fb.data(),
+ reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+ groups * n,
+ outputProcObj,
+ 0,
+ 1);
+
+ float maximum = *max_element(Cfp32_ref.begin(), Cfp32_ref.end());
+ float minimum = *min_element(Cfp32_ref.begin(), Cfp32_ref.end());
+ float atol = (maximum - minimum) / 255 / 1.9;
+
+ compare_validate_buffers(
+ Cfp32_ref.data(),
+ Cfp32_fb.data(),
+ m,
+ groups * n_adjusted,
+ groups * n,
+ atol);
+ } // for each groups
+ } // for each shape
}
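
The float-output test above tolerates an absolute error of (maximum - minimum) / 255 / 1.9, i.e. slightly more than half of one 8-bit quantization step over the observed output range, which is the rounding error a correct requantize/dequantize round trip may introduce. As a small illustration only:

// Illustration of the tolerance used above: one uint8 bin over the output
// range, and a little more than half a bin as the allowed absolute error.
float output_atol(float maximum, float minimum) {
  float step = (maximum - minimum) / 255.0f; // width of one quantization bin
  return step / 1.9f;                        // ~half a bin, with a little slack
}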
/**
@@ -373,253 +453,121 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
tie(atrans, btrans, test_ld) = GetParam();
for (auto shape : shapes) {
- int m = shape[0];
- int n = shape[1];
- int k = shape[2];
-
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(m * k, 0);
-
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(k * n, 0);
-
- aligned_vector<float> Cfp32_ref(m * n, 0.0f);
- aligned_vector<int32_t> Cint32_fb(m * n, 0);
-
- randFill(Afp32, 0, 255);
- for (auto i = 0; i < Afp32.size(); i++) {
- Aint8[i] = (uint8_t)Afp32[i];
- }
-
- // initialize B matrix
- randFill(Bfp32, -128, 127);
- avoidOverflow(m, n, k, Aint8.data(), Bfp32.data());
-
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bint8[i] = (int8_t)Bfp32[i];
- }
-
- // To test lda != k , we just reduce k by half and use the original k
- // as lda.
- int m_adjusted = m;
- int n_adjusted = n;
- int k_adjusted = k;
- if (test_ld) {
- assert(
- atrans == matrix_op_t::NoTranspose && "This case is not handled yet");
- k_adjusted = std::max(k / 2, 1);
- if (btrans == matrix_op_t::NoTranspose) {
- n_adjusted = std::max(n / 2, 1);
- }
- }
-
- if (btrans == matrix_op_t::Transpose) {
- transpose_matrix(Bint8.data(), k, n);
- }
-
- matmul_fp_ref(
- m,
- n_adjusted,
- k_adjusted,
- k,
- n,
- n,
- Afp32.data(),
- Bfp32.data(),
- Cfp32_ref.data());
-
- DoNothing<int32_t, int32_t> doNothingObj{};
- memCopy<> outputProcObj(doNothingObj);
- // A zero point and row offset not required
- PackAMatrix<uint8_t> packAN(
- matrix_op_t::NoTranspose, m, k_adjusted, Aint8.data(), k);
-
- // B zero point defaults to 0
- PackBMatrix<int8_t> packedBN(
- btrans,
- k_adjusted,
- n_adjusted,
- Bint8.data(),
- (btrans == matrix_op_t::Transpose) ? k : n);
-
- fbgemmPacked(
- packAN,
- packedBN,
- Cint32_fb.data(),
- Cint32_fb.data(),
- n,
- outputProcObj,
- 0,
- 1);
-
- // correctness check
- for (int i = 0; i < m_adjusted; ++i) {
- for (int j = 0; j < n_adjusted; ++j) {
- float expected = Cfp32_ref[i * n + j];
- int32_t actual = Cint32_fb[i * n + j];
- EXPECT_EQ(expected, actual)
- << "GEMM results differ at (" << i << ", " << j << "). ref "
- << expected << " FBGemm " << actual;
+ for (int groups : {1, 3, 4}) {
+ int m = shape[0];
+ int n = shape[1];
+ int k = shape[2];
+ if (k % groups != 0) {
+ continue;
}
- }
- }
-}
+ int k_per_group = k / groups;
-/**
- * @brief Unit test for unt8 matrix A, int8 matrix B, and 32-bit
- * accumulation. Output processing: requantization with bias -> nothing.
- * Asymmetric: the zero point is not 0.
- */
-TEST_P(fbgemmu8s8acc32test, TestAsymmetricQuantizedWithBias) {
- vector<vector<int>> shapes(GetShapes_());
- matrix_op_t atrans, btrans;
- bool test_ld;
- tie(atrans, btrans, test_ld) = GetParam();
+ aligned_vector<float> Afp32(m * k, 0.0f);
+ aligned_vector<uint8_t> Aint8(Afp32.size(), 0);
- for (auto shape : shapes) {
- int m = shape[0];
- int n = shape[1];
- int k = shape[2];
+ aligned_vector<float> Bfp32(k * n, 0.0f);
+ aligned_vector<int8_t> Bint8(Bfp32.size(), 0);
- aligned_vector<uint8_t> Aint8(m * k, 0);
- aligned_vector<uint8_t> Aint8_ref(m * k, 0);
+ aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f);
+ aligned_vector<int32_t> Cint32_fb(Cfp32_ref.size(), 0);
- aligned_vector<int8_t> Bint8(k * n, 0);
- aligned_vector<int8_t> Bint8_ref(k * n, 0);
+ randFill(Afp32, 0, 255);
+ for (auto i = 0; i < Afp32.size(); i++) {
+ Aint8[i] = (uint8_t)Afp32[i];
+ }
- aligned_vector<int32_t> Cint32_fb(m * n, 0);
- aligned_vector<int32_t> Cint32_ref(m * n, 0);
+ // initialize B matrix
+ randFill(Bfp32, -128, 127);
+ for (int g = 0; g < groups; ++g) {
+ avoidOverflow(
+ m,
+ n,
+ k_per_group,
+ Aint8.data() + g * k_per_group,
+ k,
+ Bfp32.data() + g * k_per_group * n,
+ n);
+ }
- aligned_vector<uint8_t> Cint8_fb(m * n, 0);
- aligned_vector<uint8_t> Cint8_ref(m * n, 0);
+ for (auto i = 0; i < Bfp32.size(); ++i) {
+ Bint8[i] = (int8_t)Bfp32[i];
+ }
- int n_adjusted = n;
- int k_adjusted = k;
+      // To test ld != n, we just reduce n by half and keep the original n
+      // as the leading dimension.
+ int n_adjusted = n;
+ if (test_ld) {
+ assert(
+ atrans == matrix_op_t::NoTranspose &&
+ "This case is not handled yet");
+ if (btrans == matrix_op_t::NoTranspose) {
+ n_adjusted = std::max(n / 2, 1);
+ }
+ }
+
+ if (btrans == matrix_op_t::Transpose) {
+ aligned_vector<int8_t> Bint8_temp(Bint8.size());
+ for (int g = 0; g < groups; ++g) {
+ transpose_matrix(
+ k_per_group,
+ n,
+ Bint8.data() + g * k_per_group * n,
+ n,
+ Bint8_temp.data() + g * k_per_group,
+ groups * k_per_group);
+ }
+ Bint8 = Bint8_temp;
+ }
+
+ for (int g = 0; g < groups; ++g) {
+ matmul_fp_ref(
+ m,
+ n_adjusted,
+ k_per_group,
+ k,
+ n,
+ groups * n,
+ Afp32.data() + g * k_per_group,
+ Bfp32.data() + g * k_per_group * n,
+ Cfp32_ref.data() + g * n_adjusted);
+ }
- if (test_ld) {
- assert(
- atrans == matrix_op_t::NoTranspose && "This case is not handled yet");
- k_adjusted = std::max(k / 2, 1);
- if (btrans == matrix_op_t::NoTranspose) {
- n_adjusted = std::max(n / 2, 1);
+ DoNothing<int32_t, int32_t> doNothingObj{};
+ memCopy<> outputProcObj(doNothingObj);
+ // A zero point and row offset not required
+ PackAMatrix<uint8_t> packAN(
+ matrix_op_t::NoTranspose, m, k, Aint8.data(), k, nullptr, groups);
+
+ // B zero point defaults to 0
+ PackBMatrix<int8_t> packedBN(
+ btrans,
+ k,
+ n_adjusted,
+ Bint8.data(),
+ (btrans == matrix_op_t::Transpose) ? k : n,
+ nullptr,
+ groups);
+
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint32_fb.data(),
+ Cint32_fb.data(),
+ groups * n,
+ outputProcObj,
+ 0,
+ 1);
+
+ // correctness check
+ for (int i = 0; i < m; ++i) {
+ for (int j = 0; j < groups * n_adjusted; ++j) {
+ float expected = Cfp32_ref[i * groups * n + j];
+ int32_t actual = Cint32_fb[i * groups * n + j];
+ EXPECT_EQ(expected, actual)
+ << "GEMM results differ at (" << i << ", " << j << "). ref "
+ << expected << " FBGemm " << actual;
+ }
}
- }
-
- // A and B have scale 1, so exactly represented after quantization
- randFill(Aint8, 0, 255);
- randFill(Bint8, -128, 127);
- avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
-
- for (auto i = 0; i < Bint8.size(); ++i) {
- Bint8_ref[i] = Bint8[i];
- }
-
- for (auto i = 0; i < Aint8.size(); ++i) {
- Aint8_ref[i] = Aint8[i];
- }
-
- int32_t Aint8_zero_point = 55;
- int32_t Bint8_zero_point = -17;
-
- // initialize bias
- aligned_vector<int32_t> bias_int32(n);
- randFill(bias_int32, -128, 127);
-
- if (btrans == matrix_op_t::Transpose) {
- transpose_matrix(Bint8.data(), k, n);
- }
-
- // computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n_adjusted);
- col_offsets_with_zero_pt_s8acc32_ref(
- k_adjusted,
- n_adjusted,
- n,
- Bint8_ref.data(),
- Bint8_zero_point,
- col_offsets.data());
-
- matmul_u8i8acc32_ref(
- m,
- n_adjusted,
- k_adjusted,
- k,
- n,
- n,
- Aint8.data(),
- Bint8_ref.data(),
- Cint32_ref.data());
-
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
-
- row_offsets_u8acc32_ref(
- m, k_adjusted, k, Aint8_ref.data(), row_offsets.data());
-
- float C_multiplier = 0.1234;
- int32_t C_zero_pt = 5;
-
- requantize_u8acc32_ref(
- m,
- n_adjusted,
- n,
- Cint32_ref.data(),
- Cint8_ref.data(),
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- row_offsets.data(),
- col_offsets.data(),
- bias_int32.data());
-
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
-
- PackAWithRowOffset<uint8_t> packAN(
- matrix_op_t::NoTranspose,
- m,
- k_adjusted,
- Aint8.data(),
- k,
- nullptr,
- 1,
- Aint8_zero_point,
- row_offset_buf.data());
-
- PackBMatrix<int8_t> packedBN(
- btrans,
- k_adjusted,
- n_adjusted,
- Bint8.data(),
- (btrans == matrix_op_t::Transpose) ? k : n,
- nullptr,
- 1,
- Bint8_zero_point);
-
- DoNothing<> doNothingObj{};
- ReQuantizeOutput<false> outputProcObj(
- doNothingObj,
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- bias_int32.data());
-
- fbgemmPacked(
- packAN,
- packedBN,
- Cint8_fb.data(),
- Cint32_fb.data(),
- n,
- outputProcObj,
- 0,
- 1);
-
- compare_validate_buffers(
- Cint8_fb.data(), Cint8_ref.data(), m, n, n, static_cast<uint8_t>(0));
- }
+ } // for each groups
+ } // for each shape
}
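
In the symmetric test both zero points are zero, so every correction term in the requantization sketch above vanishes and the raw int32 accumulator is already the exact answer; that is why PackAMatrix (no row-offset buffer) plus memCopy suffices, and why the int32 output can be compared bit-exactly against the float reference as long as avoidOverflow keeps the products in range. A tiny self-contained illustration:

#include <cassert>
#include <cstdint>

int main() {
  // With A_zp == 0 and B_zp == 0, quantized and float dot products agree
  // exactly whenever the inputs are integral and nothing overflows.
  uint8_t a[2] = {3, 200};
  int8_t b[2] = {-7, 5};
  int32_t acc = 0;
  float accf = 0.0f;
  for (int k = 0; k < 2; ++k) {
    acc += static_cast<int32_t>(a[k]) * static_cast<int32_t>(b[k]);
    accf += static_cast<float>(a[k]) * static_cast<float>(b[k]);
  }
  assert(static_cast<float>(acc) == accf); // 979 == 979.0f
  return 0;
}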
diff --git a/test/QuantizationHelpers.cc b/test/QuantizationHelpers.cc
index eab08de..cb3edd3 100644
--- a/test/QuantizationHelpers.cc
+++ b/test/QuantizationHelpers.cc
@@ -17,12 +17,19 @@ namespace fbgemm {
* @brief Make sure we won't have overflows from vpmaddubsw instruction.
*/
template <typename T>
-void avoidOverflow(int m, int n, int k, const uint8_t* Aint8, T* B) {
+void avoidOverflow(
+ int m,
+ int n,
+ int k,
+ const uint8_t* Aint8,
+ int lda,
+ T* B,
+ int ldb) {
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
for (int kk = 0; kk < k / 2 * 2; kk += 2) {
- int a0 = Aint8[i * k + kk], a1 = Aint8[i * k + kk + 1];
- int b0 = B[kk * n + j], b1 = B[(kk + 1) * n + j];
+ int a0 = Aint8[i * lda + kk], a1 = Aint8[i * lda + kk + 1];
+ int b0 = B[kk * ldb + j], b1 = B[(kk + 1) * ldb + j];
int sum_pair = a0 * b0 + a1 * b1;
if (sum_pair < numeric_limits<int16_t>::lowest()) {
int b1_adjusted =
@@ -43,13 +50,23 @@ void avoidOverflow(int m, int n, int k, const uint8_t* Aint8, T* B) {
assert(
new_sum_pair >= numeric_limits<int16_t>::lowest() &&
new_sum_pair <= numeric_limits<int16_t>::max());
- B[(kk + 1) * n + j] = b1_adjusted;
+ B[(kk + 1) * ldb + j] = b1_adjusted;
}
}
} // for each j
} // for each i
}
+template <typename T>
+void avoidOverflow(
+ int m,
+ int n,
+ int k,
+ const uint8_t* Aint8,
+ T* B) {
+ return avoidOverflow(m, n, k, Aint8, k, B, n);
+}
+
template void
avoidOverflow(int m, int n, int k, const uint8_t* Aint8, int8_t* B);
template void
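
avoidOverflow exists because the AVX2 kernels accumulate adjacent uint8 x int8 product pairs with vpmaddubsw, which saturates each int16 lane; when a pair would overflow, the helper nudges the second B value until it fits. A sketch of the per-pair check it guards against (illustrative names, not the library's internals):

#include <cstdint>
#include <limits>

// vpmaddubsw sums two adjacent u8*s8 products into one int16 lane.
// Worst case 255*127 + 255*127 = 64770, well outside [-32768, 32767],
// so unchecked inputs would saturate and corrupt the accumulation.
bool pair_fits_int16(uint8_t a0, uint8_t a1, int8_t b0, int8_t b1) {
  int sum_pair = static_cast<int>(a0) * b0 + static_cast<int>(a1) * b1;
  return sum_pair >= std::numeric_limits<int16_t>::lowest() &&
         sum_pair <= std::numeric_limits<int16_t>::max();
}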
diff --git a/test/QuantizationHelpers.h b/test/QuantizationHelpers.h
index 42c3e08..d645a4e 100644
--- a/test/QuantizationHelpers.h
+++ b/test/QuantizationHelpers.h
@@ -13,6 +13,16 @@ namespace fbgemm {
* @brief Make sure we won't have overflows from vpmaddubsw instruction.
*/
template <typename T>
+void avoidOverflow(
+ int m,
+ int n,
+ int k,
+ const uint8_t* Aint8,
+ int lda,
+ T* B,
+ int ldb);
+
+template <typename T>
void avoidOverflow(int m, int n, int k, const uint8_t* Aint8, T* B);
} // namespace fbgemm
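
The new lda/ldb overload declared above is what lets the grouped tests sanitize each group's column slice of A against its row slice of B in place. A usage sketch mirroring those calls (the wrapper name here is mine, not part of the library):

#include <cstdint>
#include <vector>
#include "QuantizationHelpers.h" // fbgemm::avoidOverflow

// Group g checks the k_per_group columns of A starting at g * k_per_group
// (row stride lda = k) against the k_per_group rows of B starting at
// g * k_per_group (row stride ldb = n).
void avoid_overflow_per_group(
    int m, int n, int k, int groups,
    std::vector<uint8_t>& Aint8, std::vector<int8_t>& Bint8) {
  int k_per_group = k / groups;
  for (int g = 0; g < groups; ++g) {
    fbgemm::avoidOverflow(
        m, n, k_per_group,
        Aint8.data() + g * k_per_group, /*lda=*/k,
        Bint8.data() + g * k_per_group * n, /*ldb=*/n);
  }
}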
diff --git a/test/TestUtils.cc b/test/TestUtils.cc
index 5cc14ef..3200246 100644
--- a/test/TestUtils.cc
+++ b/test/TestUtils.cc
@@ -79,18 +79,25 @@ template bool
check_all_zero_entries<uint8_t>(const uint8_t* test, int m, int n);
template <typename T>
-void transpose_matrix(T* ref, int n, int k) {
- aligned_vector<T> local(n * k, 0);
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < k; ++j) {
- local[j * n + i] = ref[i * k + j];
+void transpose_matrix(
+ int M,
+ int N,
+ const T* src,
+ int ld_src,
+ T* dst,
+ int ld_dst) {
+ for (int i = 0; i < N; ++i) {
+ for (int j = 0; j < M; ++j) {
+ dst[i * ld_dst + j] = src[i + j * ld_src];
}
- }
- for (int i = 0; i < k; ++i) {
- for (int j = 0; j < n; ++j) {
- ref[i * n + j] = local[i * n + j];
- }
- }
+ } // for each output row
+}
+
+template <typename T>
+void transpose_matrix(T* ref, int n, int k) {
+ std::vector<T> local(n * k);
+ transpose_matrix(n, k, ref, k, local.data(), n);
+ memcpy(ref, local.data(), n * k * sizeof(T));
}
template void transpose_matrix<float>(float* ref, int n, int k);
diff --git a/test/TestUtils.h b/test/TestUtils.h
index 6cc365f..2cb7b88 100644
--- a/test/TestUtils.h
+++ b/test/TestUtils.h
@@ -32,9 +32,24 @@ bool check_all_zero_entries(const T* test, int m, int n);
/*
* @brief In-place transposition for nxk matrix ref.
- * @params n number of rows in output (number of columns in input)
- * @params k number of columns in output (number of rows in input)
+ * @params n number of rows in input (number of columns in output)
+ * @params k number of columns in input (number of rows in output)
*/
template <typename T>
void transpose_matrix(T* ref, int n, int k);
+
+/*
+ * @brief Out-of-place transposition for an M*N matrix src.
+ * @params M number of rows in input
+ * @params N number of columns in input
+ */
+template <typename T>
+void transpose_matrix(
+ int M,
+ int N,
+ const T* src,
+ int ld_src,
+ T* dst,
+ int ld_dst);
+
} // namespace fbgemm
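
For reference, a minimal use of the out-of-place overload declared above (M = 2 rows, N = 3 columns), assuming the int8_t instantiation the tests rely on is available:

#include <cstdint>
#include <vector>
#include "TestUtils.h" // fbgemm::transpose_matrix

void transpose_example() {
  std::vector<int8_t> src = {1, 2, 3,
                             4, 5, 6};   // 2x3, ld_src = 3
  std::vector<int8_t> dst(6, 0);         // 3x2, ld_dst = 2
  fbgemm::transpose_matrix(2, 3, src.data(), 3, dst.data(), 2);
  // dst is now {1, 4, 2, 5, 3, 6}, i.e. the transpose laid out row-major.
}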