-rw-r--r-- | bench/Im2ColFusedRequantizeAcc16Benchmark.cc |   6
-rw-r--r-- | bench/Im2ColFusedRequantizeAcc32Benchmark.cc |   6
-rw-r--r-- | include/fbgemm/Fbgemm.h                      |  53
-rw-r--r-- | include/fbgemm/OutputProcessing-inl.h        |   5
-rw-r--r-- | src/ExecuteKernelU8S8.cc                     |  55
-rw-r--r-- | src/Fbgemm.cc                                |  73
-rw-r--r-- | src/PackAMatrix.cc                           |  21
-rw-r--r-- | src/PackAWithIm2Col.cc                       |   6
-rw-r--r-- | src/PackBMatrix.cc                           |  70
-rw-r--r-- | src/PackMatrix.cc                            |   3
-rw-r--r-- | src/PackWithQuantRowOffset.cc                |  15
-rw-r--r-- | src/PackWithRowOffset.cc                     |  15
-rw-r--r-- | src/RefImplementations.cc                    |  34
-rw-r--r-- | src/RefImplementations.h                     |   7
-rw-r--r-- | test/Im2ColFusedRequantizeTest.cc            |  24
-rw-r--r-- | test/PackedRequantizeAcc16Test.cc            | 957
-rw-r--r-- | test/PackedRequantizeTest.cc                 | 948
-rw-r--r-- | test/QuantizationHelpers.cc                  |  25
-rw-r--r-- | test/QuantizationHelpers.h                   |  10
-rw-r--r-- | test/TestUtils.cc                            |  29
-rw-r--r-- | test/TestUtils.h                             |  19
21 files changed, 1298 insertions(+), 1083 deletions(-)
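This patch threads a new `groups` parameter through packing, kernel execution, and the reference implementations, turning a single GEMM into `groups` independent GEMMs over column slices of A and row blocks of B (grouped-convolution style). A minimal usage sketch, distilled from the updated tests below; the sizes, zero points, and the zero-filled col_offsets are illustrative assumptions, not part of the patch:

#include <cstdint>
#include <vector>
#include "fbgemm/Fbgemm.h"

using namespace fbgemm;

void grouped_gemm_sketch() {
  // groups must divide k; B stores the per-group (k/groups) x n blocks
  // stacked along its rows, and C is m x (groups * n).
  int m = 64, n = 32, k = 48, groups = 3;

  std::vector<std::uint8_t> Aint8(m * k, 1);
  std::vector<std::int8_t> Bint8(k * n, 1);
  std::vector<std::uint8_t> Cint8(m * groups * n, 0);
  std::vector<std::int32_t> C_buffer(m * groups * n, 0);

  std::int32_t Aint8_zero_point = 43, Bint8_zero_point = -30; // illustrative
  float C_multiplier = 0.1234f;
  std::int32_t C_zero_pt = 5;
  // Real code computes per-group column offsets (see the tests below);
  // zeros keep this sketch self-contained.
  std::vector<std::int32_t> col_offsets(groups * n, 0);

  std::vector<std::int32_t> row_offset_buf(
      PackAWithRowOffset<std::uint8_t>::rowOffsetBufferSize());

  // groups is now passed to both packing classes and must match.
  PackAWithRowOffset<std::uint8_t> packA(
      matrix_op_t::NoTranspose, m, k, Aint8.data(), k,
      nullptr, groups, Aint8_zero_point, row_offset_buf.data());
  PackBMatrix<std::int8_t> packB(
      matrix_op_t::NoTranspose, k, n, Bint8.data(), n,
      nullptr, groups, Bint8_zero_point);

  DoNothing<> doNothing{};
  ReQuantizeOutput<false> outputProc(
      doNothing, C_multiplier, C_zero_pt, Aint8_zero_point,
      Bint8_zero_point, packA.getRowOffsetBuffer(), col_offsets.data(),
      /*bias=*/nullptr);

  // ldc is groups * n: group g writes columns [g*n, (g+1)*n) of C.
  fbgemmPacked(packA, packB, Cint8.data(), C_buffer.data(),
               groups * n, outputProc, /*thread_id=*/0, /*num_threads=*/1);
}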
diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc index 48b744a..8827b4c 100644 --- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc +++ b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc @@ -189,7 +189,11 @@ void performance_test() { PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, int16_t> packA( - conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); + conv_p, + Aint8.data(), + nullptr, + Aint8_zero_point, + row_offset_buf.data()); PackBMatrix<int8_t, int16_t> packedB( matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc index 9201e52..b87f7d7 100644 --- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc +++ b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc @@ -191,7 +191,11 @@ void performance_test() { PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, int32_t> packA( - conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); + conv_p, + Aint8.data(), + nullptr, + Aint8_zero_point, + row_offset_buf.data()); PackBMatrix<int8_t, int32_t> packedB( matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index bd1c86e..d05e099 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -20,6 +20,10 @@ #include "Types.h" #include "Utils.h" +// Turning on this option will print out time breakdown of each stage (e.g., +// input packing, the main GEMM kernel, each output processing pipeline). +// Please note that currently this option won't report accurate timing if +// multiple threads are used. // #define FBGEMM_MEASURE_TIME_BREAKDOWN #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN @@ -80,13 +84,22 @@ class PackMatrix { * overhead of internal memory allocation everytime a PackMatrix * is constructed. The client code can query how big patm should * be with packedBufferSize function. + * @param groups when groups > 1, we compute groups number of GEMMs each + * multiplies A.rows by A.cols/A.groups matrix with + * B.rows/B.groups by B.cols matrix (in conventional BLAS + * terminology, this is a batched GEMM but we use the name group + * to follow deep learning terminology). The result matrix has + * dimension A.rows by B.cols*B.groups . + * A.groups must be same as B.groups, A.groups must divide + * A.cols, and B.groups must divide B.rows and C.cols. * @param zero_pt the quantized value that maps to 0.0f floating-point number. */ PackMatrix( std::int32_t rows, std::int32_t cols, inpType* pmat, - std::int32_t zero_pt); + int groups = 1, + std::int32_t zero_pt = 0); /** * @return true usually when the matrix is constant matrix (e.g., weight @@ -228,6 +241,10 @@ class PackMatrix { return last_bcol_; } + int numGroups() const { + return G_; + } + /** * @return True if the last column block has fewer columns than the block * size. 
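To make the grouped dimension convention documented above concrete, a worked example with illustrative shapes (not taken from the patch):

  groups = 4
  A: 128 x 256  ->  per-group slice of A: 128 x (256/4) = 128 x 64
  B: 256 x 96   ->  per-group block of B: (256/4) x 96  =  64 x 96
  C: 128 x (96*4) = 128 x 384, i.e. A.rows by B.cols*B.groups,
     with group g occupying columns [g*96, (g+1)*96) of C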
@@ -275,6 +292,7 @@ class PackMatrix { private: std::int32_t nrows_, ncols_; + int G_; std::int32_t zero_pt_; block_type_t packedBlock_; ///< The block in the source matrix just packed std::int32_t last_brow_, last_bcol_; @@ -295,9 +313,6 @@ class PackAMatrix final : public PackMatrix<PackAMatrix<T, accT>, T, accT> { PackAMatrix() = delete; // no default constructor - /** - * TODO: currently only groups == 1 supported - */ PackAMatrix( matrix_op_t trans, std::int32_t nRow, @@ -305,7 +320,7 @@ class PackAMatrix final : public PackMatrix<PackAMatrix<T, accT>, T, accT> { const inpType* smat, std::int32_t ld, inpType* pmat = nullptr, - std::int32_t groups = 1, + int groups = 1, std::int32_t zero_pt = 0); /** @@ -351,7 +366,6 @@ class PackAMatrix final : public PackMatrix<PackAMatrix<T, accT>, T, accT> { matrix_op_t trans_; const T* smat_; std::int32_t ld_; - std::int32_t G_; std::int32_t row_interleave_B_; }; @@ -370,9 +384,6 @@ class PackBMatrix final : public PackMatrix<PackBMatrix<T, accT>, T, accT> { PackBMatrix() = delete; // no default constructor - /** - * TODO: Currently only groups == 1 supported. - */ PackBMatrix( matrix_op_t trans, std::int32_t nRow, @@ -380,7 +391,7 @@ class PackBMatrix final : public PackMatrix<PackBMatrix<T, accT>, T, accT> { const inpType* smat, std::int32_t ld, inpType* pmat = nullptr, - std::int32_t groups = 1, + int groups = 1, std::int32_t zero_pt = 0); /** @@ -437,7 +448,6 @@ class PackBMatrix final : public PackMatrix<PackBMatrix<T, accT>, T, accT> { matrix_op_t trans_; const T* smat_; std::int32_t ld_; - std::int32_t G_; std::int32_t row_interleave_; }; @@ -533,9 +543,6 @@ class PackAWithRowOffset final using accType = accT; PackAWithRowOffset() = delete; // no default constructor - /** - * TODO: Currently only groups == 1 supported - */ PackAWithRowOffset( matrix_op_t trans, std::uint32_t nRow, @@ -543,7 +550,7 @@ class PackAWithRowOffset final const T* smat, std::uint32_t ld, inpType* pmat = nullptr, - std::uint32_t groups = 1, + int groups = 1, std::int32_t zero_pt = 0, std::int32_t* row_offset = nullptr); @@ -600,7 +607,6 @@ class PackAWithRowOffset final matrix_op_t trans_; const T* smat_; std::uint32_t ld_; - std::uint32_t G_; std::int32_t* row_offset_; bool rowOffsetAllocatedHere; std::int32_t row_interleave_B_; @@ -621,9 +627,6 @@ class PackAWithQuantRowOffset final using accType = accT; PackAWithQuantRowOffset() = delete; // no default constructor - /** - * TODO: Currently only groups == 1 supported - */ PackAWithQuantRowOffset( matrix_op_t trans, std::int32_t nRow, @@ -633,7 +636,7 @@ class PackAWithQuantRowOffset final inpType* pmat = nullptr, float scale = 1.0f, std::int32_t zero_pt = 0, - std::int32_t groups = 1, + int groups = 1, std::int32_t* row_offset = nullptr); /** @@ -690,7 +693,6 @@ class PackAWithQuantRowOffset final const float* smat_; std::int32_t ld_; float scale_; - std::int32_t G_; std::int32_t* row_offset_; bool rowOffsetAllocatedHere; std::int32_t row_interleave_B_; @@ -811,7 +813,8 @@ class ReluOutput { * processing pipeline. * * SPMDM (SParse Matrix times Dense Matrix) inplace on the 32-bit input buffer - * (inp). After modifying the input buffer, pass it to the next op + * (inp). After modifying the input buffer, pass it to the next op. + * When groups > 1, each group is numRows() x (numCols()/groups) matrix. 
*/ template < typename outT = std::int32_t, @@ -825,8 +828,9 @@ class DoSpmdmOnInpBuffer { nextOPType& nextop, const std::uint8_t* A, int lda, - const CompressedSparseColumn& B_csc) - : nextop_(nextop), A_(A), lda_(lda), B_csc_(B_csc) {} + const CompressedSparseColumn& B_csc, + int groups = 1) + : nextop_(nextop), A_(A), lda_(lda), B_csc_(B_csc), groups_(groups) {} template <inst_set_t instSet> inline int f( @@ -841,6 +845,7 @@ class DoSpmdmOnInpBuffer { const std::uint8_t* A_; const int lda_; const CompressedSparseColumn& B_csc_; + const int groups_; }; /** diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h index 13f614a..71d8046 100644 --- a/include/fbgemm/OutputProcessing-inl.h +++ b/include/fbgemm/OutputProcessing-inl.h @@ -28,7 +28,10 @@ template <typename outT, typename inT, typename nextOPType> template<inst_set_t instSet> inline int DoSpmdmOnInpBuffer<outT, inT, nextOPType>::f(outT* out, inT* inp, const block_type_t& block, int ld_out, int ld_in) const { - B_csc_.SpMDM(block, A_, lda_, true, inp, ld_in); + assert(B_csc_.NumOfCols() % groups_ == 0); + int n_per_group = B_csc_.NumOfCols() / groups_; + int g = block.col_start / n_per_group; + B_csc_.SpMDM(block, A_ + g * B_csc_.NumOfRows(), lda_, true, inp, ld_in); return nextop_.template f<instSet>(out, inp, block, ld_out, ld_in); } diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index b3f8c15..c2079b1 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -8,7 +8,6 @@ #include <cpuinfo.h> #include <chrono> - #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN double kernel_time = 0.0; double postprocessing_time = 0.0; @@ -84,8 +83,10 @@ void ExecuteKernel< int32_t packed_rows_A = packedA_.numPackedRows(); int32_t row_start_A = packedA_.packedRowStart(); - bool lastKBlock = packedB_.isThisLastKBlock(kBlock); - bool accum = kBlock > 0; + int group = kBlock / packedB_.blockRows(); + int NDim = packedB_.numCols(); + bool lastKBlock = packedB_.isThisLastKBlock(kBlock % packedB_.blockRows()); + bool accum = (kBlock % packedB_.blockRows()) > 0; typename BaseType::jit_micro_kernel_fp fn; @@ -120,7 +121,6 @@ void ExecuteKernel< #endif for (int jb = 0; jb < bColBlocks; ++jb) { - bBuf = packedB_.getBuf(jb, kBlock); // prefetch addr of the next packed block of B matrix bBuf_pf = packedB_.getBuf(jb == bColBlocks - 1 ? jb : jb + 1, kBlock); @@ -128,12 +128,14 @@ void ExecuteKernel< // Reuse the first rowblock of C_buffer_ unless when C_buffer_ is same as // matC_ (inplace output processing) int32_t* C_buffer_row_start = C_buffer_ + - ((C_buffer_ == reinterpret_cast<int32_t*>(matC_)) ? row_start_A * ldc_ - : 0); + ((C_buffer_ == reinterpret_cast<int32_t*>(matC_)) + ? row_start_A * ldc_ + NDim * group + : 0); int32_t* C_buffer_start = C_buffer_row_start + jb * nbSize_; int32_t leadingDim = ldc_; if (packedB_.isThereColRemainder() && (jb == bColBlocks - 1)) { - // In case we will access memory past C_buffer_, we use C_tile_ instead. + // In case we will access memory past C_buffer_, we use C_tile_ scratchpad + // instead. 
C_buffer_start = C_tile_; leadingDim = nbSize_; } @@ -146,14 +148,15 @@ void ExecuteKernel< leadingDim); #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) - .count(); - kernel_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); + t_end = std::chrono::high_resolution_clock::now(); + dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) + .count(); + kernel_time += (dt); + t_start = std::chrono::high_resolution_clock::now(); #endif - // Output processing is done only once per rowblock + // Output processing is done only once per rowblock to amortize overhead + // and for better spatial locality. if (lastKBlock && jb == bColBlocks - 1) { // When C_tile_ is used for the last column block, we need a separate // handling for the last column block. @@ -166,14 +169,14 @@ void ExecuteKernel< outputProcess_.template f<inst_set_t::avx2>( matC_, C_buffer_row_start, - {row_start_A, packed_rows_A, 0, nSize}, + {row_start_A, packed_rows_A, NDim * group, nSize}, ldc_, ldc_); } else if (cpuinfo_has_x86_avx2()) { outputProcess_.template f<inst_set_t::avx2>( matC_, C_buffer_row_start, - {row_start_A, packed_rows_A, 0, nSize}, + {row_start_A, packed_rows_A, NDim * group, nSize}, ldc_, ldc_); } else { @@ -183,20 +186,28 @@ void ExecuteKernel< } if (C_buffer_start == C_tile_) { + // When C_tile_ scratchpad was used to avoid accessing memory past + // C_buffer_ . if (cpuinfo_has_x86_avx512f()) { // TODO: avx512 path // Currently use avx2 code outputProcess_.template f<inst_set_t::avx2>( matC_, C_tile_, - {row_start_A, packed_rows_A, jb * nbSize_, packedB_.lastBcol()}, + {row_start_A, + packed_rows_A, + NDim * group + jb * nbSize_, + packedB_.lastBcol()}, ldc_, leadingDim); } else if (cpuinfo_has_x86_avx2()) { outputProcess_.template f<inst_set_t::avx2>( matC_, C_tile_, - {row_start_A, packed_rows_A, jb * nbSize_, packedB_.lastBcol()}, + {row_start_A, + packed_rows_A, + NDim * group + jb * nbSize_, + packedB_.lastBcol()}, ldc_, leadingDim); } else { @@ -207,11 +218,11 @@ void ExecuteKernel< } // output processing #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) - .count(); - postprocessing_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); + t_end = std::chrono::high_resolution_clock::now(); + dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) + .count(); + postprocessing_time += (dt); + t_start = std::chrono::high_resolution_clock::now(); #endif } // for each j block diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index f8f0d34..8376077 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -76,23 +76,31 @@ void fbgemmPacked( throw std::runtime_error("Failed to initialize cpuinfo!"); } + if (!packB.isPrePacked()) { + throw std::runtime_error("B matrix must be prepacked"); + } + if (packA.numGroups() != packB.numGroups()) { + throw std::runtime_error( + "A.groups = " + std::to_string(packA.numGroups()) + " and B.groups = " + + std::to_string(packB.numGroups()) + " are not the same"); + } + int MDim = packA.numRows(); int KDim = packB.numRows(); + int KDimPerGroup = packB.numRows() / packB.numGroups(); + int NDim = packB.numCols(); int mBlocks = (MDim + MCB - 1) / MCB; - int kBlocks = (KDim + KCB - 1) / KCB; + int kBlocks = (KDimPerGroup + KCB - 1) / KCB; // remainders int _mc = MDim % MCB; - int _kc = KDim % KCB; 
+ int _kc = KDimPerGroup % KCB; int kc, mc; block_type_t blockA{0, 0, 0, 0}; - // B must be prepacked - assert(packB.isPrePacked() && "B matrix must be prepacked"); - #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN std::chrono::time_point<std::chrono::high_resolution_clock> t_very_start, t_start, t_end; @@ -101,37 +109,48 @@ void fbgemmPacked( t_very_start = std::chrono::high_resolution_clock::now(); #endif - ExecuteKernel<packingAMatrix, packingBMatrix, cT, processOutputType> - exeKernelObj(packA, packB, 0, C, C_buffer, ldc, outProcess); // ToDo: thread based work division - for (int i = 0; i < mBlocks; ++i) { - mc = (i != mBlocks - 1 || _mc == 0) ? MCB : _mc; - for (int k = 0; k < kBlocks; ++k) { - kc = (k != kBlocks - 1 || _kc == 0) ? KCB : _kc; - // pack A matrix - blockA = {i * MCB, mc, k * KCB, kc}; - - packA.pack(blockA); + for (int g = 0; g < packA.numGroups(); ++g) { + ExecuteKernel<packingAMatrix, packingBMatrix, cT, processOutputType> + exeKernelObj( + packA, + packB, + 0, + C, + C_buffer, + ldc, + outProcess); + for (int i = 0; i < mBlocks; ++i) { + mc = (i != mBlocks - 1 || _mc == 0) ? MCB : _mc; + for (int k = 0; k < kBlocks; ++k) { + kc = (k != kBlocks - 1 || _kc == 0) ? KCB : _kc; + // pack A matrix + blockA = {i * MCB, mc, g * KDimPerGroup + k * KCB, kc}; + + packA.pack(blockA); #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) - .count(); - packing_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); + t_end = std::chrono::high_resolution_clock::now(); + dt = std::chrono::duration_cast<std::chrono::nanoseconds>( + t_end - t_start) + .count(); + packing_time += (dt); + t_start = std::chrono::high_resolution_clock::now(); #endif - exeKernelObj.execute(k); + exeKernelObj.execute(g * kBlocks + k); #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - t_end = std::chrono::high_resolution_clock::now(); - dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) - .count(); - computing_time += (dt); - t_start = std::chrono::high_resolution_clock::now(); + t_end = std::chrono::high_resolution_clock::now(); + dt = std::chrono::duration_cast<std::chrono::nanoseconds>( + t_end - t_start) + .count(); + computing_time += (dt); + t_start = std::chrono::high_resolution_clock::now(); #endif + } } - } + } // for each group #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN t_end = std::chrono::high_resolution_clock::now(); diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc index cd991ca..988a27b 100644 --- a/src/PackAMatrix.cc +++ b/src/PackAMatrix.cc @@ -20,15 +20,17 @@ PackAMatrix<T, accT>::PackAMatrix( const T* smat, int32_t ld, inpType* pmat, - int32_t groups, + int groups, std::int32_t zero_pt) - : PackMatrix<PackAMatrix<T, accT>, T, accT>(nRow, nCol, pmat, zero_pt), + : PackMatrix<PackAMatrix<T, accT>, T, accT>( + nRow, + nCol, + pmat, + groups, + zero_pt), trans_(trans), smat_(smat), - ld_(ld), - G_(groups) { - assert(G_ == 1 && "Groups != 1 not supported yet"); - + ld_(ld) { if (cpuinfo_has_x86_avx512f()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; @@ -43,6 +45,11 @@ PackAMatrix<T, accT>::PackAMatrix( // TODO: Have default slower path assert(0 && "unsupported architecure"); } + if (BaseType::numCols() % groups != 0) { + throw std::runtime_error( + "groups = " + std::to_string(groups) + + " does not divide numCols = " + std::to_string(BaseType::numCols())); + } if (!pmat) { 
BaseType::buf_ = (T*)fbgemmAlignedAlloc( 64, BaseType::brow_ * BaseType::bcol_ * sizeof(T)); @@ -63,7 +70,7 @@ void PackAMatrix<T, accT>::pack(const block_type_t& block) { if (tr) { for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { - T val = smat_[i + ld_ * j]; + T val = smat_[i + j * ld_]; out[addr(i, j) - addr(block.row_start, block.col_start)] = val; } // zero fill diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc index 71efced..7ca7059 100644 --- a/src/PackAWithIm2Col.cc +++ b/src/PackAWithIm2Col.cc @@ -36,6 +36,7 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col( std::multiplies<int>()) * conv_p.IC, pmat, + conv_p.G, zero_pt), conv_p_(conv_p), sdata_(sdata) { @@ -55,6 +56,11 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col( // TODO: Have default slower path assert(0 && "unsupported architecure"); } + if (BaseType::numCols() % conv_p.G != 0) { + throw std::runtime_error( + "groups = " + std::to_string(conv_p.G) + + " does not divide numCols = " + std::to_string(BaseType::numCols())); + } if (pmat) { BaseType::buf_ = pmat; } else { diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc index 485afb1..1bb7d4b 100644 --- a/src/PackBMatrix.cc +++ b/src/PackBMatrix.cc @@ -20,15 +20,17 @@ PackBMatrix<T, accT>::PackBMatrix( const T* smat, int32_t ld, inpType* pmat, - int32_t groups, + int groups, std::int32_t zero_pt) - : PackMatrix<PackBMatrix<T, accT>, T, accT>(nRow, nCol, pmat, zero_pt), + : PackMatrix<PackBMatrix<T, accT>, T, accT>( + nRow, + nCol, + pmat, + groups, + zero_pt), trans_(trans), smat_(smat), - ld_(ld), - G_(groups) { - assert(G_ == 1 && "Groups != 1 not supported yet"); - + ld_(ld) { if (cpuinfo_has_x86_avx512f()) { BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB; BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::NCB; @@ -42,14 +44,22 @@ PackBMatrix<T, accT>::PackBMatrix( // Error assert(0 && "unknown architecure"); } - block_type_t block{0, BaseType::numRows(), 0, BaseType::numCols()}; + if (BaseType::numRows() % groups != 0) { + throw std::runtime_error( + "groups = " + std::to_string(groups) + + " does not divide numRows = " + std::to_string(BaseType::numRows())); + } + + // blocking for one group + block_type_t block{ + 0, BaseType::numRows() / BaseType::numGroups(), 0, BaseType::numCols()}; BaseType::packedBlock(block); if (!pmat) { BaseType::bufAllocatedHere_ = true; BaseType::buf_ = (T*)fbgemmAlignedAlloc( 64, - BaseType::blockRows() * BaseType::brow_ * BaseType::blockCols() * - BaseType::bcol_ * sizeof(T)); + BaseType::numGroups() * BaseType::blockRows() * BaseType::brow_ * + BaseType::blockCols() * BaseType::bcol_ * sizeof(T)); } pack(block); } @@ -59,26 +69,28 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) { assert((BaseType::blockRowSize() % row_interleave_) == 0); BaseType::packedBlock(block); - T* out = BaseType::getBuf(); bool tr = (trans_ == matrix_op_t::Transpose); - for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { - for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { - T val = tr ? 
smat_[i + ld_ * j] : smat_[i * ld_ + j]; - out[addr(i, j) - addr(block.row_start, block.col_start)] = - tconv(val, out[addr(i, j)]); + for (int g = 0; g < this->numGroups(); ++g) { + T* out = BaseType::getBuf() + + g * this->packedBufferSize(block.row_size, block.col_size); + for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { + for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { + T val = tr ? smat_[g * block.row_size + i + ld_ * j] + : smat_[(g * block.row_size + i) * ld_ + j]; + out[addr(i, j)] = tconv(val, out[addr(i, j)]); + } } - } - // fill the remaining with zero. - // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill. - for (int i = block.row_start + block.row_size; - i < (block.row_start + block.row_size + row_interleave_ - 1) / - row_interleave_ * row_interleave_; - ++i) { - for (int j = block.col_start; j < block.col_start + block.col_size; j++) { - out[addr(i, j) - addr(block.row_start, block.col_start)] = - tconv(0, out[addr(i, j)]); + // fill the remaining with zero. + // Please see the comment in PackAMatrix.cc on zero vs zero_pt fill. + for (int i = block.row_start + block.row_size; + i < (block.row_start + block.row_size + row_interleave_ - 1) / + row_interleave_ * row_interleave_; + ++i) { + for (int j = block.col_start; j < block.col_start + block.col_size; j++) { + out[addr(i, j)] = tconv(0, out[addr(i, j)]); + } } - } + } // for each group } template <typename T, typename accT> @@ -109,7 +121,8 @@ void PackBMatrix<T, accT>::printPackedMatrix(std::string name) { << "[" << BaseType::blockRowSize() << ", " << BaseType::blockColSize() << "]" << std::endl; - T* out = BaseType::getBuf(); + T* out = BaseType::getBuf() + + this->packedBufferSize(this->numPackedRows(), this->numPackedCols()); for (auto nr = 0; nr < BaseType::blockRows(); ++nr) { auto rows = (nr == BaseType::blockRows() - 1) ? 
BaseType::lastBrow() : BaseType::blockRowSize(); @@ -150,7 +163,8 @@ bool PackBMatrix<T, accT>::metaEquals(const PackBMatrix<T, accT>& that) const { BaseType::numPackedRows() != that.numPackedRows() || BaseType::numPackedCols() != that.numPackedCols() || BaseType::zeroPoint() != that.zeroPoint() || trans_ != that.trans_ || - G_ != that.G_ || row_interleave_ != that.row_interleave_) { + BaseType::numGroups() != that.numGroups() || + row_interleave_ != that.row_interleave_) { return false; } diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc index b8b8388..a577057 100644 --- a/src/PackMatrix.cc +++ b/src/PackMatrix.cc @@ -18,8 +18,9 @@ PackMatrix<PT, inpType, accType>::PackMatrix( int32_t rows, int32_t cols, inpType* buf, + int groups, int32_t zero_pt) - : buf_(buf), nrows_(rows), ncols_(cols), zero_pt_(zero_pt) { + : buf_(buf), nrows_(rows), ncols_(cols), G_(groups), zero_pt_(zero_pt) { bufAllocatedHere_ = false; if (!cpuinfo_initialize()) { throw std::runtime_error("Failed to initialize cpuinfo!"); diff --git a/src/PackWithQuantRowOffset.cc b/src/PackWithQuantRowOffset.cc index a3d7e0f..2bd4b15 100644 --- a/src/PackWithQuantRowOffset.cc +++ b/src/PackWithQuantRowOffset.cc @@ -25,21 +25,19 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset( inpType* pmat, float scale, int32_t zero_pt, - int32_t groups, + int groups, int32_t* row_offset) : PackMatrix<PackAWithQuantRowOffset<T, accT>, T, accT>( nRow, nCol, pmat, + groups, zero_pt), trans_(trans), smat_(smat), ld_(ld), scale_(scale), - G_(groups), row_offset_(row_offset) { - assert(G_ == 1 && "Groups != 1 not supported yet"); - rowOffsetAllocatedHere = false; if (cpuinfo_has_x86_avx512f()) { @@ -56,6 +54,11 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset( // TODO: Have default slower path assert(0 && "unknown architecure"); } + if (BaseType::numCols() % groups != 0) { + throw std::runtime_error( + "groups = " + std::to_string(groups) + + " does not divide numCols = " + std::to_string(BaseType::numCols())); + } if (pmat) { BaseType::buf_ = pmat; } else { @@ -73,7 +76,6 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset( template <typename T, typename accT> void PackAWithQuantRowOffset<T, accT>::pack(const block_type_t& block) { assert(block.row_start % BaseType::blockRowSize() == 0); - assert(block.col_start % BaseType::blockColSize() == 0); assert(block.row_size <= BaseType::blockRowSize()); assert(block.col_size <= BaseType::blockColSize()); @@ -88,7 +90,8 @@ void PackAWithQuantRowOffset<T, accT>::pack(const block_type_t& block) { T* out = BaseType::getBuf(); bool tr = (trans_ == matrix_op_t::Transpose); // accumulate into row offset? 
- bool row_offset_acc = (block.col_start != 0); + bool row_offset_acc = + (block.col_start % (this->numCols() / this->numGroups())) != 0; int32_t* row_offset_buf = getRowOffsetBuffer(); float smat_transposed[block.row_size * block.col_size]; diff --git a/src/PackWithRowOffset.cc b/src/PackWithRowOffset.cc index dec3f70..1935161 100644 --- a/src/PackWithRowOffset.cc +++ b/src/PackWithRowOffset.cc @@ -22,21 +22,19 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset( const T* smat, uint32_t ld, inpType* pmat, - uint32_t groups, + int groups, int32_t zero_pt, int32_t* row_offset) : PackMatrix<PackAWithRowOffset<T, accT>, T, accT>( nRow, nCol, pmat, + groups, zero_pt), trans_(trans), smat_(smat), ld_(ld), - G_(groups), row_offset_(row_offset) { - assert(G_ == 1 && "Groups != 1 not supported yet"); - rowOffsetAllocatedHere = false; if (cpuinfo_has_x86_avx512f()) { @@ -53,6 +51,11 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset( // TODO: Have default slower path assert(0 && "unknown architecure"); } + if (BaseType::numCols() % groups != 0) { + throw std::runtime_error( + "groups = " + std::to_string(groups) + + " does not divide numCols = " + std::to_string(BaseType::numCols())); + } if (pmat) { BaseType::buf_ = pmat; } else { @@ -70,7 +73,6 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset( template <typename T, typename accT> void PackAWithRowOffset<T, accT>::pack(const block_type_t& block) { assert(block.row_start % BaseType::blockRowSize() == 0); - assert(block.col_start % BaseType::blockColSize() == 0); assert(block.row_size <= BaseType::blockRowSize()); assert(block.col_size <= BaseType::blockColSize()); @@ -85,7 +87,8 @@ void PackAWithRowOffset<T, accT>::pack(const block_type_t& block) { T* out = BaseType::getBuf(); bool tr = (trans_ == matrix_op_t::Transpose); // accumulate into row offset? 
- bool row_offset_acc = (block.col_start != 0); + bool row_offset_acc = + (block.col_start % (this->numCols() / this->numGroups())) != 0; int32_t* row_offset_buf = getRowOffsetBuffer(); if (tr) { for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc index dc41c27..45eb9f9 100644 --- a/src/RefImplementations.cc +++ b/src/RefImplementations.cc @@ -93,8 +93,8 @@ void matmul_u8i8acc32_ref( const uint8_t* Aint8, const int8_t* Bint8, int32_t* Cint32) { - for (int j = 0; j < N; ++j) { - for (int i = 0; i < M; ++i) { + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { int32_t sum = 0; for (int k = 0; k < K; ++k) { sum += static_cast<int32_t>(Aint8[i * lda + k]) * @@ -116,8 +116,8 @@ void matmul_u8i8acc16_ref( const uint8_t* Aint8, const int8_t* Bint8, int32_t* Cint32) { - for (int j = 0; j < N; ++j) { - for (int i = 0; i < M; ++i) { + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { int32_t sum = 0, sum_32bit = 0; for (int k = 0; k < K; k += 2) { int a0 = Aint8[i * lda + k]; @@ -148,8 +148,8 @@ void matmul_fp_ref( const float* Afp32, const float* Bfp32, float* Cfp32) { - for (int j = 0; j < N; ++j) { - for (int i = 0; i < M; ++i) { + for (int i = 0; i < M; ++i) { + for (int j = 0; j < N; ++j) { float sum = 0; for (int k = 0; k < K; ++k) { sum += Afp32[i * lda + k] * Bfp32[k * ldb + j]; @@ -198,8 +198,10 @@ void spmdm_ref( fbgemm::CompressedSparseColumn& B, bool accumulation, int32_t* C, - int ldc) { + int ldc, + int groups /*=1*/) { int N = B.NumOfCols(); + assert(N % groups == 0); if (!accumulation) { for (int i = 0; i < M; ++i) { for (int j = 0; j < N; ++j) { @@ -207,15 +209,17 @@ void spmdm_ref( } } } - for (int j = 0; j < N; ++j) { - for (int k = B.ColPtr()[j]; k < B.ColPtr()[j + 1]; ++k) { - int row = B.RowIdx()[k]; - int w = B.Values()[k]; - for (int i = 0; i < M; ++i) { - C[i * ldc + j] += A[i * lda + row] * w; + for (int g = 0; g < groups; ++g) { + for (int j = g * (N / groups); j < (g + 1) * (N / groups); ++j) { + for (int k = B.ColPtr()[j]; k < B.ColPtr()[j + 1]; ++k) { + int row = g * B.NumOfRows() + B.RowIdx()[k]; + int w = B.Values()[k]; + for (int i = 0; i < M; ++i) { + C[i * ldc + j] += A[i * lda + row] * w; + } } - } - } // for each column of B + } // for each column of B + } // for each group } int32_t clip_16bit(int32_t x) { diff --git a/src/RefImplementations.h b/src/RefImplementations.h index 9e81ce1..cec4bff 100644 --- a/src/RefImplementations.h +++ b/src/RefImplementations.h @@ -125,6 +125,10 @@ void col_offsets_with_zero_pt_s8acc32_ref( /** * @brief Reference implementation of SPMDM (sparse matrix times dense matrix). + * + * @param groups when > 1, for gth group, we multiply + * A[:,g*(A.ncols/groups):(g+1)*(A.ncols/groups)] sub-matrix with + * B[:,g*(B.ncols/groups):(g+1)*(B.ncols/groups)] sub-matrix . */ void spmdm_ref( int M, @@ -133,7 +137,8 @@ void spmdm_ref( CompressedSparseColumn& B, bool accumulation, std::int32_t* C, - int ldc); + int ldc, + int groups = 1); /* * @brief Trim a 32-bit integer to a 16-bit integer. 
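The grouped spmdm_ref above treats B as `groups` sparse k_per_group x n blocks laid side by side in a single CSC matrix, with row indices local to each group (hence the `g * B.NumOfRows() + B.RowIdx()[k]` dense-row mapping). A minimal call sketch with an empty sparse B, assuming the internal src/RefImplementations.h header and the fbgemm namespace; sizes and the empty fill are illustrative:

#include <cstdint>
#include <vector>
#include "fbgemm/Fbgemm.h"
#include "RefImplementations.h"

void grouped_spmdm_sketch() {
  int m = 8, n = 16, groups = 2, k_per_group = 32;
  // One CSC matrix holding all groups: k_per_group rows and
  // groups * n columns; column g*n + j belongs to group g.
  fbgemm::CompressedSparseColumn B_csc(k_per_group, groups * n);
  for (int j = 0; j <= groups * n; ++j) {
    B_csc.ColPtr()[j] = 0; // empty B, just to keep the sketch runnable
  }
  std::vector<std::uint8_t> A(m * groups * k_per_group, 1);
  std::vector<std::int32_t> C(m * groups * n, 0);
  fbgemm::spmdm_ref(m, A.data(), /*lda=*/groups * k_per_group, B_csc,
                    /*accumulation=*/false, C.data(), /*ldc=*/groups * n,
                    groups);
}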
diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc index 391b993..bab3e72 100644 --- a/test/Im2ColFusedRequantizeTest.cc +++ b/test/Im2ColFusedRequantizeTest.cc @@ -135,7 +135,11 @@ TEST(FBGemmIm2colTest, Acc32Test) { PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, int32_t> packA( - conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); + conv_p, + Aint8.data(), + nullptr, + Aint8_zero_point, + row_offset_buf.data()); PackBMatrix<int8_t, int32_t> packedB( matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); @@ -254,7 +258,11 @@ TEST(FBGemmIm2colTest, Acc16Test) { PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, int16_t> packA( - conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); + conv_p, + Aint8.data(), + nullptr, + Aint8_zero_point, + row_offset_buf.data()); PackBMatrix<int8_t, int16_t> packedB( matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); @@ -452,7 +460,11 @@ TEST(FBGemmIm2colTest, 3DAcc32Test) { PackAWithIm2Col<uint8_t, int32_t, 3>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, int32_t, 3> packA( - conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); + conv_p, + Aint8.data(), + nullptr, + Aint8_zero_point, + row_offset_buf.data()); PackBMatrix<int8_t, int32_t> packedB( matrix_op_t::NoTranspose, @@ -588,7 +600,11 @@ TEST(FBGemmIm2colTest, 3DAcc16Test) { PackAWithIm2Col<uint8_t, int16_t, 3>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, int16_t, 3> packA( - conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); + conv_p, + Aint8.data(), + nullptr, + Aint8_zero_point, + row_offset_buf.data()); PackBMatrix<int8_t, int16_t> packedB( matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc index 82ae96f..7f0a7c7 100644 --- a/test/PackedRequantizeAcc16Test.cc +++ b/test/PackedRequantizeAcc16Test.cc @@ -82,141 +82,169 @@ TEST_P(fbgemmu8s8acc16test, Test) { tie(atrans, btrans, test_ld) = GetParam(); for (auto shape : shapes) { - int m = shape[0]; - int n = shape[1]; - int k = shape[2]; - - aligned_vector<uint8_t> Aint8(m * k, 0); - aligned_vector<int8_t> Bint8(k * n, 0); - aligned_vector<int8_t> Bint8_ref(k * n, 0); - aligned_vector<int32_t> Cint32_local(m * n, 0); - aligned_vector<int32_t> Cint32_buffer(m * n, 0); - aligned_vector<int32_t> Cint32_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_local(m * n, 0); - - randFill(Aint8, 0, 255); - int32_t Aint8_zero_point = 43; - - randFill(Bint8_ref, -128, 127); - - for (auto i = 0; i < Bint8.size(); ++i) { - Bint8[i] = Bint8_ref[i]; - } - - if (btrans == matrix_op_t::Transpose) { - transpose_matrix(Bint8.data(), k, n); - } - - int32_t Bint8_zero_point = -30; - // To test lda != k , we just reduce k by half and use the original k - // as lda. 
- int k_adjusted = k; - int n_adjusted = n; - if (test_ld) { - assert( - atrans == matrix_op_t::NoTranspose && "This case is not handled yet"); - k_adjusted = std::max(k / 2, 1); - if (btrans == matrix_op_t::NoTranspose) { - n_adjusted = std::max(n / 2, 1); + for (int groups : {1, 3, 4}) { + int m = shape[0]; + int n = shape[1]; + int k = shape[2]; + if (k % groups != 0) { + continue; } - } - - // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(n_adjusted); - col_offsets_with_zero_pt_s8acc32_ref( - k_adjusted, - n_adjusted, - n, - Bint8_ref.data(), - Bint8_zero_point, - col_offsets.data()); - - vector<int32_t> row_offsets; - row_offsets.resize(m); - - float C_multiplier = 0.1234; - int32_t C_zero_pt = 5; - - int brow = 256; - matmul_u8i8acc16_ref( - m, - n_adjusted, - k_adjusted, - k, - n, - n, - brow, - Aint8.data(), - Bint8_ref.data(), - Cint32_local.data()); - - row_offsets_u8acc32_ref(m, k_adjusted, k, Aint8.data(), row_offsets.data()); - - requantize_u8acc32_ref( - m, - n_adjusted, - n, - Cint32_local.data(), - Cint8_local.data(), - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - row_offsets.data(), - col_offsets.data(), - nullptr); - - vector<int32_t> row_offset_buf; - row_offset_buf.resize( - PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); - - PackAWithRowOffset<uint8_t, int16_t> packAN( - matrix_op_t::NoTranspose, - m, - k_adjusted, - Aint8.data(), - k, - nullptr, - 1, - Aint8_zero_point, - row_offset_buf.data()); - - PackBMatrix<int8_t, int16_t> packedBN( - btrans, - k_adjusted, - n_adjusted, - Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n, - nullptr, - 1, - Bint8_zero_point); - - DoNothing<> doNothingObj{}; - ReQuantizeOutput<false> outputProcObj( - doNothingObj, - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - nullptr); - - fbgemmPacked( - packAN, - packedBN, - Cint8_fb.data(), - Cint32_buffer.data(), - n, - outputProcObj, - 0, - 1); - - compare_validate_buffers( - Cint8_local.data(), Cint8_fb.data(), m, n, n, static_cast<uint8_t>(0)); - } + int k_per_group = k / groups; + + aligned_vector<uint8_t> Aint8(m * k, 0); + + aligned_vector<int8_t> Bint8(k * n, 0); + aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0); + + aligned_vector<int32_t> Cint32_ref(m * n * groups, 0); + aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); + aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0); + + randFill(Aint8, 0, 255); + int32_t Aint8_zero_point = 43; + + randFill(Bint8_ref, -128, 127); + Bint8 = Bint8_ref; + + if (btrans == matrix_op_t::Transpose) { + aligned_vector<int8_t> Bint8_temp(Bint8.size()); + for (int g = 0; g < groups; ++g) { + transpose_matrix( + k_per_group, + n, + Bint8.data() + g * k_per_group * n, + n, + Bint8_temp.data() + g * k_per_group, + groups * k_per_group); + } + Bint8 = Bint8_temp; + } + + int32_t Bint8_zero_point = -30; + // To test lda != k , we just reduce k by half and use the original k + // as lda. 
+ int n_adjusted = n; + if (test_ld) { + assert( + atrans == matrix_op_t::NoTranspose && + "This case is not handled yet"); + if (btrans == matrix_op_t::NoTranspose) { + n_adjusted = std::max(n / 2, 1); + } + } + + // computing column offset + vector<int32_t> col_offsets; + col_offsets.resize(groups * n_adjusted); + for (int g = 0; g < groups; ++g) { + col_offsets_with_zero_pt_s8acc32_ref( + k_per_group, + n_adjusted, + n, + Bint8_ref.data() + g * k_per_group * n, + Bint8_zero_point, + col_offsets.data() + g * n_adjusted); + } + + vector<int32_t> row_offsets; + row_offsets.resize(m); + + float C_multiplier = 0.1234; + int32_t C_zero_pt = 5; + + int brow = 256; + for (int g = 0; g < groups; ++g) { + matmul_u8i8acc16_ref( + m, + n_adjusted, + k_per_group, + k, + n, + groups * n, + brow, + Aint8.data() + g * k_per_group, + Bint8_ref.data() + g * k_per_group * n, + Cint32_ref.data() + g * n_adjusted); + + row_offsets_u8acc32_ref( + m, + k_per_group, + k, + Aint8.data() + g * k_per_group, + row_offsets.data()); + + requantize_u8acc32_ref( + m, + n_adjusted, + groups * n, + Cint32_ref.data() + g * n_adjusted, + Cint8_ref.data() + g * n_adjusted, + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + row_offsets.data(), + col_offsets.data() + g * n_adjusted, + nullptr); + } + + vector<int32_t> row_offset_buf; + row_offset_buf.resize( + PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); + + PackAWithRowOffset<uint8_t, int16_t> packAN( + matrix_op_t::NoTranspose, + m, + k, + Aint8.data(), + k, + nullptr, + groups, + Aint8_zero_point, + row_offset_buf.data()); + + PackBMatrix<int8_t, int16_t> packedBN( + btrans, + k, + n_adjusted, + Bint8.data(), + (btrans == matrix_op_t::Transpose) ? k : n, + nullptr, + groups, + Bint8_zero_point); + + DoNothing<> doNothingObj{}; + ReQuantizeOutput<false> outputProcObj( + doNothingObj, + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr); + + fbgemmPacked( + packAN, + packedBN, + Cint8_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + 0, + 1); + + compare_validate_buffers( + Cint8_ref.data(), + Cint8_fb.data(), + m, + groups * n_adjusted, + groups * n, + static_cast<uint8_t>(0)); + } // for each groups + } // for each shape } /** @@ -230,180 +258,244 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { tie(atrans, btrans, test_ld) = GetParam(); for (auto shape : shapes) { - int m = shape[0]; - int n = shape[1]; - int k = shape[2]; - - aligned_vector<uint8_t> Aint8(m * k, 0); - aligned_vector<int8_t> Bint8(k * n, 0); - aligned_vector<int8_t> Bint8_ref(k * n, 0); - aligned_vector<int32_t> Cint32_local(m * n, 0); - aligned_vector<int32_t> Cint32_buffer(m * n, 0); - aligned_vector<int32_t> Cint32_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_local(m * n, 0); - - randFill(Aint8, 0, 255); - int32_t Aint8_zero_point = 43; - - randFill(Bint8, -128, 127); - - // To test lda != k , we just reduce k by half and use the original k - // as lda. 
- int k_adjusted = k; - int n_adjusted = n; - if (test_ld) { - assert( - atrans == matrix_op_t::NoTranspose && "This case is not handled yet"); - k_adjusted = std::max(k / 2, 1); - if (btrans == matrix_op_t::NoTranspose) { - n_adjusted = std::max(n / 2, 1); - } - } - - int32_t Bint8_zero_point = -30; - // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(n_adjusted); - col_offsets_with_zero_pt_s8acc32_ref( - k_adjusted, - n_adjusted, - n, - Bint8.data(), - Bint8_zero_point, - col_offsets.data()); - - CompressedSparseColumn B_csc(k_adjusted, n_adjusted); - float density = 0.001f; - // deterministic random number - default_random_engine eng; - binomial_distribution<> per_col_nnz_dist(k_adjusted, density); - uniform_int_distribution<> value_dist( - numeric_limits<int8_t>::min() / 2, numeric_limits<int8_t>::max() / 2); - - vector<int> row_indices(k_adjusted); - int total_nnz = 0; - for (int j = 0; j < n_adjusted; ++j) { - B_csc.ColPtr()[j] = total_nnz; - - int nnz_of_j = per_col_nnz_dist(eng); - total_nnz += nnz_of_j; - - iota(row_indices.begin(), row_indices.end(), 0); - shuffle(row_indices.begin(), row_indices.end(), eng); - sort(row_indices.begin(), row_indices.begin() + nnz_of_j); - - for (int kidx = 0; kidx < nnz_of_j; ++kidx) { - B_csc.RowIdx().push_back(row_indices[kidx]); - // put the current B value - B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]); - // make current B value zero - Bint8[row_indices[kidx] * n + j] = 0; - } - } - B_csc.ColPtr()[n_adjusted] = total_nnz; - - for (auto i = 0; i < Bint8.size(); ++i) { - Bint8_ref[i] = Bint8[i]; - } - - if (btrans == matrix_op_t::Transpose) { - transpose_matrix(Bint8.data(), k, n); - } - - vector<int32_t> row_offsets; - row_offsets.resize(m); - - float C_multiplier = 0.1234; - int32_t C_zero_pt = 5; - - int brow = 256; - matmul_u8i8acc16_ref( - m, - n_adjusted, - k_adjusted, - k, - n, - n, - brow, - Aint8.data(), - Bint8_ref.data(), - Cint32_local.data()); - - bool accumulation = true; - spmdm_ref(m, Aint8.data(), k, B_csc, accumulation, Cint32_local.data(), n); - - row_offsets_u8acc32_ref(m, k_adjusted, k, Aint8.data(), row_offsets.data()); - - requantize_u8acc32_ref( - m, - n_adjusted, - n, - Cint32_local.data(), - Cint8_local.data(), - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - row_offsets.data(), - col_offsets.data(), - nullptr); - - vector<int32_t> row_offset_buf; - row_offset_buf.resize( - PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); - - PackAWithRowOffset<uint8_t, int16_t> packAN( - matrix_op_t::NoTranspose, - m, - k_adjusted, - Aint8.data(), - k, - nullptr, - 1, - Aint8_zero_point, - row_offset_buf.data()); - - // spmdm -> requantization -> nothing - // construct an output processing pipeline in reverse order - // i.e. last output operation first - // Last operation should always be DoNothing with - // correct input and output type. 
- DoNothing<> doNothingObj{}; - // The second last operation is requantization back - // to int8 - ReQuantizeOutput<false> reqObj( - doNothingObj, - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - nullptr); - // the top most (first) operation in the output processing - // pipeline is spmdm - // outType = final output type after fullly processing through pipeline - // inType = initial input type at the first call to the whole pipeline - DoSpmdmOnInpBuffer< - ReQuantizeOutput<false>::outType, - int32_t, - ReQuantizeOutput<false>> - spmdmObj(reqObj, Aint8.data(), k, B_csc); - - PackBMatrix<int8_t, int16_t> packedB( - btrans, - k_adjusted, - n_adjusted, - Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n); - - fbgemmPacked( - packAN, packedB, Cint8_fb.data(), Cint32_fb.data(), n, spmdmObj, 0, 1); - - compare_validate_buffers( - Cint8_local.data(), Cint8_fb.data(), m, n, n, static_cast<uint8_t>(0)); - } + for (int groups : {1, 3, 4}) { + // very small density to test hyper sparsity case + // moderate density to test the implementation using transpose + for (float density : {0.0001f, 0.1f}) { + int m = shape[0]; + int n = shape[1]; + int k = shape[2]; + if (k % groups != 0) { + continue; + } + int k_per_group = k / groups; + + aligned_vector<uint8_t> Aint8(m * k, 0); + + aligned_vector<int8_t> Bint8(k * n, 0); + aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0); + + aligned_vector<int32_t> Cint32_ref(m * n * groups, 0); + aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); + aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0); + + randFill(Aint8, 0, 255); + int32_t Aint8_zero_point = 43; + + randFill(Bint8, -128, 127); + + // To test lda != k , we just reduce k by half and use the original k + // as lda. + int n_adjusted = n; + if (test_ld) { + assert( + atrans == matrix_op_t::NoTranspose && + "This case is not handled yet"); + if (btrans == matrix_op_t::NoTranspose) { + n_adjusted = std::max(n / 2, 1); + } + } + + int32_t Bint8_zero_point = -30; + // computing column offset + vector<int32_t> col_offsets; + col_offsets.resize(groups * n_adjusted); + for (int g = 0; g < groups; ++g) { + col_offsets_with_zero_pt_s8acc32_ref( + k_per_group, + n_adjusted, + n, + Bint8_ref.data() + g * k_per_group * n, + Bint8_zero_point, + col_offsets.data() + g * n_adjusted); + } + + CompressedSparseColumn B_csc(k_per_group, groups * n_adjusted); + // Make sure density is big enough. Otherwise, we're not really testing + // spmdm. 
+ // deterministic random number + default_random_engine eng; + binomial_distribution<> per_col_nnz_dist(k_per_group, density); + + vector<int> row_indices(k_per_group); + int total_nnz = 0; + for (int g = 0; g < groups; ++g) { + for (int j = 0; j < n_adjusted; ++j) { + B_csc.ColPtr()[g * n_adjusted + j] = total_nnz; + + int nnz_of_j = per_col_nnz_dist(eng); + total_nnz += nnz_of_j; + + iota(row_indices.begin(), row_indices.end(), 0); + shuffle(row_indices.begin(), row_indices.end(), eng); + sort(row_indices.begin(), row_indices.begin() + nnz_of_j); + + for (int kidx = 0; kidx < nnz_of_j; ++kidx) { + int rowidx = row_indices[kidx]; + B_csc.RowIdx().push_back(rowidx); + int8_t* bptr = &Bint8[(g * k_per_group + rowidx) * n + j]; + int b_remainder = 0; + if (kidx % 2 == 1) { + // Make sure abs(b_prev + *bptr - b_remainder) <= 128 + int b_prev = B_csc.Values().back(); + b_remainder = std::max(b_prev + *bptr - 128, b_remainder); + b_remainder = std::min(b_prev + *bptr + 128, b_remainder); + } + // put a portion of current B value that won't saturate + // _mm256_maddubs_epi16 . + B_csc.Values().push_back(*bptr - b_remainder); + // put the remainder + *bptr = b_remainder; + } + } + } + B_csc.ColPtr()[groups * n_adjusted] = total_nnz; + + Bint8_ref = Bint8; + + if (btrans == matrix_op_t::Transpose) { + aligned_vector<int8_t> Bint8_temp(Bint8.size()); + for (int g = 0; g < groups; ++g) { + transpose_matrix( + k_per_group, + n, + Bint8.data() + g * k_per_group * n, + n, + Bint8_temp.data() + g * k_per_group, + groups * k_per_group); + } + Bint8 = Bint8_temp; + } + + vector<int32_t> row_offsets; + row_offsets.resize(m); + + float C_multiplier = 0.1234; + int32_t C_zero_pt = 5; + + int brow = 256; + for (int g = 0; g < groups; ++g) { + matmul_u8i8acc16_ref( + m, + n_adjusted, + k_per_group, + k, + n, + groups * n, + brow, + Aint8.data() + g * k_per_group, + Bint8_ref.data() + g * k_per_group * n, + Cint32_ref.data() + g * n_adjusted); + } + + bool accumulation = true; + spmdm_ref( + m, + Aint8.data(), + k, + B_csc, + accumulation, + Cint32_ref.data(), + groups * n, + groups); + + for (int g = 0; g < groups; ++g) { + row_offsets_u8acc32_ref( + m, + k_per_group, + k, + Aint8.data() + g * k_per_group, + row_offsets.data()); + + requantize_u8acc32_ref( + m, + n_adjusted, + groups * n, + Cint32_ref.data() + g * n_adjusted, + Cint8_ref.data() + g * n_adjusted, + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + row_offsets.data(), + col_offsets.data() + g * n_adjusted, + nullptr); + } + + vector<int32_t> row_offset_buf; + row_offset_buf.resize( + PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); + + PackAWithRowOffset<uint8_t, int16_t> packAN( + matrix_op_t::NoTranspose, + m, + k, + Aint8.data(), + k, + nullptr, + groups, + Aint8_zero_point, + row_offset_buf.data()); + + // spmdm -> requantization -> nothing + // construct an output processing pipeline in reverse order + // i.e. last output operation first + // Last operation should always be DoNothing with + // correct input and output type. 
+ DoNothing<> doNothingObj{}; + // The second last operation is requantization back + // to int8 + ReQuantizeOutput<false> reqObj( + doNothingObj, + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr); + // the top most (first) operation in the output processing + // pipeline is spmdm + // outType = final output type after fullly processing through pipeline + // inType = initial input type at the first call to the whole pipeline + DoSpmdmOnInpBuffer< + ReQuantizeOutput<false>::outType, + int32_t, + ReQuantizeOutput<false>> + spmdmObj(reqObj, Aint8.data(), k, B_csc, groups); + + PackBMatrix<int8_t, int16_t> packedB( + btrans, + k, + n_adjusted, + Bint8.data(), + (btrans == matrix_op_t::Transpose) ? k : n, + nullptr, + groups, + Bint8_zero_point); + + fbgemmPacked( + packAN, + packedB, + Cint8_fb.data(), + Cint32_fb.data(), + groups * n, + spmdmObj, + 0, + 1); + + compare_validate_buffers( + Cint8_ref.data(), + Cint8_fb.data(), + m, + groups * n_adjusted, + groups * n, + static_cast<uint8_t>(0)); + } // for each density + } // for each groups + } // for each shape } /** @@ -417,119 +509,140 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) { tie(atrans, btrans, test_ld) = GetParam(); for (auto shape : shapes) { - int m = shape[0]; - int n = shape[1]; - int k = shape[2]; - - aligned_vector<uint8_t> Aint8(m * k, 0); - aligned_vector<int8_t> Bint8(k * n, 0); - aligned_vector<int8_t> Bint8_ref(k * n, 0); - aligned_vector<int32_t> Cint32_local(m * n, 0); - aligned_vector<int32_t> Cint32_buffer(m * n, 0); - aligned_vector<int32_t> Cint32_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_local(m * n, 0); - - randFill(Aint8, 0, 255); - int32_t Aint8_zero_point = 43; - - randFill(Bint8_ref, -128, 127); - - for (auto i = 0; i < Bint8.size(); ++i) { - Bint8[i] = Bint8_ref[i]; - } - - if (btrans == matrix_op_t::Transpose) { - transpose_matrix(Bint8.data(), k, n); - } - - int32_t Bint8_zero_point = -30; - // To test lda != k , we just reduce k by half and use the original k - // as lda. 
- int k_adjusted = k; - int n_adjusted = n; - if (test_ld) { - assert( - atrans == matrix_op_t::NoTranspose && "This case is not handled yet"); - k_adjusted = std::max(k / 2, 1); - if (btrans == matrix_op_t::NoTranspose) { - n_adjusted = std::max(n / 2, 1); + for (int groups : {1, 3, 4}) { + int m = shape[0]; + int n = shape[1]; + int k = shape[2]; + if (k % groups != 0) { + continue; + } + int k_per_group = k / groups; + + aligned_vector<uint8_t> Aint8(m * k, 0); + + aligned_vector<int8_t> Bint8(k * n, 0); + aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0); + + aligned_vector<int32_t> Cint32_ref(m * n * groups, 0); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0); + + randFill(Aint8, 0, 255); + int32_t Aint8_zero_point = 43; + + randFill(Bint8_ref, -128, 127); + Bint8 = Bint8_ref; + + if (btrans == matrix_op_t::Transpose) { + aligned_vector<int8_t> Bint8_temp(Bint8.size()); + for (int g = 0; g < groups; ++g) { + transpose_matrix( + k_per_group, + n, + Bint8.data() + g * k_per_group * n, + n, + Bint8_temp.data() + g * k_per_group, + groups * k_per_group); + } + Bint8 = Bint8_temp; } - } - - // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(n_adjusted); - col_offsets_with_zero_pt_s8acc32_ref( - k_adjusted, - n_adjusted, - n, - Bint8_ref.data(), - Bint8_zero_point, - col_offsets.data()); - - vector<int32_t> row_offsets; - row_offsets.resize(m); - - int brow = 256; - matmul_u8i8acc16_ref( - m, - n_adjusted, - k_adjusted, - k, - n, - n, - brow, - Aint8.data(), - Bint8_ref.data(), - Cint32_local.data()); - - row_offsets_u8acc32_ref(m, k_adjusted, k, Aint8.data(), row_offsets.data()); - - vector<int32_t> row_offset_buf; - row_offset_buf.resize( - PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); - - PackAWithRowOffset<uint8_t, int16_t> packAN( - matrix_op_t::NoTranspose, - m, - k_adjusted, - Aint8.data(), - k, - nullptr, - 1, - Aint8_zero_point, - row_offset_buf.data()); - - PackBMatrix<int8_t, int16_t> packedBN( - btrans, - k_adjusted, - n_adjusted, - Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n, - nullptr, - 1, - Bint8_zero_point); - - // DoNothing<> doNothingObj{}; - DoNothing<int32_t, int32_t> doNothingObj{}; - memCopy<> outputProcObj(doNothingObj); - fbgemmPacked( - packAN, - packedBN, - Cint32_fb.data(), - Cint32_buffer.data(), - n, - outputProcObj, - 0, - 1); - - compare_validate_buffers( - Cint32_local.data(), - Cint32_fb.data(), - m, - n, - n, - static_cast<int32_t>(0)); - } + + int32_t Bint8_zero_point = -30; + // To test lda != k , we just reduce k by half and use the original k + // as lda. 
+ int n_adjusted = n; + if (test_ld) { + assert( + atrans == matrix_op_t::NoTranspose && + "This case is not handled yet"); + if (btrans == matrix_op_t::NoTranspose) { + n_adjusted = std::max(n / 2, 1); + } + } + + // computing column offset + vector<int32_t> col_offsets; + col_offsets.resize(groups * n_adjusted); + for (int g = 0; g < groups; ++g) { + col_offsets_with_zero_pt_s8acc32_ref( + k_per_group, + n_adjusted, + n, + Bint8_ref.data() + g * k_per_group * n, + Bint8_zero_point, + col_offsets.data() + g * n_adjusted); + } + + vector<int32_t> row_offsets; + row_offsets.resize(m); + + int brow = 256; + for (int g = 0; g < groups; ++g) { + matmul_u8i8acc16_ref( + m, + n_adjusted, + k_per_group, + k, + n, + groups * n, + brow, + Aint8.data() + g * k_per_group, + Bint8_ref.data() + g * k_per_group * n, + Cint32_ref.data() + g * n_adjusted); + + row_offsets_u8acc32_ref( + m, + k_per_group, + k, + Aint8.data() + g * k_per_group, + row_offsets.data()); + } + + vector<int32_t> row_offset_buf; + row_offset_buf.resize( + PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); + + PackAWithRowOffset<uint8_t, int16_t> packAN( + matrix_op_t::NoTranspose, + m, + k, + Aint8.data(), + k, + nullptr, + groups, + Aint8_zero_point, + row_offset_buf.data()); + + PackBMatrix<int8_t, int16_t> packedBN( + btrans, + k, + n_adjusted, + Bint8.data(), + (btrans == matrix_op_t::Transpose) ? k : n, + nullptr, + groups, + Bint8_zero_point); + + // DoNothing<> doNothingObj{}; + DoNothing<int32_t, int32_t> doNothingObj{}; + memCopy<> outputProcObj(doNothingObj); + fbgemmPacked( + packAN, + packedBN, + Cint32_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + 0, + 1); + + compare_validate_buffers( + Cint32_ref.data(), + Cint32_fb.data(), + m, + groups * n_adjusted, + groups * n, + static_cast<int32_t>(0)); + } // for each groups + } // for each shape } diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc index e0c9850..7653989 100644 --- a/test/PackedRequantizeTest.cc +++ b/test/PackedRequantizeTest.cc @@ -83,145 +83,191 @@ TEST_P(fbgemmu8s8acc32test, Test) { tie(atrans, btrans, test_ld) = GetParam(); for (auto shape : shapes) { - int m = shape[0]; - int n = shape[1]; - int k = shape[2]; - - aligned_vector<uint8_t> Aint8(m * k, 0); - - // nxk matrix - aligned_vector<int8_t> Bint8(k * n, 0); - // kxn matrix - aligned_vector<int8_t> Bint8_ref(k * n, 0); - - aligned_vector<int32_t> Cint32_ref(m * n, 0.0f); - aligned_vector<int32_t> Cint32_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_fb(m * n, 0); - aligned_vector<int32_t> Cint32_local(m * n, 0); - aligned_vector<int32_t> Cint32_buffer(m * n, 0); - aligned_vector<uint8_t> Cint8_local(m * n, 0); - - randFill(Aint8, 0, 255); - int32_t Aint8_zero_point = 43; - - randFill(Bint8_ref, -128, 127); - avoidOverflow(m, n, k, Aint8.data(), Bint8_ref.data()); - - for (auto i = 0; i < Bint8.size(); ++i) { - Bint8[i] = Bint8_ref[i]; - } - - if (btrans == matrix_op_t::Transpose) { - transpose_matrix(Bint8.data(), k, n); - } - - int32_t Bint8_zero_point = -30; - // To test lda != k , we just reduce k by half and use the original k - // as lda. 
- int k_adjusted = k; - int n_adjusted = n; - if (test_ld) { - assert( - atrans == matrix_op_t::NoTranspose && "This case is not handled yet"); - k_adjusted = std::max(k / 2, 1); - if (btrans == matrix_op_t::NoTranspose) { - n_adjusted = std::max(n / 2, 1); - } - } - - // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(n_adjusted); - col_offsets_with_zero_pt_s8acc32_ref( - k_adjusted, - n_adjusted, - n, - Bint8_ref.data(), - Bint8_zero_point, - col_offsets.data()); - - vector<int32_t> row_offsets; - row_offsets.resize(m); - - float C_multiplier = 0.1234; - int32_t C_zero_pt = 5; - - matmul_u8i8acc32_ref( - m, - n_adjusted, - k_adjusted, - k, - n, - n, - Aint8.data(), - Bint8_ref.data(), - Cint32_local.data()); - - row_offsets_u8acc32_ref(m, k_adjusted, k, Aint8.data(), row_offsets.data()); - - requantize_u8acc32_ref( - m, - n_adjusted, - n, - Cint32_local.data(), - Cint8_local.data(), - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - row_offsets.data(), - col_offsets.data(), - nullptr); - - vector<int32_t> row_offset_buf; - row_offset_buf.resize(PackAWithRowOffset<uint8_t>::rowOffsetBufferSize()); - - PackAWithRowOffset<uint8_t> packAN( - matrix_op_t::NoTranspose, - m, - k_adjusted, - Aint8.data(), - k, - nullptr, - 1, - Aint8_zero_point, - row_offset_buf.data()); - - PackBMatrix<int8_t> packedBN( - btrans, - k_adjusted, - n_adjusted, - Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n, - nullptr, - 1, - Bint8_zero_point); - - DoNothing<> doNothingObj{}; - ReQuantizeOutput<false> outputProcObj( - doNothingObj, - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - nullptr); - - fbgemmPacked( - packAN, - packedBN, - Cint8_fb.data(), - Cint32_buffer.data(), - n, - outputProcObj, - 0, - 1); - // printMatrix(matrix_op_t::NoTranspose, Cint32_local.data(), - // m, n_adjusted, n, "C local"); - compare_validate_buffers( - Cint8_local.data(), Cint8_fb.data(), m, n, n, static_cast<uint8_t>(0)); - } + for (int groups : {1, 3, 4}) { + for (bool test_bias: {false, true}) { + int m = shape[0]; + int n = shape[1]; + int k = shape[2]; + if (k % groups != 0) { + continue; + } + int k_per_group = k / groups; + + // mxk matrix + aligned_vector<uint8_t> Aint8(m * k, 0); + + // kxn matrix + aligned_vector<int8_t> Bint8(k * n, 0); + aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0); + + aligned_vector<int32_t> Cint32_ref(m * n * groups, 0); + aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); + aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0); + + randFill(Aint8, 0, 255); + int32_t Aint8_zero_point = 43; + + randFill(Bint8_ref, -128, 127); + for (int g = 0; g < groups; ++g) { + avoidOverflow( + m, + n, + k_per_group, + Aint8.data() + g * k_per_group, + k, + Bint8_ref.data() + g * k_per_group * n, + n); + } + + Bint8 = Bint8_ref; + + // initialize bias + aligned_vector<int32_t> bias_int32(groups * n); + int32_t* bias = nullptr; + if (test_bias) { + randFill(bias_int32, -128, 127); + bias = bias_int32.data(); + } + + if (btrans == matrix_op_t::Transpose) { + aligned_vector<int8_t> Bint8_temp(Bint8.size()); + for (int g = 0; g < groups; ++g) { + transpose_matrix( + k_per_group, + n, + Bint8.data() + g * k_per_group * n, + n, + Bint8_temp.data() + g * k_per_group, + groups * k_per_group); + } + Bint8 = Bint8_temp; + } + + int32_t Bint8_zero_point = -30; + // 
+        // To test lda != k , we just reduce k by half and use the original k
+        // as lda.
+        int n_adjusted = n;
+        if (test_ld) {
+          assert(
+              atrans == matrix_op_t::NoTranspose &&
+              "This case is not handled yet");
+          if (btrans == matrix_op_t::NoTranspose) {
+            n_adjusted = std::max(n / 2, 1);
+          }
+        }
+
+        // computing column offset
+        vector<int32_t> col_offsets;
+        col_offsets.resize(groups * n_adjusted);
+        for (int g = 0; g < groups; ++g) {
+          col_offsets_with_zero_pt_s8acc32_ref(
+              k_per_group,
+              n_adjusted,
+              n,
+              Bint8_ref.data() + g * k_per_group * n,
+              Bint8_zero_point,
+              col_offsets.data() + g * n_adjusted);
+        }
+
+        vector<int32_t> row_offsets;
+        row_offsets.resize(m);
+
+        float C_multiplier = 0.001234;
+        int32_t C_zero_pt = 5;
+
+        for (int g = 0; g < groups; ++g) {
+          matmul_u8i8acc32_ref(
+              m,
+              n_adjusted,
+              k_per_group,
+              k,
+              n,
+              groups * n,
+              Aint8.data() + g * k_per_group,
+              Bint8_ref.data() + g * k_per_group * n,
+              Cint32_ref.data() + g * n_adjusted);
+
+          row_offsets_u8acc32_ref(
+              m,
+              k_per_group,
+              k,
+              Aint8.data() + g * k_per_group,
+              row_offsets.data());
+
+          requantize_u8acc32_ref(
+              m,
+              n_adjusted,
+              groups * n,
+              Cint32_ref.data() + g * n_adjusted,
+              Cint8_ref.data() + g * n_adjusted,
+              C_multiplier,
+              C_zero_pt,
+              Aint8_zero_point,
+              Bint8_zero_point,
+              row_offsets.data(),
+              col_offsets.data() + g * n_adjusted,
+              bias ? (bias + g * n_adjusted) : nullptr);
+        }
+
+        vector<int32_t> row_offset_buf;
+        row_offset_buf.resize(
+            PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
+
+        PackAWithRowOffset<uint8_t> packAN(
+            matrix_op_t::NoTranspose,
+            m,
+            k,
+            Aint8.data(),
+            k,
+            nullptr,
+            groups,
+            Aint8_zero_point,
+            row_offset_buf.data());
+
+        PackBMatrix<int8_t> packedBN(
+            btrans,
+            k,
+            n_adjusted,
+            Bint8.data(),
+            (btrans == matrix_op_t::Transpose) ? k : n,
+            nullptr,
+            groups,
+            Bint8_zero_point);
+
+        DoNothing<> doNothingObj{};
+        ReQuantizeOutput<false> outputProcObj(
+            doNothingObj,
+            C_multiplier,
+            C_zero_pt,
+            Aint8_zero_point,
+            Bint8_zero_point,
+            packAN.getRowOffsetBuffer(),
+            col_offsets.data(),
+            bias);
+
+        fbgemmPacked(
+            packAN,
+            packedBN,
+            Cint8_fb.data(),
+            Cint32_buffer.data(),
+            groups * n,
+            outputProcObj,
+            0,
+            1);
+        // printMatrix(matrix_op_t::NoTranspose, Cint32_local.data(),
+        // m, n_adjusted, n, "C local");
+        compare_validate_buffers(
+            Cint8_ref.data(),
+            Cint8_fb.data(),
+            m,
+            groups * n_adjusted,
+            groups * n,
+            static_cast<uint8_t>(0));
+      } // test_bias
+    } // for each groups
+  } // for each shape
 }
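A note on the grouped semantics this hunk exercises: with G groups, the GEMM decomposes into G independent multiplications of column slices of A against row blocks of B, each writing one column block of C. That is why the reference calls above offset A by g * k_per_group, B by g * k_per_group * n, and C by g * n_adjusted, while the packed path gets the same semantics from the groups argument passed to PackAWithRowOffset and PackBMatrix. A minimal sketch of that decomposition (grouped_gemm_ref is a hypothetical helper for illustration, not FBGEMM API):

    #include <cstdint>

    // A is m x k (lda = k); B stores G row-major blocks of k_per_group x n
    // stacked along rows; C is m x (G * n) with ldc = G * n.
    void grouped_gemm_ref(
        int m, int n, int k, int G,
        const std::uint8_t* A, const std::int8_t* B, std::int32_t* C) {
      int k_per_group = k / G; // the test skips shapes where G does not divide k
      for (int g = 0; g < G; ++g) {
        const std::uint8_t* Ag = A + g * k_per_group;    // column slice of A
        const std::int8_t* Bg = B + g * k_per_group * n; // row block of B
        std::int32_t* Cg = C + g * n;                    // column block of C
        for (int i = 0; i < m; ++i) {
          for (int j = 0; j < n; ++j) {
            std::int32_t sum = 0;
            for (int kk = 0; kk < k_per_group; ++kk) {
              sum += static_cast<std::int32_t>(Ag[i * k + kk]) *
                  static_cast<std::int32_t>(Bg[kk * n + j]);
            }
            Cg[i * G * n + j] = sum;
          }
        }
      }
    }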

 /**
@@ -236,129 +282,163 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
   tie(atrans, btrans, test_ld) = GetParam();

   for (auto shape : shapes) {
-    int m = shape[0];
-    int n = shape[1];
-    int k = shape[2];
-
-    aligned_vector<float> Afp32(m * k, 0.0f);
-    aligned_vector<uint8_t> Aint8(m * k, 0);
-
-    aligned_vector<float> Bfp32(k * n, 0.0f);
-    aligned_vector<int8_t> Bint8(k * n, 0);
-
-    aligned_vector<float> Cfp32_ref(m * n, 0.0f);
-    aligned_vector<float> Cfp32_fb(m * n, 0.0f);
-
-    aligned_vector<uint8_t> Cint8_fb(m * n, 0);
-    aligned_vector<int32_t> Cint32_buffer(m * n, 0);
-
-    randFill(Aint8, 0, 255);
-    int32_t Aint8_zero_point = 43;
-    float Aint8_scale = 0.11;
-    for (auto i = 0; i < Afp32.size(); ++i) {
-      Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point);
-    }
-
-    randFill(Bint8, -128, 127);
-    avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
-    int32_t Bint8_zero_point = -30;
-    float Bint8_scale = 0.49;
-    for (auto i = 0; i < Bfp32.size(); ++i) {
-      Bfp32[i] = Bint8_scale * (Bint8[i] - Bint8_zero_point);
-    }
-
-    // To test lda != k , we just reduce k by half and use the original k
-    // as lda.
-    int k_adjusted = k;
-    int n_adjusted = n;
-    if (test_ld) {
-      assert(
-          atrans == matrix_op_t::NoTranspose && "This case is not handled yet");
-      k_adjusted = std::max(k / 2, 1);
-      if (btrans == matrix_op_t::NoTranspose) {
-        n_adjusted = std::max(n / 2, 1);
+    for (int groups : {1, 3, 4}) {
+      int m = shape[0];
+      int n = shape[1];
+      int k = shape[2];
+      if (k % groups != 0) {
+        continue;
+      }
+      int k_per_group = k / groups;
+
+      aligned_vector<float> Afp32(m * k, 0.0f);
+      aligned_vector<uint8_t> Aint8(Afp32.size(), 0);
+
+      aligned_vector<float> Bfp32(k * n, 0.0f);
+      aligned_vector<int8_t> Bint8(Bfp32.size(), 0);
+
+      aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f);
+      aligned_vector<float> Cfp32_fb(Cfp32_ref.size(), 0.0f);
+
+      aligned_vector<uint8_t> Cint8_fb(Cfp32_ref.size(), 0);
+      aligned_vector<int32_t> Cint32_buffer(Cfp32_ref.size(), 0);
+
+      randFill(Aint8, 0, 255);
+      int32_t Aint8_zero_point = 43;
+      float Aint8_scale = 0.11;
+      for (auto i = 0; i < Afp32.size(); ++i) {
+        Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point);
+      }
+
+      randFill(Bint8, -128, 127);
+      for (int g = 0; g < groups; ++g) {
+        avoidOverflow(
+            m,
+            n,
+            k_per_group,
+            Aint8.data() + g * k_per_group,
+            k,
+            Bint8.data() + g * k_per_group * n,
+            n);
+      }
+      int32_t Bint8_zero_point = -30;
+      float Bint8_scale = 0.49;
+      for (auto i = 0; i < Bfp32.size(); ++i) {
+        Bfp32[i] = Bint8_scale * (Bint8[i] - Bint8_zero_point);
+      }
+
+      // To test lda != k , we just reduce k by half and use the original k
+      // as lda.
+      int n_adjusted = n;
+      if (test_ld) {
+        assert(
+            atrans == matrix_op_t::NoTranspose &&
+            "This case is not handled yet");
+        if (btrans == matrix_op_t::NoTranspose) {
+          n_adjusted = std::max(n / 2, 1);
+        }
+      }
+
+      // computing column offset
+      vector<int32_t> col_offsets;
+      col_offsets.resize(groups * n_adjusted);
+      for (int g = 0; g < groups; ++g) {
+        col_offsets_with_zero_pt_s8acc32_ref(
+            k_per_group,
+            n_adjusted,
+            n,
+            Bint8.data() + g * k_per_group * n,
+            Bint8_zero_point,
+            col_offsets.data() + g * n_adjusted);
+      }
+
+      if (btrans == matrix_op_t::Transpose) {
+        aligned_vector<int8_t> Bint8_temp(Bint8.size());
+        for (int g = 0; g < groups; ++g) {
+          transpose_matrix(
+              k_per_group,
+              n,
+              Bint8.data() + g * k_per_group * n,
+              n,
+              Bint8_temp.data() + g * k_per_group,
+              groups * k_per_group);
+        }
+        Bint8 = Bint8_temp;
+      }
+
+      for (int g = 0; g < groups; ++g) {
+        matmul_fp_ref(
+            m,
+            n_adjusted,
+            k_per_group,
+            k,
+            n,
+            groups * n,
+            Afp32.data() + g * k_per_group,
+            Bfp32.data() + g * k_per_group * n,
+            Cfp32_ref.data() + g * n_adjusted);
       }
-    }
-
-    // computing column offset
-    vector<int32_t> col_offsets;
-    col_offsets.resize(n_adjusted);
-    col_offsets_with_zero_pt_s8acc32_ref(
-        k_adjusted,
-        n_adjusted,
-        n,
-        Bint8.data(),
-        Bint8_zero_point,
-        col_offsets.data());
-
-    if (btrans == matrix_op_t::Transpose) {
-      transpose_matrix(Bint8.data(), k, n);
-    }
-
-    matmul_fp_ref(
-        m,
-        n_adjusted,
-        k_adjusted,
-        k,
-        n,
-        n,
-        Afp32.data(),
-        Bfp32.data(),
-        Cfp32_ref.data());
-
-    vector<int32_t> row_offset_buf;
-    row_offset_buf.resize(
-        PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
-
-    PackAWithQuantRowOffset<uint8_t> packAN(
-        matrix_op_t::NoTranspose,
-        m,
-        k_adjusted,
-        Afp32.data(),
-        k,
-        nullptr, /*buffer for packed matrix*/
-        Aint8_scale,
-        Aint8_zero_point,
-        1, /*groups*/
-        row_offset_buf.data());
-
-    PackBMatrix<int8_t> packedBN(
-        btrans,
-        k_adjusted,
-        n_adjusted,
-        Bint8.data(),
-        (btrans == matrix_op_t::Transpose) ? k : n,
-        nullptr,
-        1,
-        Bint8_zero_point);
-
-    DoNothing<float, float> doNothingObj{};
-    ReQuantizeForFloat<false> outputProcObj(
-        doNothingObj,
-        Aint8_scale,
-        Bint8_scale,
-        Aint8_zero_point,
-        Bint8_zero_point,
-        packAN.getRowOffsetBuffer(),
-        col_offsets.data(),
-        nullptr);
-
-    fbgemmPacked(
-        packAN,
-        packedBN,
-        Cfp32_fb.data(),
-        (int32_t*)Cfp32_fb.data(),
-        n,
-        outputProcObj,
-        0,
-        1);
-
-    float maximum = *max_element(Cfp32_ref.begin(), Cfp32_ref.end());
-    float minimum = *min_element(Cfp32_ref.begin(), Cfp32_ref.end());
-    float atol = (maximum - minimum) / 255 / 1.9;
-
-    compare_validate_buffers(Cfp32_ref.data(), Cfp32_fb.data(), m, n, n, atol);
-  }
+
+      vector<int32_t> row_offset_buf;
+      row_offset_buf.resize(
+          PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
+
+      PackAWithQuantRowOffset<uint8_t> packAN(
+          matrix_op_t::NoTranspose,
+          m,
+          k,
+          Afp32.data(),
+          k,
+          nullptr, /*buffer for packed matrix*/
+          Aint8_scale,
+          Aint8_zero_point,
+          groups,
+          row_offset_buf.data());
+
+      PackBMatrix<int8_t> packedBN(
+          btrans,
+          k,
+          n_adjusted,
+          Bint8.data(),
+          (btrans == matrix_op_t::Transpose) ? k : n,
+          nullptr,
+          groups,
+          Bint8_zero_point);
+
+      DoNothing<float, float> doNothingObj{};
+      ReQuantizeForFloat<false> outputProcObj(
+          doNothingObj,
+          Aint8_scale,
+          Bint8_scale,
+          Aint8_zero_point,
+          Bint8_zero_point,
+          packAN.getRowOffsetBuffer(),
+          col_offsets.data(),
+          nullptr);
+
+      fbgemmPacked(
+          packAN,
+          packedBN,
+          Cfp32_fb.data(),
+          reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+          groups * n,
+          outputProcObj,
+          0,
+          1);
+
+      float maximum = *max_element(Cfp32_ref.begin(), Cfp32_ref.end());
+      float minimum = *min_element(Cfp32_ref.begin(), Cfp32_ref.end());
+      float atol = (maximum - minimum) / 255 / 1.9;
+
+      compare_validate_buffers(
+          Cfp32_ref.data(),
+          Cfp32_fb.data(),
+          m,
+          groups * n_adjusted,
+          groups * n,
+          atol);
+    } // for each groups
+  } // for each shape
 }
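A note on the tolerance computed above: the float-output path cannot be compared bit-for-bit against the fp32 reference, so atol is scaled to the output range. The formula treats the observed range as if quantized to uint8, one step being (maximum - minimum) / 255, and accepts slightly more than half a step of discrepancy (dividing by 1.9 rather than 2 leaves a little slack for rounding). For example, a reference output spanning [-100, 155] has range 255, so one step is 255 / 255 = 1.0 and atol = 1.0 / 1.9, roughly 0.526.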

 /**
@@ -373,253 +453,121 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
   tie(atrans, btrans, test_ld) = GetParam();

   for (auto shape : shapes) {
-    int m = shape[0];
-    int n = shape[1];
-    int k = shape[2];
-
-    aligned_vector<float> Afp32(m * k, 0.0f);
-    aligned_vector<uint8_t> Aint8(m * k, 0);
-
-    aligned_vector<float> Bfp32(k * n, 0.0f);
-    aligned_vector<int8_t> Bint8(k * n, 0);
-
-    aligned_vector<float> Cfp32_ref(m * n, 0.0f);
-    aligned_vector<int32_t> Cint32_fb(m * n, 0);
-
-    randFill(Afp32, 0, 255);
-    for (auto i = 0; i < Afp32.size(); i++) {
-      Aint8[i] = (uint8_t)Afp32[i];
-    }
-
-    // initialize B matrix
-    randFill(Bfp32, -128, 127);
-    avoidOverflow(m, n, k, Aint8.data(), Bfp32.data());
-
-    for (auto i = 0; i < Bfp32.size(); ++i) {
-      Bint8[i] = (int8_t)Bfp32[i];
-    }
-
-    // To test lda != k , we just reduce k by half and use the original k
-    // as lda.
-    int m_adjusted = m;
-    int n_adjusted = n;
-    int k_adjusted = k;
-    if (test_ld) {
-      assert(
-          atrans == matrix_op_t::NoTranspose && "This case is not handled yet");
-      k_adjusted = std::max(k / 2, 1);
-      if (btrans == matrix_op_t::NoTranspose) {
-        n_adjusted = std::max(n / 2, 1);
-      }
-    }
-
-    if (btrans == matrix_op_t::Transpose) {
-      transpose_matrix(Bint8.data(), k, n);
-    }
-
-    matmul_fp_ref(
-        m,
-        n_adjusted,
-        k_adjusted,
-        k,
-        n,
-        n,
-        Afp32.data(),
-        Bfp32.data(),
-        Cfp32_ref.data());
-
-    DoNothing<int32_t, int32_t> doNothingObj{};
-    memCopy<> outputProcObj(doNothingObj);
-    // A zero point and row offset not required
-    PackAMatrix<uint8_t> packAN(
-        matrix_op_t::NoTranspose, m, k_adjusted, Aint8.data(), k);
-
-    // B zero point defaults to 0
-    PackBMatrix<int8_t> packedBN(
-        btrans,
-        k_adjusted,
-        n_adjusted,
-        Bint8.data(),
-        (btrans == matrix_op_t::Transpose) ? k : n);
-
-    fbgemmPacked(
-        packAN,
-        packedBN,
-        Cint32_fb.data(),
-        Cint32_fb.data(),
-        n,
-        outputProcObj,
-        0,
-        1);
-
-    // correctness check
-    for (int i = 0; i < m_adjusted; ++i) {
-      for (int j = 0; j < n_adjusted; ++j) {
-        float expected = Cfp32_ref[i * n + j];
-        int32_t actual = Cint32_fb[i * n + j];
-        EXPECT_EQ(expected, actual)
-            << "GEMM results differ at (" << i << ", " << j << "). ref "
-            << expected << " FBGemm " << actual;
+    for (int groups : {1, 3, 4}) {
+      int m = shape[0];
+      int n = shape[1];
+      int k = shape[2];
+      if (k % groups != 0) {
+        continue;
       }
-    }
-  }
-}
+      int k_per_group = k / groups;

-/**
- * @brief Unit test for unt8 matrix A, int8 matrix B, and 32-bit
- * accumulation. Output processing: requantization with bias -> nothing.
- * Asymmetric: the zero point is not 0.
- */
-TEST_P(fbgemmu8s8acc32test, TestAsymmetricQuantizedWithBias) {
-  vector<vector<int>> shapes(GetShapes_());
-  matrix_op_t atrans, btrans;
-  bool test_ld;
-  tie(atrans, btrans, test_ld) = GetParam();
+      aligned_vector<float> Afp32(m * k, 0.0f);
+      aligned_vector<uint8_t> Aint8(Afp32.size(), 0);

-  for (auto shape : shapes) {
-    int m = shape[0];
-    int n = shape[1];
-    int k = shape[2];
+      aligned_vector<float> Bfp32(k * n, 0.0f);
+      aligned_vector<int8_t> Bint8(Bfp32.size(), 0);

-    aligned_vector<uint8_t> Aint8(m * k, 0);
-    aligned_vector<uint8_t> Aint8_ref(m * k, 0);
+      aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f);
+      aligned_vector<int32_t> Cint32_fb(Cfp32_ref.size(), 0);

-    aligned_vector<int8_t> Bint8(k * n, 0);
-    aligned_vector<int8_t> Bint8_ref(k * n, 0);
+      randFill(Afp32, 0, 255);
+      for (auto i = 0; i < Afp32.size(); i++) {
+        Aint8[i] = (uint8_t)Afp32[i];
+      }

-    aligned_vector<int32_t> Cint32_fb(m * n, 0);
-    aligned_vector<int32_t> Cint32_ref(m * n, 0);
+      // initialize B matrix
+      randFill(Bfp32, -128, 127);
+      for (int g = 0; g < groups; ++g) {
+        avoidOverflow(
+            m,
+            n,
+            k_per_group,
+            Aint8.data() + g * k_per_group,
+            k,
+            Bfp32.data() + g * k_per_group * n,
+            n);
+      }

-    aligned_vector<uint8_t> Cint8_fb(m * n, 0);
-    aligned_vector<uint8_t> Cint8_ref(m * n, 0);
+      for (auto i = 0; i < Bfp32.size(); ++i) {
+        Bint8[i] = (int8_t)Bfp32[i];
+      }

-    int n_adjusted = n;
-    int k_adjusted = k;
+      // To test lda != k , we just reduce k by half and use the original k
+      // as lda.
+      int n_adjusted = n;
+      if (test_ld) {
+        assert(
+            atrans == matrix_op_t::NoTranspose &&
+            "This case is not handled yet");
+        if (btrans == matrix_op_t::NoTranspose) {
+          n_adjusted = std::max(n / 2, 1);
+        }
+      }
+
+      if (btrans == matrix_op_t::Transpose) {
+        aligned_vector<int8_t> Bint8_temp(Bint8.size());
+        for (int g = 0; g < groups; ++g) {
+          transpose_matrix(
+              k_per_group,
+              n,
+              Bint8.data() + g * k_per_group * n,
+              n,
+              Bint8_temp.data() + g * k_per_group,
+              groups * k_per_group);
+        }
+        Bint8 = Bint8_temp;
+      }
+
+      for (int g = 0; g < groups; ++g) {
+        matmul_fp_ref(
+            m,
+            n_adjusted,
+            k_per_group,
+            k,
+            n,
+            groups * n,
+            Afp32.data() + g * k_per_group,
+            Bfp32.data() + g * k_per_group * n,
+            Cfp32_ref.data() + g * n_adjusted);
+      }

-    if (test_ld) {
-      assert(
-          atrans == matrix_op_t::NoTranspose && "This case is not handled yet");
-      k_adjusted = std::max(k / 2, 1);
-      if (btrans == matrix_op_t::NoTranspose) {
-        n_adjusted = std::max(n / 2, 1);
+      DoNothing<int32_t, int32_t> doNothingObj{};
+      memCopy<> outputProcObj(doNothingObj);
+      // A zero point and row offset not required
+      PackAMatrix<uint8_t> packAN(
+          matrix_op_t::NoTranspose, m, k, Aint8.data(), k, nullptr, groups);
+
+      // B zero point defaults to 0
+      PackBMatrix<int8_t> packedBN(
+          btrans,
+          k,
+          n_adjusted,
+          Bint8.data(),
+          (btrans == matrix_op_t::Transpose) ? k : n,
+          nullptr,
+          groups);
+
+      fbgemmPacked(
+          packAN,
+          packedBN,
+          Cint32_fb.data(),
+          Cint32_fb.data(),
+          groups * n,
+          outputProcObj,
+          0,
+          1);
+
+      // correctness check
+      for (int i = 0; i < m; ++i) {
+        for (int j = 0; j < groups * n_adjusted; ++j) {
+          float expected = Cfp32_ref[i * groups * n + j];
+          int32_t actual = Cint32_fb[i * groups * n + j];
+          EXPECT_EQ(expected, actual)
+              << "GEMM results differ at (" << i << ", " << j << "). ref "
ref " + << expected << " FBGemm " << actual; + } } - } - - // A and B have scale 1, so exactly represented after quantization - randFill(Aint8, 0, 255); - randFill(Bint8, -128, 127); - avoidOverflow(m, n, k, Aint8.data(), Bint8.data()); - - for (auto i = 0; i < Bint8.size(); ++i) { - Bint8_ref[i] = Bint8[i]; - } - - for (auto i = 0; i < Aint8.size(); ++i) { - Aint8_ref[i] = Aint8[i]; - } - - int32_t Aint8_zero_point = 55; - int32_t Bint8_zero_point = -17; - - // initialize bias - aligned_vector<int32_t> bias_int32(n); - randFill(bias_int32, -128, 127); - - if (btrans == matrix_op_t::Transpose) { - transpose_matrix(Bint8.data(), k, n); - } - - // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(n_adjusted); - col_offsets_with_zero_pt_s8acc32_ref( - k_adjusted, - n_adjusted, - n, - Bint8_ref.data(), - Bint8_zero_point, - col_offsets.data()); - - matmul_u8i8acc32_ref( - m, - n_adjusted, - k_adjusted, - k, - n, - n, - Aint8.data(), - Bint8_ref.data(), - Cint32_ref.data()); - - vector<int32_t> row_offsets; - row_offsets.resize(m); - - row_offsets_u8acc32_ref( - m, k_adjusted, k, Aint8_ref.data(), row_offsets.data()); - - float C_multiplier = 0.1234; - int32_t C_zero_pt = 5; - - requantize_u8acc32_ref( - m, - n_adjusted, - n, - Cint32_ref.data(), - Cint8_ref.data(), - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - row_offsets.data(), - col_offsets.data(), - bias_int32.data()); - - vector<int32_t> row_offset_buf; - row_offset_buf.resize(PackAWithRowOffset<uint8_t>::rowOffsetBufferSize()); - - PackAWithRowOffset<uint8_t> packAN( - matrix_op_t::NoTranspose, - m, - k_adjusted, - Aint8.data(), - k, - nullptr, - 1, - Aint8_zero_point, - row_offset_buf.data()); - - PackBMatrix<int8_t> packedBN( - btrans, - k_adjusted, - n_adjusted, - Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n, - nullptr, - 1, - Bint8_zero_point); - - DoNothing<> doNothingObj{}; - ReQuantizeOutput<false> outputProcObj( - doNothingObj, - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - bias_int32.data()); - - fbgemmPacked( - packAN, - packedBN, - Cint8_fb.data(), - Cint32_fb.data(), - n, - outputProcObj, - 0, - 1); - - compare_validate_buffers( - Cint8_fb.data(), Cint8_ref.data(), m, n, n, static_cast<uint8_t>(0)); - } + } // for each groups + } // for each shape } diff --git a/test/QuantizationHelpers.cc b/test/QuantizationHelpers.cc index eab08de..cb3edd3 100644 --- a/test/QuantizationHelpers.cc +++ b/test/QuantizationHelpers.cc @@ -17,12 +17,19 @@ namespace fbgemm { * @brief Make sure we won't have overflows from vpmaddubsw instruction. 
diff --git a/test/QuantizationHelpers.cc b/test/QuantizationHelpers.cc
index eab08de..cb3edd3 100644
--- a/test/QuantizationHelpers.cc
+++ b/test/QuantizationHelpers.cc
@@ -17,12 +17,19 @@ namespace fbgemm {
  * @brief Make sure we won't have overflows from vpmaddubsw instruction.
  */
 template <typename T>
-void avoidOverflow(int m, int n, int k, const uint8_t* Aint8, T* B) {
+void avoidOverflow(
+    int m,
+    int n,
+    int k,
+    const uint8_t* Aint8,
+    int lda,
+    T* B,
+    int ldb) {
   for (int i = 0; i < m; ++i) {
     for (int j = 0; j < n; ++j) {
       for (int kk = 0; kk < k / 2 * 2; kk += 2) {
-        int a0 = Aint8[i * k + kk], a1 = Aint8[i * k + kk + 1];
-        int b0 = B[kk * n + j], b1 = B[(kk + 1) * n + j];
+        int a0 = Aint8[i * lda + kk], a1 = Aint8[i * lda + kk + 1];
+        int b0 = B[kk * ldb + j], b1 = B[(kk + 1) * ldb + j];
         int sum_pair = a0 * b0 + a1 * b1;
         if (sum_pair < numeric_limits<int16_t>::lowest()) {
           int b1_adjusted =
@@ -43,13 +50,23 @@ void avoidOverflow(int m, int n, int k, const uint8_t* Aint8, T* B) {
           assert(
               new_sum_pair >= numeric_limits<int16_t>::lowest() &&
               new_sum_pair <= numeric_limits<int16_t>::max());
-          B[(kk + 1) * n + j] = b1_adjusted;
+          B[(kk + 1) * ldb + j] = b1_adjusted;
         }
       }
     } // for each j
   } // for each i
 }

+template <typename T>
+void avoidOverflow(
+    int m,
+    int n,
+    int k,
+    const uint8_t* Aint8,
+    T* B) {
+  return avoidOverflow(m, n, k, Aint8, k, B, n);
+}
+
 template void
 avoidOverflow(int m, int n, int k, const uint8_t* Aint8, int8_t* B);
 template void
diff --git a/test/QuantizationHelpers.h b/test/QuantizationHelpers.h
index 42c3e08..d645a4e 100644
--- a/test/QuantizationHelpers.h
+++ b/test/QuantizationHelpers.h
@@ -13,6 +13,16 @@ namespace fbgemm {
  * @brief Make sure we won't have overflows from vpmaddubsw instruction.
  */
 template <typename T>
+void avoidOverflow(
+    int m,
+    int n,
+    int k,
+    const uint8_t* Aint8,
+    int lda,
+    T* B,
+    int ldb);
+
+template <typename T>
 void avoidOverflow(int m, int n, int k, const uint8_t* Aint8, T* B);
 } // namespace fbgemm
diff --git a/test/TestUtils.cc b/test/TestUtils.cc
index 5cc14ef..3200246 100644
--- a/test/TestUtils.cc
+++ b/test/TestUtils.cc
@@ -79,18 +79,25 @@ template bool
 check_all_zero_entries<uint8_t>(const uint8_t* test, int m, int n);

 template <typename T>
-void transpose_matrix(T* ref, int n, int k) {
-  aligned_vector<T> local(n * k, 0);
-  for (int i = 0; i < n; ++i) {
-    for (int j = 0; j < k; ++j) {
-      local[j * n + i] = ref[i * k + j];
+void transpose_matrix(
+    int M,
+    int N,
+    const T* src,
+    int ld_src,
+    T* dst,
+    int ld_dst) {
+  for (int i = 0; i < N; ++i) {
+    for (int j = 0; j < M; ++j) {
+      dst[i * ld_dst + j] = src[i + j * ld_src];
     }
-  }
-  for (int i = 0; i < k; ++i) {
-    for (int j = 0; j < n; ++j) {
-      ref[i * n + j] = local[i * n + j];
-    }
-  }
+  } // for each output row
+}
+
+template <typename T>
+void transpose_matrix(T* ref, int n, int k) {
+  std::vector<T> local(n * k);
+  transpose_matrix(n, k, ref, k, local.data(), n);
+  memcpy(ref, local.data(), n * k * sizeof(T));
 }

 template void transpose_matrix<float>(float* ref, int n, int k);
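The out-of-place transpose_matrix implemented above takes an M x N source with leading dimension ld_src and writes the N x M transpose into a destination with its own leading dimension; the tests use it to deposit each group's transposed k_per_group x n block into an interleaved B with ld_dst = groups * k_per_group. A small illustration of the semantics (values arbitrary; assumes an int8_t instantiation is available, as the tests require):

    // 2x3 row-major source transposed into the first two columns of a 3x4
    // destination (ld_dst = 4); columns beyond M are left untouched.
    std::int8_t src[2 * 3] = {1, 2, 3,
                              4, 5, 6};
    std::int8_t dst[3 * 4] = {0};
    fbgemm::transpose_matrix(2, 3, src, 3, dst, 4);
    // dst now holds {1, 4, 0, 0,
    //                2, 5, 0, 0,
    //                3, 6, 0, 0}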
diff --git a/test/TestUtils.h b/test/TestUtils.h
index 6cc365f..2cb7b88 100644
--- a/test/TestUtils.h
+++ b/test/TestUtils.h
@@ -32,9 +32,24 @@ bool check_all_zero_entries(const T* test, int m, int n);

 /*
  * @brief In-place transposition for nxk matrix ref.
- * @params n number of rows in output (number of columns in input)
- * @params k number of columns in output (number of rows in input)
+ * @params n number of rows in input (number of columns in output)
+ * @params k number of columns in input (number of rows in output)
  */
 template <typename T>
 void transpose_matrix(T* ref, int n, int k);
+
+/*
+ * @brief Out-of-place transposition for an M*N matrix src.
+ * @params M number of rows in input
+ * @params N number of columns in input
+ */
+template <typename T>
+void transpose_matrix(
+    int M,
+    int N,
+    const T* src,
+    int ld_src,
+    T* dst,
+    int ld_dst);
+
 } // namespace fbgemm
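One last note on compatibility: both helpers keep their legacy signatures as thin wrappers, so existing call sites need no change. For avoidOverflow the old contiguous form is just the strided form with lda = k and ldb = n, and the in-place transpose_matrix(ref, n, k) now forwards to the out-of-place overload through a temporary buffer. Illustrative equivalence:

    // These two calls do the same thing after this change:
    fbgemm::avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
    fbgemm::avoidOverflow(
        m, n, k, Aint8.data(), /*lda=*/k, Bint8.data(), /*ldb=*/n);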