diff options
author | Daya S Khudia <dskhudia@fb.com> | 2018-10-13 01:48:13 +0300 |
---|---|---|
committer | Daya S Khudia <dskhudia@fb.com> | 2018-10-31 00:56:00 +0300 |
commit | e85b5a12254fa47ca6b56236489253a68fd32104 (patch) | |
tree | d62190c53913c65e136fb26dc89bfab38144e2c3 /src/Fbgemm.cc |
Initial commit
Diffstat (limited to 'src/Fbgemm.cc')
-rw-r--r-- | src/Fbgemm.cc | 363 |
1 files changed, 363 insertions, 0 deletions
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc new file mode 100644 index 0000000..f3bac97 --- /dev/null +++ b/src/Fbgemm.cc @@ -0,0 +1,363 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include "fbgemm/Fbgemm.h" +#include <cpuinfo.h> +#include <stdexcept> +#include "ExecuteKernel.h" + +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN +double packing_time = 0.0; +double computing_time = 0.0; +double run_time = 0.0; +#endif + +using namespace fbgemm2; + +namespace fbgemm2 { + +template < + typename packingAMatrix, + typename packingBMatrix, + typename cT, + typename processOutputType> +void fbgemmPacked( + PackMatrix< + packingAMatrix, + typename packingAMatrix::inpType, + typename packingAMatrix::accType>& packA, + PackMatrix< + packingBMatrix, + typename packingBMatrix::inpType, + typename packingBMatrix::accType>& packB, + cT* C, + int32_t* C_buffer, + uint32_t ldc, + const processOutputType& outProcess, + int thread_id, + int /* num_threads */) { + static_assert( + std::is_same< + typename packingAMatrix::accType, + typename packingBMatrix::accType>::value, + "Accumulation type of both matrices should be the same"); + + int MCB, KCB; + + // Run time CPU detection + if (cpuinfo_initialize()) { + if (cpuinfo_has_x86_avx512f()) { + MCB = PackingTraits< + typename packingAMatrix::inpType, + typename packingAMatrix::accType, + inst_set_t::avx512>::MCB; + KCB = PackingTraits< + typename packingAMatrix::inpType, + typename packingAMatrix::accType, + inst_set_t::avx512>::KCB; + } else if (cpuinfo_has_x86_avx2()) { + MCB = PackingTraits< + typename packingAMatrix::inpType, + typename packingAMatrix::accType, + inst_set_t::avx2>::MCB; + KCB = PackingTraits< + typename packingAMatrix::inpType, + typename packingAMatrix::accType, + inst_set_t::avx2>::KCB; + } else { + // TODO: Have default slower path + assert(0 && "unsupported architecture"); + return; + } + } else { + throw std::runtime_error("Failed to initialize cpuinfo!"); + } + + int MDim = packA.numRows(); + int KDim = packB.numRows(); + + int mBlocks = (MDim + MCB - 1) / MCB; + int kBlocks = (KDim + KCB - 1) / KCB; + + // remainders + int _mc = MDim % MCB; + int _kc = KDim % KCB; + + int kc, mc; + + block_type_t blockA{0, 0, 0, 0}; + + // B must be prepacked + assert(packB.isPrePacked() && "B matrix must be prepacked"); + +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + std::chrono::time_point<std::chrono::high_resolution_clock> t_very_start, + t_start, t_end; + double dt; + t_start = std::chrono::high_resolution_clock::now(); + t_very_start = std::chrono::high_resolution_clock::now(); +#endif + + ExecuteKernel<packingAMatrix, packingBMatrix, cT, processOutputType> + exeKernelObj(packA, packB, 0, C, C_buffer, ldc, outProcess); + // ToDo: thread based work division + for (int i = 0; i < mBlocks; ++i) { + mc = (i != mBlocks - 1 || _mc == 0) ? MCB : _mc; + for (int k = 0; k < kBlocks; ++k) { + kc = (k != kBlocks - 1 || _kc == 0) ? KCB : _kc; + // pack A matrix + blockA = {i * MCB, mc, k * KCB, kc}; + + packA.pack(blockA); + +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + t_end = std::chrono::high_resolution_clock::now(); + dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) + .count(); + packing_time += (dt); + t_start = std::chrono::high_resolution_clock::now(); +#endif + + exeKernelObj.execute(k); + +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + t_end = std::chrono::high_resolution_clock::now(); + dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) + .count(); + computing_time += (dt); + t_start = std::chrono::high_resolution_clock::now(); +#endif + } + } + +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + t_end = std::chrono::high_resolution_clock::now(); + dt = + std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_very_start) + .count(); + run_time += (dt); + t_start = std::chrono::high_resolution_clock::now(); +#endif +} + +template void fbgemmPacked( + PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA, + PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, + uint8_t* C, + int32_t* C_buffer, + uint32_t ldc, + const ReQuantizeOutput<false>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA, + PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, + uint8_t* C, + int32_t* C_buffer, + uint32_t ldc, + const ReQuantizeOutput<true>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& + packA, + PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, + float* C, + int32_t* C_buffer, + uint32_t ldc, + const ReQuantizeForFloat<false>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& + packA, + PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, + float* C, + int32_t* C_buffer, + uint32_t ldc, + const ReQuantizeForFloat<true>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAMatrix<uint8_t, int32_t>, uint8_t, int32_t>& packA, + PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, + int32_t* C, + int32_t* C_buffer, + uint32_t ldc, + const memCopy<>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA, + PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, + float* C, + int32_t* C_buffer, + uint32_t ldc, + const ReQuantizeForFloat<false>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA, + PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, + float* C, + int32_t* C_buffer, + uint32_t ldc, + const ReQuantizeForFloat<true>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA, + PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, + int32_t* C, + int32_t* C_buffer, + uint32_t ldc, + const memCopy<>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithIm2Col<uint8_t, int32_t>, uint8_t, int32_t>& packA, + PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, + int32_t* C, + int32_t* C_buffer, + uint32_t ldc, + const memCopy<>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& + packA, + PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, + int32_t* C, + int32_t* C_buffer, + uint32_t ldc, + const memCopy<>& outProcess, + int thread_id, + int num_threads); + +// 16 bit accumulation functions +template void fbgemmPacked( + PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA, + PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, + int32_t* C, + int32_t* C_buffer, + uint32_t ldc, + const memCopy<>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA, + PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, + uint8_t* C, + int32_t* C_buffer, + uint32_t ldc, + const ReQuantizeOutput<false>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA, + PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, + uint8_t* C, + int32_t* C_buffer, + uint32_t ldc, + const DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<false>>& + outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA, + PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, + uint8_t* C, + int32_t* C_buffer, + uint32_t ldc, + const DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<true>>& + outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA, + PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, + float* C, + int32_t* C_buffer, + uint32_t ldc, + const DoSpmdmOnInpBuffer<float, int32_t, ReQuantizeForFloat<false>>& + outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA, + PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, + uint8_t* C, + int32_t* C_buffer, + uint32_t ldc, + const ReQuantizeOutput<false>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA, + PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, + uint8_t* C, + int32_t* C_buffer, + uint32_t ldc, + const ReQuantizeOutput<true>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA, + PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, + int32_t* C, + int32_t* C_buffer, + uint32_t ldc, + const memCopy<>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithIm2Col<uint8_t, int16_t>, uint8_t, int16_t>& packA, + PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, + int32_t* C, + int32_t* C_buffer, + uint32_t ldc, + const memCopy<>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA, + PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, + int32_t* C, + int32_t* C_buffer, + uint32_t ldc, + const DoNothing<int32_t, int32_t>& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA, + PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, + float* C, + int32_t* C_buffer, + uint32_t ldc, + const ReQuantizeForFloat<false>& outProcess, + int thread_id, + int num_threads); + +} // namespace fbgemm2 |