Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
author Daya S Khudia <dskhudia@fb.com> 2018-10-13 01:48:13 +0300
committer Daya S Khudia <dskhudia@fb.com> 2018-10-31 00:56:00 +0300
commit e85b5a12254fa47ca6b56236489253a68fd32104 (patch)
tree d62190c53913c65e136fb26dc89bfab38144e2c3 /src/Fbgemm.cc
Initial commit
Diffstat (limited to 'src/Fbgemm.cc')
-rw-r--r-- src/Fbgemm.cc 363
1 files changed, 363 insertions, 0 deletions
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
new file mode 100644
index 0000000..f3bac97
--- /dev/null
+++ b/src/Fbgemm.cc
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include "fbgemm/Fbgemm.h"
+#include <cpuinfo.h>
+#include <stdexcept>
+#include "ExecuteKernel.h"
+
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+double packing_time = 0.0; // nanoseconds accumulated by fbgemmPacked after each packA.pack() call
+double computing_time = 0.0; // nanoseconds accumulated by fbgemmPacked after each exeKernelObj.execute() call
+double run_time = 0.0; // nanoseconds accumulated by fbgemmPacked over each whole invocation
+#endif
+
+using namespace fbgemm2;
+
+namespace fbgemm2 {
+
+template <
+ typename packingAMatrix,
+ typename packingBMatrix,
+ typename cT,
+ typename processOutputType>
+void fbgemmPacked( // blocked GEMM driver: packs A one MCB x KCB block at a time and executes the kernel on each block
+ PackMatrix<
+ packingAMatrix,
+ typename packingAMatrix::inpType,
+ typename packingAMatrix::accType>& packA, // packed block by block inside the loops below
+ PackMatrix<
+ packingBMatrix,
+ typename packingBMatrix::inpType,
+ typename packingBMatrix::accType>& packB, // asserted below to be prepacked
+ cT* C, // forwarded to ExecuteKernel
+ int32_t* C_buffer, // forwarded to ExecuteKernel
+ uint32_t ldc, // forwarded to ExecuteKernel
+ const processOutputType& outProcess, // forwarded to ExecuteKernel
+ int thread_id, // not read anywhere in this function body
+ int /* num_threads */) {
+ static_assert(
+ std::is_same<
+ typename packingAMatrix::accType,
+ typename packingBMatrix::accType>::value,
+ "Accumulation type of both matrices should be the same");
+
+ int MCB, KCB; // block sizes along M and K; assigned by the CPU detection below
+
+ // Run time CPU detection: take MCB/KCB from PackingTraits for avx512 if available, otherwise avx2
+ if (cpuinfo_initialize()) {
+ if (cpuinfo_has_x86_avx512f()) {
+ MCB = PackingTraits<
+ typename packingAMatrix::inpType,
+ typename packingAMatrix::accType,
+ inst_set_t::avx512>::MCB;
+ KCB = PackingTraits<
+ typename packingAMatrix::inpType,
+ typename packingAMatrix::accType,
+ inst_set_t::avx512>::KCB;
+ } else if (cpuinfo_has_x86_avx2()) {
+ MCB = PackingTraits<
+ typename packingAMatrix::inpType,
+ typename packingAMatrix::accType,
+ inst_set_t::avx2>::MCB;
+ KCB = PackingTraits<
+ typename packingAMatrix::inpType,
+ typename packingAMatrix::accType,
+ inst_set_t::avx2>::KCB;
+ } else {
+ // TODO: Have default slower path; when assert is disabled (NDEBUG) this branch returns without computing anything
+ assert(0 && "unsupported architecture");
+ return;
+ }
+ } else {
+ throw std::runtime_error("Failed to initialize cpuinfo!");
+ }
+
+ int MDim = packA.numRows();
+ int KDim = packB.numRows();
+
+ int mBlocks = (MDim + MCB - 1) / MCB; // ceil(MDim / MCB)
+ int kBlocks = (KDim + KCB - 1) / KCB; // ceil(KDim / KCB)
+
+ // remainders: size of the trailing partial block, 0 when the dimension divides evenly
+ int _mc = MDim % MCB;
+ int _kc = KDim % KCB;
+
+ int kc, mc;
+
+ block_type_t blockA{0, 0, 0, 0};
+
+ // B must be prepacked (checked only when assert is enabled)
+ assert(packB.isPrePacked() && "B matrix must be prepacked");
+
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ std::chrono::time_point<std::chrono::high_resolution_clock> t_very_start,
+ t_start, t_end;
+ double dt; // nanoseconds elapsed in the most recently measured interval
+ t_start = std::chrono::high_resolution_clock::now();
+ t_very_start = std::chrono::high_resolution_clock::now(); // start of the interval added to run_time at the end
+#endif
+
+ ExecuteKernel<packingAMatrix, packingBMatrix, cT, processOutputType>
+ exeKernelObj(packA, packB, 0, C, C_buffer, ldc, outProcess);
+ // ToDo: thread based work division (the loops below visit every block regardless of thread_id)
+ for (int i = 0; i < mBlocks; ++i) {
+ mc = (i != mBlocks - 1 || _mc == 0) ? MCB : _mc; // full MCB except for a trailing partial block
+ for (int k = 0; k < kBlocks; ++k) {
+ kc = (k != kBlocks - 1 || _kc == 0) ? KCB : _kc; // full KCB except for a trailing partial block
+ // pack A matrix; block is presumably {row start, row count, col start, col count} - confirm against block_type_t
+ blockA = {i * MCB, mc, k * KCB, kc};
+
+ packA.pack(blockA);
+
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ t_end = std::chrono::high_resolution_clock::now();
+ dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
+ .count();
+ packing_time += (dt); // time since t_start was last reset, attributed to packing
+ t_start = std::chrono::high_resolution_clock::now();
+#endif
+
+ exeKernelObj.execute(k); // invoke ExecuteKernel for K-block index k
+
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ t_end = std::chrono::high_resolution_clock::now();
+ dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
+ .count();
+ computing_time += (dt); // time since t_start was last reset, attributed to the kernel
+ t_start = std::chrono::high_resolution_clock::now();
+#endif
+ }
+ }
+
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ t_end = std::chrono::high_resolution_clock::now();
+ dt =
+ std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_very_start)
+ .count();
+ run_time += (dt); // whole-invocation time measured from t_very_start
+ t_start = std::chrono::high_resolution_clock::now(); // t_start is not read again before the function returns
+#endif
+}
+
+template void fbgemmPacked( // explicit instantiations with int32_t as the accumulation type (accType) start here
+ PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
+ uint8_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const ReQuantizeOutput<false>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
+ uint8_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const ReQuantizeOutput<true>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
+ packA,
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
+ float* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const ReQuantizeForFloat<false>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
+ packA,
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
+ float* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const ReQuantizeForFloat<true>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAMatrix<uint8_t, int32_t>, uint8_t, int32_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
+ int32_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const memCopy<>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
+ float* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const ReQuantizeForFloat<false>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
+ float* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const ReQuantizeForFloat<true>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
+ int32_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const memCopy<>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithIm2Col<uint8_t, int32_t>, uint8_t, int32_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
+ int32_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const memCopy<>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
+ packA,
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
+ int32_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const memCopy<>& outProcess,
+ int thread_id,
+ int num_threads);
+
+// 16 bit accumulation functions: explicit instantiations where both packA and packB use int16_t as accType
+template void fbgemmPacked(
+ PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ int32_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const memCopy<>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ uint8_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const ReQuantizeOutput<false>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ uint8_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<false>>&
+ outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ uint8_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<true>>&
+ outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ float* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const DoSpmdmOnInpBuffer<float, int32_t, ReQuantizeForFloat<false>>&
+ outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ uint8_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const ReQuantizeOutput<false>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ uint8_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const ReQuantizeOutput<true>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ int32_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const memCopy<>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithIm2Col<uint8_t, int16_t>, uint8_t, int16_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ int32_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const memCopy<>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ int32_t* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const DoNothing<int32_t, int32_t>& outProcess,
+ int thread_id,
+ int num_threads);
+
+template void fbgemmPacked(
+ PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ float* C,
+ int32_t* C_buffer,
+ uint32_t ldc,
+ const ReQuantizeForFloat<false>& outProcess,
+ int thread_id,
+ int num_threads);
+
+} // namespace fbgemm2