Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author: Jongsoo Park <jongsoo@fb.com> 2018-11-29 06:41:59 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2018-11-29 06:44:08 +0300
commit: 027de07a11a0460fd1daffb026d50dba0e56eb79 (patch)
tree: 5e0f497059d6a22b18de8508c2031ffdbc9f52d3 /src
parent: 90535d3da35f9d3da6a8dbd62da0c68d01696924 (diff)
sparse convolution output processing (#27)
Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/27 DoSpmdmOnInpBuffer can't be used together with PackAWithIm2Col because DoSpmdmOnInpBuffer expects an im2col'ed A matrix. This diff implements DoSConvOnInpBuffer, which does sparse convolution directly on the A input without im2col. The performance is well optimized, and we need to see if this implementation is good enough to get good resnet50 performance. Reviewed By: dskhudia Differential Revision: D13192336 fbshipit-source-id: 2076555ba9749e111afbaec408a2bfa0f55bd5bc
Diffstat (limited to 'src')
-rw-r--r-- src/ExecuteKernelU8S8.cc 18
-rw-r--r-- src/Fbgemm.cc 25
-rw-r--r-- src/FbgemmI8Spmdm.cc 72
3 files changed, 112 insertions, 3 deletions
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index f1ec882..152d7f1 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -381,6 +381,24 @@ INSTANTIATE_Q_GRANS(true);
#undef INSTANTIATE_Q_GRANS
#undef INSTANTIATE_BASE
+#define INSTANTIATE_BASE(RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, int16_t>, \
+ PackBMatrix<int8_t, int16_t>, \
+ uint8_t, \
+ DoSConvOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>; // explicit instantiation of ExecuteKernel for PackAWithIm2Col + DoSConvOnInpBuffer
+
+#define INSTANTIATE_Q_GRANS(RELU) \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); // one instantiation per QuantizationGranularity value listed here
+
+INSTANTIATE_Q_GRANS(false); // RELU = false: three instantiations
+INSTANTIATE_Q_GRANS(true); // RELU = true: three instantiations
+
+#undef INSTANTIATE_Q_GRANS // drop the helper macros once this instantiation group is emitted
+#undef INSTANTIATE_BASE
+
template class ExecuteKernel<
PackAWithRowOffset<uint8_t, int16_t>,
PackBMatrix<int8_t, int16_t>,
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
index a8bf02f..6623fe7 100644
--- a/src/Fbgemm.cc
+++ b/src/Fbgemm.cc
@@ -376,6 +376,31 @@ INSTANTIATE_Q_GRANS(true);
#undef INSTANTIATE_Q_GRANS
#undef INSTANTIATE_BASE
+#define INSTANTIATE_BASE(RELU, Q_GRAN) \
+ template void fbgemmPacked( \
+ PackMatrix<PackAWithIm2Col<uint8_t, int16_t>, uint8_t, int16_t>& packA, \
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, \
+ uint8_t* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const DoSConvOnInpBuffer< \
+ uint8_t, \
+ int32_t, \
+ ReQuantizeOutput<RELU, Q_GRAN>>& outProcess, \
+ int thread_id, \
+ int num_threads); // explicit instantiation of fbgemmPacked with a DoSConvOnInpBuffer output processor
+
+#define INSTANTIATE_Q_GRANS(RELU) \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); // one instantiation per QuantizationGranularity value listed here
+
+INSTANTIATE_Q_GRANS(false); // RELU = false: three instantiations
+INSTANTIATE_Q_GRANS(true); // RELU = true: three instantiations
+
+#undef INSTANTIATE_Q_GRANS // drop the helper macros once this instantiation group is emitted
+#undef INSTANTIATE_BASE
+
template void fbgemmPacked(
PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
diff --git a/src/FbgemmI8Spmdm.cc b/src/FbgemmI8Spmdm.cc
index 12e1cb2..125b26d 100644
--- a/src/FbgemmI8Spmdm.cc
+++ b/src/FbgemmI8Spmdm.cc
@@ -21,6 +21,7 @@ double spmdm_transpose_32xN_time = 0.0;
double spmdm_compute_time = 0.0;
double spmdm_transpose_Nx32_time = 0.0;
double spmdm_run_time = 0.0;
+double sconv_run_time = 0.0;
#endif
using namespace std;
@@ -222,8 +223,8 @@ void CompressedSparseColumn::SpMDM(
t_very_start = std::chrono::high_resolution_clock::now();
#endif
- uint8_t A_buffer[K * 32] __attribute__((aligned(64)));
- int32_t C_buffer[N * 32] __attribute__((aligned(64)));
+ alignas(64) uint8_t A_buffer[K * 32];
+ alignas(64) int32_t C_buffer[N * 32];
// If we compute C = C + A * B, where B is a sparse matrix in CSC format, for
// each non-zero in B, we'd need to access the corresponding column in A.
@@ -269,7 +270,7 @@ void CompressedSparseColumn::SpMDM(
for (int i1 = block.row_start; i1 < i_end; i1 += 32) {
// Transpose 32 x K submatrix of A
if (i_end - i1 < 32) {
- uint8_t A_temp_buffer[K * 32] __attribute__((aligned(64)));
+ alignas(64) uint8_t A_temp_buffer[K * 32];
for (int i2 = 0; i2 < (i_end - i1) / 8 * 8; i2 += 8) {
transpose_8rows(K, A + (i1 + i2) * lda, lda, A_buffer + i2, 32);
}
@@ -505,4 +506,69 @@ void CompressedSparseColumn::SpMDM(
#endif
}
+void CompressedSparseColumn::SparseConv( // accumulates, for one output block, products of A entries with the nonzeros stored in this CSC matrix into C
+ const conv_param_t<>& conv_p, // supplies OUT_DIM, IN_DIM, pad, stride, IC and MB used below
+ const block_type_t& block, // row/column range of the output handled by this call
+ const uint8_t* A, // input read directly (no im2col); indexed below as ((n * IN_DIM[0] + ih) * IN_DIM[1] + iw) * IC + channel
+ int32_t A_zero_point, // used in place of an A element when the input position is out of bounds
+ bool accumulation, // when false, the C block is zeroed before accumulating
+ int32_t* C, // output, addressed relative to the block origin
+ int ldc) const { // ldc: distance in elements between consecutive rows of C
+ int K = NumOfRows();
+ int N = block.col_size;
+
+ if (K == 0 || N == 0) { // early out: matrix has no rows or the block has no columns
+ return;
+ }
+
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ std::chrono::time_point<std::chrono::high_resolution_clock> t_start, t_end;
+ double dt;
+ t_start = std::chrono::high_resolution_clock::now();
+#endif
+
+ // TODO: if not hyper sparse, transpose a block of A matrix as in SpMDM.
+ if (!accumulation) { // caller did not ask to add to existing C contents, so clear the block
+ for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+ for (int j = block.col_start; j < block.col_start + block.col_size;
+ ++j) {
+ C[(i - block.row_start) * ldc + j - block.col_start] = 0; // block-relative row and column
+ }
+ }
+ }
+ for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
+ for (int k = colptr_[j]; k < colptr_[j + 1]; ++k) { // k ranges over the entries colptr_ assigns to column j
+ int v = values_[k];
+ for (int i = block.row_start; i < block.row_start + block.row_size;
+ ++i) {
+ int ow = i % conv_p.OUT_DIM[1]; // decode i as (n * OUT_DIM[0] + oh) * OUT_DIM[1] + ow
+ int oh = i / conv_p.OUT_DIM[1] % conv_p.OUT_DIM[0];
+ int n = i / conv_p.OUT_DIM[1] / conv_p.OUT_DIM[0];
+ assert(n < conv_p.MB);
+ int iw = -conv_p.pad[1] + ow * conv_p.stride[1] + kw_[k]; // kw_[k] presumably the kernel column offset of entry k - TODO confirm
+ int ih = -conv_p.pad[0] + oh * conv_p.stride[0] + kh_[k]; // kh_[k] presumably the kernel row offset of entry k - TODO confirm
+
+ if (ih >= 0 && ih < conv_p.IN_DIM[0] && iw >= 0 &&
+ iw < conv_p.IN_DIM[1]) { // input position lies inside the input: read A
+ C[(i - block.row_start) * ldc + j - block.col_start] +=
+ A[((n * conv_p.IN_DIM[0] + ih) * conv_p.IN_DIM[1] + iw) *
+ conv_p.IC +
+ ic_[k]] * // ic_[k] selects the element within the IC-sized innermost dimension
+ v;
+ } else { // input position falls outside the input: use A_zero_point instead of reading A
+ C[(i - block.row_start) * ldc + j - block.col_start] +=
+ A_zero_point * v;
+ }
+ }
+ }
+ } // for each column of B
+
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+ t_end = std::chrono::high_resolution_clock::now();
+ dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start)
+ .count();
+ sconv_run_time += (dt); // add this call's elapsed nanoseconds to the global counter
+#endif
+}
+
} // namespace fbgemm