diff options
author | Jongsoo Park <jongsoo@fb.com> | 2018-11-29 06:41:59 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2018-11-29 06:44:08 +0300 |
commit | 027de07a11a0460fd1daffb026d50dba0e56eb79 (patch) | |
tree | 5e0f497059d6a22b18de8508c2031ffdbc9f52d3 /src | |
parent | 90535d3da35f9d3da6a8dbd62da0c68d01696924 (diff) |
sparse convolution output processing (#27)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/27
DoSpmdmOnInpBuffer can't be used together with PackAWithIm2Col because DoSpmdmOnInpBuffer expects an im2col'ed A matrix. This diff implements DoSConvOnInpBuffer, which does sparse convolution directly on the A input without im2col. The performance is well optimized, and we need to see whether this implementation is good enough to get good resnet50 performance.
Reviewed By: dskhudia
Differential Revision: D13192336
fbshipit-source-id: 2076555ba9749e111afbaec408a2bfa0f55bd5bc
Diffstat (limited to 'src')
-rw-r--r-- | src/ExecuteKernelU8S8.cc | 18 | ||||
-rw-r--r-- | src/Fbgemm.cc | 25 | ||||
-rw-r--r-- | src/FbgemmI8Spmdm.cc | 72 |
3 files changed, 112 insertions, 3 deletions
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index f1ec882..152d7f1 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -381,6 +381,24 @@ INSTANTIATE_Q_GRANS(true); #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_BASE +#define INSTANTIATE_BASE(RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithIm2Col<uint8_t, int16_t>, \ + PackBMatrix<int8_t, int16_t>, \ + uint8_t, \ + DoSConvOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>; + +#define INSTANTIATE_Q_GRANS(RELU) \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); + +INSTANTIATE_Q_GRANS(false); +INSTANTIATE_Q_GRANS(true); + +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE + template class ExecuteKernel< PackAWithRowOffset<uint8_t, int16_t>, PackBMatrix<int8_t, int16_t>, diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index a8bf02f..6623fe7 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -376,6 +376,31 @@ INSTANTIATE_Q_GRANS(true); #undef INSTANTIATE_Q_GRANS #undef INSTANTIATE_BASE +#define INSTANTIATE_BASE(RELU, Q_GRAN) \ + template void fbgemmPacked( \ + PackMatrix<PackAWithIm2Col<uint8_t, int16_t>, uint8_t, int16_t>& packA, \ + PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, \ + uint8_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const DoSConvOnInpBuffer< \ + uint8_t, \ + int32_t, \ + ReQuantizeOutput<RELU, Q_GRAN>>& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_Q_GRANS(RELU) \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); + +INSTANTIATE_Q_GRANS(false); +INSTANTIATE_Q_GRANS(true); + +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE + template void fbgemmPacked( PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& 
packA, PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, diff --git a/src/FbgemmI8Spmdm.cc b/src/FbgemmI8Spmdm.cc index 12e1cb2..125b26d 100644 --- a/src/FbgemmI8Spmdm.cc +++ b/src/FbgemmI8Spmdm.cc @@ -21,6 +21,7 @@ double spmdm_transpose_32xN_time = 0.0; double spmdm_compute_time = 0.0; double spmdm_transpose_Nx32_time = 0.0; double spmdm_run_time = 0.0; +double sconv_run_time = 0.0; #endif using namespace std; @@ -222,8 +223,8 @@ void CompressedSparseColumn::SpMDM( t_very_start = std::chrono::high_resolution_clock::now(); #endif - uint8_t A_buffer[K * 32] __attribute__((aligned(64))); - int32_t C_buffer[N * 32] __attribute__((aligned(64))); + alignas(64) uint8_t A_buffer[K * 32]; + alignas(64) int32_t C_buffer[N * 32]; // If we compute C = C + A * B, where B is a sparse matrix in CSC format, for // each non-zero in B, we'd need to access the corresponding column in A. @@ -269,7 +270,7 @@ void CompressedSparseColumn::SpMDM( for (int i1 = block.row_start; i1 < i_end; i1 += 32) { // Transpose 32 x K submatrix of A if (i_end - i1 < 32) { - uint8_t A_temp_buffer[K * 32] __attribute__((aligned(64))); + alignas(64) uint8_t A_temp_buffer[K * 32]; for (int i2 = 0; i2 < (i_end - i1) / 8 * 8; i2 += 8) { transpose_8rows(K, A + (i1 + i2) * lda, lda, A_buffer + i2, 32); } @@ -505,4 +506,69 @@ void CompressedSparseColumn::SpMDM( #endif } +void CompressedSparseColumn::SparseConv( + const conv_param_t<>& conv_p, + const block_type_t& block, + const uint8_t* A, + int32_t A_zero_point, + bool accumulation, + int32_t* C, + int ldc) const { + int K = NumOfRows(); + int N = block.col_size; + + if (K == 0 || N == 0) { + return; + } + +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + std::chrono::time_point<std::chrono::high_resolution_clock> t_start, t_end; + double dt; + t_start = std::chrono::high_resolution_clock::now(); +#endif + + // TODO: if not hyper sparse, transpose a block of A matrix as in SpMDM. 
+ if (!accumulation) { + for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { + for (int j = block.col_start; j < block.col_start + block.col_size; + ++j) { + C[(i - block.row_start) * ldc + j - block.col_start] = 0; + } + } + } + for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { + for (int k = colptr_[j]; k < colptr_[j + 1]; ++k) { + int v = values_[k]; + for (int i = block.row_start; i < block.row_start + block.row_size; + ++i) { + int ow = i % conv_p.OUT_DIM[1]; + int oh = i / conv_p.OUT_DIM[1] % conv_p.OUT_DIM[0]; + int n = i / conv_p.OUT_DIM[1] / conv_p.OUT_DIM[0]; + assert(n < conv_p.MB); + int iw = -conv_p.pad[1] + ow * conv_p.stride[1] + kw_[k]; + int ih = -conv_p.pad[0] + oh * conv_p.stride[0] + kh_[k]; + + if (ih >= 0 && ih < conv_p.IN_DIM[0] && iw >= 0 && + iw < conv_p.IN_DIM[1]) { + C[(i - block.row_start) * ldc + j - block.col_start] += + A[((n * conv_p.IN_DIM[0] + ih) * conv_p.IN_DIM[1] + iw) * + conv_p.IC + + ic_[k]] * + v; + } else { + C[(i - block.row_start) * ldc + j - block.col_start] += + A_zero_point * v; + } + } + } + } // for each column of B + +#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN + t_end = std::chrono::high_resolution_clock::now(); + dt = std::chrono::duration_cast<std::chrono::nanoseconds>(t_end - t_start) + .count(); + sconv_run_time += (dt); +#endif +} + } // namespace fbgemm |