Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'src/FbgemmConv.cc')
-rw-r--r--src/FbgemmConv.cc247
1 file changed, 200 insertions(+), 47 deletions(-)
diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc
index 5db63f6..de833d2 100644
--- a/src/FbgemmConv.cc
+++ b/src/FbgemmConv.cc
@@ -6,8 +6,9 @@
*/
#include <algorithm>
-#include <iostream>
+#include <numeric>
#include <vector>
+#include <functional>
#include "fbgemm/Fbgemm.h"
namespace fbgemm {
@@ -33,12 +34,24 @@ bool takeDepthWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
});
}
+template <int SPATIAL_DIM>
+bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
+ return std::accumulate(conv_p.K.begin(), conv_p.K.end(), 0) == SPATIAL_DIM &&
+ std::accumulate(conv_p.stride.begin(), conv_p.stride.end(), 0) ==
+ SPATIAL_DIM &&
+ std::accumulate(conv_p.dilation.begin(), conv_p.dilation.end(), 0) ==
+ SPATIAL_DIM &&
+ std::accumulate(conv_p.pad.begin(), conv_p.pad.end(), 0) == 0;
+}
+
template <int SPATIAL_DIM, typename ACC_T>
optimized_conv_t ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p) {
if (takeDepthWiseFastPath<SPATIAL_DIM, ACC_T>(conv_p)) {
return optimized_conv_t::depthwise;
} else if (fbgemmOptimizedGConv<SPATIAL_DIM>(conv_p)) {
return optimized_conv_t::groupwise;
+ } else if (takePointWiseFastPath<SPATIAL_DIM>(conv_p)) {
+ return optimized_conv_t::pointwise;
} else {
return optimized_conv_t::im2col;
}
@@ -58,58 +71,139 @@ int fbgemmConv(
static_assert(
SPATIAL_DIM == 2 || SPATIAL_DIM == 3,
"Only 2D and 3D convolutions are supported");
+
+ if (!packed_weights.isPackingCompliant(conv_p)) {
+ std::string msg =
+ "[FBGEMM_CONV_ERROR] Convolution parameters "
+ "mismatch between pre-packed weights and conv invocation! ";
+ msg += packed_weights.mismatchingParams(conv_p);
+ msg += std::string(
+ " Please pack weights using the same parameters "
+ "with which convolution operation is invoked!");
+ throw std::logic_error(msg);
+ }
+
switch (ConvFastPath<SPATIAL_DIM, ACC_T>(conv_p)) {
case optimized_conv_t::depthwise: {
// 2D and 3D depthwise fast path
// std::cout << "Depthwise fast path" << std::endl;
const std::int32_t* B_zero_point = outProcess.getBZeroPoint();
const float* C_multiplier = outProcess.getCMultiplier();
+ const float* act_times_w_scale = outProcess.getActWScale();
if (SPATIAL_DIM == 3) {
static_assert(
std::is_same<typename processOutputType::outType, std::uint8_t>::
value,
"For depthwise, only requantized output is supported");
- depthwise_3x3x3_pad_1(
- conv_p.MB, // mini batch
- conv_p.IN_DIM[0], // T
- conv_p.IN_DIM[1], // H
- conv_p.IN_DIM[2], // W
- conv_p.OC, // output channels
- conv_p.stride[0], // stride_t
- conv_p.stride[1], // stride_h
- conv_p.stride[2], // stride_w
- outProcess.getAZeroPoint(),
- activations,
- B_zero_point[0],
- *(packed_weights.getPackedWFor3DDW()),
- C_multiplier[0],
- outProcess.getCZeroPoint(),
- out,
- outProcess.getColOffsets(),
- outProcess.getBias(),
- outProcess.RELU_FUSED, // fuse_relu
- thread_id,
- num_threads);
+
+ if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) {
+ depthwise_3x3x3_pad_1(
+ conv_p.MB, // mini batch
+ conv_p.IN_DIM[0], // T
+ conv_p.IN_DIM[1], // H
+ conv_p.IN_DIM[2], // W
+ conv_p.OC, // output channels
+ conv_p.stride[0], // stride_t
+ conv_p.stride[1], // stride_h
+ conv_p.stride[2], // stride_w
+ outProcess.getAZeroPoint(),
+ activations,
+ B_zero_point[0],
+ *(packed_weights.getPackedWForDepthwise()),
+ C_multiplier[0],
+ outProcess.getCZeroPoint(),
+ out,
+ outProcess.getColOffsets(),
+ outProcess.getBias(),
+ outProcess.RELU_FUSED, // fuse_relu
+ act_times_w_scale ? act_times_w_scale[0] : 1.0f,
+ thread_id,
+ num_threads);
+ } else if (
+ processOutputType::QGRANType ==
+ QuantizationGranularity::OUT_CHANNEL ||
+ processOutputType::QGRANType == QuantizationGranularity::GROUP) {
+ depthwise_3x3x3_per_channel_quantization_pad_1(
+ conv_p.MB, // mini batch
+ conv_p.IN_DIM[0], // T
+ conv_p.IN_DIM[1], // H
+ conv_p.IN_DIM[2], // W
+ conv_p.OC, // output channels
+ conv_p.stride[0], // stride_t
+ conv_p.stride[1], // stride_h
+ conv_p.stride[2], // stride_w
+ outProcess.getAZeroPoint(),
+ activations,
+ B_zero_point,
+ *(packed_weights.getPackedWForDepthwise()),
+ C_multiplier,
+ outProcess.getCZeroPoint(),
+ out,
+ outProcess.getColOffsets(),
+ outProcess.getBias(),
+ outProcess.RELU_FUSED, // fuse_relu
+ outProcess.getActWScale(), // act_scale * weight_scale
+ thread_id,
+ num_threads);
+ } else {
+ std::string msg =
+ "[FBGEMM_CONV_ERROR] This quantization granularity is "
+ "not supported";
+ throw std::runtime_error(msg);
+ }
} else {
- depthwise_3x3_pad_1(
- conv_p.MB, // mini batch
- conv_p.IN_DIM[0], // H
- conv_p.IN_DIM[1], // W
- conv_p.OC, // output channels
- conv_p.stride[0], // stride_h
- conv_p.stride[1], // stride_w
- outProcess.getAZeroPoint(),
- activations,
- B_zero_point[0],
- *(packed_weights.getPackedWFor2DDW()),
- C_multiplier[0],
- outProcess.getCZeroPoint(),
- out,
- outProcess.getColOffsets(),
- outProcess.getBias(),
- outProcess.RELU_FUSED, // fuse_relu
- thread_id,
- num_threads);
+ if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) {
+ depthwise_3x3_pad_1(
+ conv_p.MB, // mini batch
+ conv_p.IN_DIM[0], // H
+ conv_p.IN_DIM[1], // W
+ conv_p.OC, // output channels
+ conv_p.stride[0], // stride_h
+ conv_p.stride[1], // stride_w
+ outProcess.getAZeroPoint(),
+ activations,
+ B_zero_point[0],
+ *(packed_weights.getPackedWForDepthwise()),
+ C_multiplier[0],
+ outProcess.getCZeroPoint(),
+ out,
+ outProcess.getColOffsets(),
+ outProcess.getBias(),
+ outProcess.RELU_FUSED, // fuse_relu
+ act_times_w_scale ? act_times_w_scale[0] : 1.0f,
+ thread_id,
+ num_threads);
+ } else if (
+ processOutputType::QGRANType ==
+ QuantizationGranularity::OUT_CHANNEL ||
+ processOutputType::QGRANType == QuantizationGranularity::GROUP) {
+ // The number of channels == groups for depthwise convolutions
+ depthwise_3x3_per_channel_quantization_pad_1(
+ conv_p.MB, // mini batch
+ conv_p.IN_DIM[0], // H
+ conv_p.IN_DIM[1], // W
+ conv_p.OC, // output channels
+ conv_p.stride[0], // stride_h
+ conv_p.stride[1], // stride_w
+ outProcess.getAZeroPoint(),
+ activations,
+ B_zero_point,
+ *(packed_weights.getPackedWForDepthwise()),
+ C_multiplier,
+ outProcess.getCZeroPoint(),
+ out,
+ outProcess.getColOffsets(),
+ outProcess.getBias(),
+ outProcess.RELU_FUSED, // fuse_relu
+ outProcess.getActWScale(), // act_scale * weight_scale
+ thread_id,
+ num_threads);
+ } else {
+ std::string msg =
+ "[FBGEMM_CONV_ERROR] This quantization granularity is "
+ "not supported";
+ throw std::runtime_error(msg);
+ }
}
break;
}
@@ -134,14 +228,68 @@ int fbgemmConv(
num_threads);
break;
}
+ case optimized_conv_t::pointwise: {
+ std::vector<int32_t> row_offset_buf(
+ PackAWithRowOffset<uint8_t>::rowOffsetBufferSize(blocking_params));
+ int image_dim = std::accumulate(
+ conv_p.IN_DIM.begin(),
+ conv_p.IN_DIM.end(),
+ 1,
+ std::multiplies<int>());
+ PackAWithRowOffset<uint8_t, ACC_T> packA(
+ matrix_op_t::NoTranspose,
+ conv_p.MB * image_dim,
+ conv_p.IC,
+ activations,
+ conv_p.IC,
+ nullptr,
+ conv_p.G,
+ row_offset_buf.data(),
+ blocking_params);
+
+ outProcess.setRowOffsets(row_offset_buf.data());
+ fbgemmPacked(
+ packA,
+ *(packed_weights.getPackedWForPointwise()),
+ out,
+ outBuffer,
+ conv_p.OC,
+ outProcess,
+ thread_id,
+ num_threads,
+ blocking_params);
+ break;
+ }
case optimized_conv_t::im2col: {
// All other convolutions go through im2col-based implementation
// std::cout << "Im2col path" << std::endl;
std::vector<int32_t> row_offset_buf(
- PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>::rowOffsetBufferSize());
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>::rowOffsetBufferSize(
+ blocking_params));
const std::int32_t* b_zero_point = outProcess.getBZeroPoint();
- bool b_symmetric = b_zero_point[0] == 0;
+ bool b_symmetric = false;
+ if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) {
+ b_symmetric = b_zero_point[0] == 0;
+ } else if (
+ processOutputType::QGRANType == QuantizationGranularity::GROUP) {
+ b_symmetric =
+ std::all_of(b_zero_point, b_zero_point + conv_p.G, [](int i) {
+ return i == 0;
+ });
+ } else if (
+ processOutputType::QGRANType ==
+ QuantizationGranularity::OUT_CHANNEL) {
+ b_symmetric =
+ std::all_of(b_zero_point, b_zero_point + conv_p.OC, [](int i) {
+ return i == 0;
+ });
+ } else {
+ std::string msg =
+ "[FBGEMM_CONV_ERROR] This quantization granularity is "
+ "not supported";
+ throw std::runtime_error(msg);
+ }
PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM> packA(
conv_p,
activations,
@@ -169,21 +317,25 @@ int fbgemmConv(
return 0;
}
-#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM) \
+#define INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, BIAS_TYPE) \
template int fbgemmConv( \
const conv_param_t<SPATIAL_DIM>& conv_p, \
const std::uint8_t* activations, \
PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights, \
std::uint8_t* out, \
std::int32_t* outBuffer, \
- ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \
+ ReQuantizeOutput<RELU, Q_GRAN, BIAS_TYPE>& outProcess, \
int thread_id, \
int num_threads, \
const BlockingFactors* blocking_params);
+#define INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, SPATIAL_DIM) \
+ INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, float); \
+ INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, SPATIAL_DIM, int32_t);
+
#define INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, RELU) \
- INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, 2); \
- INSTANTIATE_BASE(ACC_T, Q_GRAN, RELU, 3);
+ INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 2); \
+ INSTANTIATE_BIAS_T(ACC_T, Q_GRAN, RELU, 3);
#define INSTANTIATE_RELU(ACC_T, Q_GRAN) \
INSTANTIATE_SPATIAL_DIM(ACC_T, Q_GRAN, true); \
@@ -199,6 +351,7 @@ INSTANTIATE_Q_GRANS(std::int32_t);
#undef INSTANTIATE_Q_GRANS
#undef INSTANTIATE_RELU
#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_BIAS_T
#undef INSTANTIATE_BASE
template bool takeDepthWiseFastPath<2, std::int32_t>(