github.com/marian-nmt/FBGEMM.git
author    Daya Khudia <dskhudia@fb.com>  2019-08-21 02:52:53 +0300
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>  2019-08-21 02:58:08 +0300
commit    280fa17349b763eb474c423a6d1172f81df29103 (patch)
tree      061974b09a412e0025c11a456451c81efe7e4ab0
parent    a6d1d3eed7ba858d4532fc297b7a4ee984e6e7e3 (diff)
Per channel support in fbgemmConv (#119)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/119

Some paths in fbgemmConv were missing support for per-channel quantization. This diff adds per-channel as well as groupwise quantization support to those paths.

Reviewed By: jianyuh

Differential Revision: D16894740

fbshipit-source-id: 43a2c08d1c8d1b01775f875224774c39fae280bc
-rw-r--r--  include/fbgemm/Fbgemm.h    1
-rw-r--r--  src/FbgemmConv.cc        171
-rw-r--r--  test/UniConvTest.cc      269
3 files changed, 398 insertions, 43 deletions
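
The heart of this change is dispatch on quantization granularity. As an illustrative aside (not part of the diff): the three granularities differ only in how many output-channel columns share one (multiplier, zero point) pair. A minimal sketch of that mapping, mirroring the ncols_per_quant_group logic in the new test below:

#include <cstdio>

enum class QuantizationGranularity { TENSOR, GROUP, OUT_CHANNEL };

// How many output-channel columns share one (multiplier, zero point) pair
// for a convolution with G groups and OC_per_G output channels per group.
int colsPerQuantGroup(QuantizationGranularity qgran, int G, int OC_per_G) {
  switch (qgran) {
    case QuantizationGranularity::TENSOR:
      return G * OC_per_G; // one quantization group for the whole tensor
    case QuantizationGranularity::GROUP:
      return OC_per_G; // one quantization group per convolution group
    case QuantizationGranularity::OUT_CHANNEL:
      return 1; // one quantization group per output channel
  }
  return G * OC_per_G; // unreachable; silences -Wreturn-type
}

int main() {
  // e.g. 8 groups with 4 output channels each (32 channels total)
  std::printf(
      "%d\n", colsPerQuantGroup(QuantizationGranularity::GROUP, 8, 4)); // 4
}
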
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h
index 543f1cb..0b7bf1f 100644
--- a/include/fbgemm/Fbgemm.h
+++ b/include/fbgemm/Fbgemm.h
@@ -1134,6 +1134,7 @@ template <
class FBGEMM_API ReQuantizeOutput {
public:
static constexpr int RELU_FUSED = FUSE_RELU;
+ static constexpr QuantizationGranularity QGRANType = Q_GRAN;
using outType = outT;
using inpType = inT;
/**
diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc
index 33d1535..164411d 100644
--- a/src/FbgemmConv.cc
+++ b/src/FbgemmConv.cc
@@ -94,47 +94,111 @@ int fbgemmConv(
std::is_same<typename processOutputType::outType, std::uint8_t>::
value,
"For depthwise, only requantized output is supported");
- depthwise_3x3x3_pad_1(
- conv_p.MB, // mini batch
- conv_p.IN_DIM[0], // T
- conv_p.IN_DIM[1], // H
- conv_p.IN_DIM[2], // W
- conv_p.OC, // output channels
- conv_p.stride[0], // stride_t
- conv_p.stride[1], // stride_h
- conv_p.stride[2], // stride_w
- outProcess.getAZeroPoint(),
- activations,
- B_zero_point[0],
- *(packed_weights.getPackedWFor3DDW()),
- C_multiplier[0],
- outProcess.getCZeroPoint(),
- out,
- outProcess.getColOffsets(),
- outProcess.getBias(),
- outProcess.RELU_FUSED, // fuse_relu
- thread_id,
- num_threads);
+
+ if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) {
+ depthwise_3x3x3_pad_1(
+ conv_p.MB, // mini batch
+ conv_p.IN_DIM[0], // T
+ conv_p.IN_DIM[1], // H
+ conv_p.IN_DIM[2], // W
+ conv_p.OC, // output channels
+ conv_p.stride[0], // stride_t
+ conv_p.stride[1], // stride_h
+ conv_p.stride[2], // stride_w
+ outProcess.getAZeroPoint(),
+ activations,
+ B_zero_point[0],
+ *(packed_weights.getPackedWFor3DDW()),
+ C_multiplier[0],
+ outProcess.getCZeroPoint(),
+ out,
+ outProcess.getColOffsets(),
+ outProcess.getBias(),
+ outProcess.RELU_FUSED, // fuse_relu
+ thread_id,
+ num_threads);
+ } else if (
+ processOutputType::QGRANType ==
+ QuantizationGranularity::OUT_CHANNEL ||
+ processOutputType::QGRANType == QuantizationGranularity::GROUP) {
+ depthwise_3x3x3_per_channel_quantization_pad_1(
+ conv_p.MB, // mini batch
+ conv_p.IN_DIM[0], // T
+ conv_p.IN_DIM[1], // H
+ conv_p.IN_DIM[2], // W
+ conv_p.OC, // output channels
+ conv_p.stride[0], // stride_t
+ conv_p.stride[1], // stride_h
+ conv_p.stride[2], // stride_w
+ outProcess.getAZeroPoint(),
+ activations,
+ B_zero_point,
+ *(packed_weights.getPackedWFor3DDW()),
+ C_multiplier,
+ outProcess.getCZeroPoint(),
+ out,
+ outProcess.getColOffsets(),
+ outProcess.getBias(),
+ outProcess.RELU_FUSED, // fuse_relu
+ thread_id,
+ num_threads);
+ } else {
+ std::string msg =
+ "[FBGEMM_CONV_ERROR] This quantization granularity is "
+ "not supported";
+ throw std::runtime_error(msg);
+ }
} else {
- depthwise_3x3_pad_1(
- conv_p.MB, // mini batch
- conv_p.IN_DIM[0], // H
- conv_p.IN_DIM[1], // W
- conv_p.OC, // output channels
- conv_p.stride[0], // stride_h
- conv_p.stride[1], // stride_w
- outProcess.getAZeroPoint(),
- activations,
- B_zero_point[0],
- *(packed_weights.getPackedWFor2DDW()),
- C_multiplier[0],
- outProcess.getCZeroPoint(),
- out,
- outProcess.getColOffsets(),
- outProcess.getBias(),
- outProcess.RELU_FUSED, // fuse_relu
- thread_id,
- num_threads);
+ if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) {
+ depthwise_3x3_pad_1(
+ conv_p.MB, // mini batch
+ conv_p.IN_DIM[0], // H
+ conv_p.IN_DIM[1], // W
+ conv_p.OC, // output channels
+ conv_p.stride[0], // stride_h
+ conv_p.stride[1], // stride_w
+ outProcess.getAZeroPoint(),
+ activations,
+ B_zero_point[0],
+ *(packed_weights.getPackedWFor2DDW()),
+ C_multiplier[0],
+ outProcess.getCZeroPoint(),
+ out,
+ outProcess.getColOffsets(),
+ outProcess.getBias(),
+ outProcess.RELU_FUSED, // fuse_relu
+ thread_id,
+ num_threads);
+ } else if (
+ processOutputType::QGRANType ==
+ QuantizationGranularity::OUT_CHANNEL ||
+ processOutputType::QGRANType == QuantizationGranularity::GROUP) {
+ // The number of channels == groups for depthwise convolutions
+ depthwise_3x3_per_channel_quantization_pad_1(
+ conv_p.MB, // mini batch
+ conv_p.IN_DIM[0], // H
+ conv_p.IN_DIM[1], // W
+ conv_p.OC, // output channels
+ conv_p.stride[0], // stride_h
+ conv_p.stride[1], // stride_w
+ outProcess.getAZeroPoint(),
+ activations,
+ B_zero_point,
+ *(packed_weights.getPackedWFor2DDW()),
+ C_multiplier,
+ outProcess.getCZeroPoint(),
+ out,
+ outProcess.getColOffsets(),
+ outProcess.getBias(),
+ outProcess.RELU_FUSED, // fuse_relu
+ thread_id,
+ num_threads);
+ } else {
+ std::string msg =
+ "[FBGEMM_CONV_ERROR] This quantization granularity is "
+ "not supported";
+ throw std::runtime_error(msg);
+ }
}
break;
}
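
The branching above is driven by the one-line addition to Fbgemm.h: ReQuantizeOutput now exposes its granularity as the static constexpr member QGRANType, so fbgemmConv can select the tensor-granularity or per-channel depthwise kernel from the output-processing type alone. A minimal sketch of that pattern, with stand-in types rather than the real FBGEMM ones:

#include <iostream>

enum class QuantizationGranularity { TENSOR, GROUP, OUT_CHANNEL };

template <QuantizationGranularity Q_GRAN>
struct ReQuantizeOutputSketch {
  static constexpr QuantizationGranularity QGRANType = Q_GRAN;
};

template <typename processOutputType>
void dispatchDepthwise(const processOutputType& /*outProcess*/) {
  // QGRANType is a compile-time constant of the type, so no extra runtime
  // state is needed to decide which kernel to call.
  if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) {
    std::cout << "tensor kernel: scalar zero point and multiplier\n";
  } else {
    std::cout << "per-channel kernel: arrays of zero points and multipliers\n";
  }
}

int main() {
  dispatchDepthwise(ReQuantizeOutputSketch<QuantizationGranularity::GROUP>{});
}

Because QGRANType is a compile-time constant, the untaken branch is trivially dead code for any given instantiation, even though a plain if is used.
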
@@ -195,11 +259,32 @@ int fbgemmConv(
// All other convolutions go through im2col-based implementation
// std::cout << "Im2col path" << std::endl;
std::vector<int32_t> row_offset_buf(
- PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>
- ::rowOffsetBufferSize(blocking_params));
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>::rowOffsetBufferSize(
+ blocking_params));
const std::int32_t* b_zero_point = outProcess.getBZeroPoint();
- bool b_symmetric = b_zero_point[0] == 0;
+ bool b_symmetric = false;
+ if (processOutputType::QGRANType == QuantizationGranularity::TENSOR) {
+ b_symmetric = b_zero_point[0] == 0;
+ } else if (
+ processOutputType::QGRANType == QuantizationGranularity::GROUP) {
+ b_symmetric =
+ std::all_of(b_zero_point, b_zero_point + conv_p.G, [](int i) {
+ return i == 0;
+ });
+ } else if (
+ processOutputType::QGRANType ==
+ QuantizationGranularity::OUT_CHANNEL) {
+ b_symmetric =
+ std::all_of(b_zero_point, b_zero_point + conv_p.OC, [](int i) {
+ return i == 0;
+ });
+ } else {
+ std::string msg =
+ "[FBGEMM_CONV_ERROR] This quantization granularity is "
+ "not supported";
+ throw std::runtime_error(msg);
+ }
PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM> packA(
conv_p,
activations,
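
The b_symmetric computation above generalizes the old scalar check (b_zero_point[0] == 0): how many zero points must be inspected now depends on the granularity. The same check condensed into one hypothetical helper (not FBGEMM API):

#include <algorithm>
#include <cstdint>

enum class QuantizationGranularity { TENSOR, GROUP, OUT_CHANNEL };

// B is "symmetric" only if every zero point in play is zero; the
// granularity determines how many zero points there are.
bool isBSymmetric(
    QuantizationGranularity qgran,
    const std::int32_t* b_zero_point,
    int G,
    int OC) {
  int n = 1; // TENSOR: a single zero point
  if (qgran == QuantizationGranularity::GROUP) {
    n = G; // one per group
  } else if (qgran == QuantizationGranularity::OUT_CHANNEL) {
    n = OC; // one per output channel
  }
  return std::all_of(
      b_zero_point, b_zero_point + n, [](std::int32_t z) { return z == 0; });
}
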
diff --git a/test/UniConvTest.cc b/test/UniConvTest.cc
index 91bf578..ce26442 100644
--- a/test/UniConvTest.cc
+++ b/test/UniConvTest.cc
@@ -21,6 +21,55 @@
using namespace std;
using namespace fbgemm;
+vector<QuantizationGranularity> qGranularityVals{
+ QuantizationGranularity::TENSOR,
+ QuantizationGranularity::GROUP,
+ QuantizationGranularity::OUT_CHANNEL};
+
+static vector<conv_param_t<>> GetShapes_() {
+ vector<conv_param_t<>> shapes = {
+ // MB, IC, OC, {IH, IW}, G, {KH, KW}, {stride_h, stride_w}, {pad_t, pad_l,
+ // pad_b, pad_r}
+ // Regular
+ conv_param_t<>(1, 16, 16, {10, 30}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 32, 32, {10, 30}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 16, 32, {30, 10}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {1, 1}, {2, 1, 2, 1}),
+ conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {1, 1}, {1, 2, 1, 2}),
+ conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {1, 2}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 32, 16, {10, 30}, 1, {3, 3}, {2, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 16, 16, {10, 30}, 1, {3, 5}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 16, 16, {10, 30}, 1, {5, 3}, {1, 1}, {1, 1, 1, 1}),
+ // groupwise
+ conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 32, 16, {10, 30}, 8, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 16, 32, {10, 30}, 8, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 3}, {2, 2}, {2, 1, 2, 1}),
+ conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 3}, {1, 2}, {2, 1, 2, 1}),
+ conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 3}, {2, 1}, {2, 1, 2, 1}),
+ conv_param_t<>(1, 32, 32, {10, 30}, 8, {3, 5}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 32, 32, {10, 30}, 8, {5, 3}, {1, 1}, {1, 1, 1, 1}),
+ // DW
+ conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 3}, {1, 1}, {1, 2, 1, 2}),
+ conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 3}, {2, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 3}, {1, 2}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 32, 32, {10, 30}, 32, {3, 5}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 32, 32, {10, 30}, 32, {5, 3}, {1, 1}, {1, 1, 1, 1}),
+ // Pointwise
+ conv_param_t<>(1, 32, 32, {10, 30}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}),
+ conv_param_t<>(1, 16, 32, {10, 30}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}),
+ conv_param_t<>(1, 32, 16, {10, 30}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}),
+ conv_param_t<>(1, 32, 16, {10, 30}, 1, {1, 1}, {2, 2}, {0, 0, 0, 0}),
+ conv_param_t<>(1, 32, 16, {10, 30}, 1, {1, 1}, {1, 2}, {0, 0, 0, 0}),
+ conv_param_t<>(1, 32, 16, {10, 30}, 1, {1, 1}, {2, 1}, {0, 0, 0, 0}),
+ };
+ return shapes;
+}
+
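An aside on the shape list: conv_param_t presumably derives OUT_DIM from input size, padding, kernel, and stride with the usual convolution output arithmetic. The formula, stated explicitly (an assumption about conv_param_t internals, not FBGEMM source):

#include <cstdio>

// Output extent along one spatial dimension.
int outDim(int in_dim, int pad_lo, int pad_hi, int k, int stride) {
  return (in_dim + pad_lo + pad_hi - k) / stride + 1;
}

int main() {
  // First "Regular" shape above: 10x30 input, 3x3 kernel, stride 1, pad 1
  std::printf("OH=%d OW=%d\n", outDim(10, 1, 1, 3, 1), outDim(30, 1, 1, 3, 1));
  // prints OH=10 OW=30: a same-size output
}
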
namespace {
// tuple represents MB, IC, OC, IT, IH, IW, KH/KW, stride, pad
@@ -28,8 +77,12 @@ class uniConvTest
: public testing::TestWithParam<
tuple<int, int, int, int, int, int, int, int, int, int>> {};
+class UniConvQGranTest : public testing::TestWithParam<
+ tuple<QuantizationGranularity, bool, bool>> {};
+
} // namespace
+// Combine only allows at most 10 generators.
INSTANTIATE_TEST_CASE_P(
InstantiationName,
uniConvTest,
@@ -45,6 +98,13 @@ INSTANTIATE_TEST_CASE_P(
::testing::ValuesIn({1, 2}), // stride
::testing::ValuesIn({0, 1, 2}))); // pad
+INSTANTIATE_TEST_CASE_P(
+ InstantiationName,
+ UniConvQGranTest,
+ ::testing::Combine(
+ ::testing::ValuesIn(qGranularityVals),
+ ::testing::Bool(), // A symmetric
+ ::testing::Bool())); // B symmetric
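
This is the standard googletest cross-product parameterization: Combine() crosses the three granularities with the two boolean axes, so the requantizeTest below runs for 3 * 2 * 2 = 12 combinations. A self-contained toy version of the same wiring:

#include <gtest/gtest.h>
#include <tuple>

enum class QuantizationGranularity { TENSOR, GROUP, OUT_CHANNEL };

class QGranSketchTest
    : public ::testing::TestWithParam<
          std::tuple<QuantizationGranularity, bool, bool>> {};

TEST_P(QGranSketchTest, unpacksParams) {
  QuantizationGranularity qgran;
  bool a_symmetric, b_symmetric;
  std::tie(qgran, a_symmetric, b_symmetric) = GetParam();
  SUCCEED(); // a real body would exercise the kernel for this combination
}

INSTANTIATE_TEST_CASE_P(
    SketchInstantiation,
    QGranSketchTest,
    ::testing::Combine(
        ::testing::Values(
            QuantizationGranularity::TENSOR,
            QuantizationGranularity::GROUP,
            QuantizationGranularity::OUT_CHANNEL),
        ::testing::Bool(), // A symmetric
        ::testing::Bool())); // B symmetric
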
/**
* Test for conv packing
*/
@@ -323,3 +383,212 @@ TEST(uniConvTest, cornerCases) {
0,
1);
}
+
+/**
+ * @brief Unit test for uint8 activations, int8 weights, and 32-bit
+ * accumulation. Output processing: requantization -> nothing
+ */
+TEST_P(UniConvQGranTest, requantizeTest) {
+ vector<conv_param_t<>> shapes(GetShapes_());
+ QuantizationGranularity q_granularity;
+ bool a_symmetric, b_symmetric;
+ tie(q_granularity, a_symmetric, b_symmetric) = GetParam();
+
+ for (auto conv_p : shapes) {
+ int R = conv_p.K[0];
+ int S = conv_p.K[1];
+ int G = conv_p.G;
+ int OC = conv_p.OC;
+ int OH = conv_p.OUT_DIM[0];
+ int OW = conv_p.OUT_DIM[1];
+ int IC_per_G = conv_p.IC / conv_p.G;
+ int OC_per_G = conv_p.OC / conv_p.G;
+
+ // activations
+ aligned_vector<uint8_t> Aint8(
+ conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0);
+
+ // weights
+ // The weight matrix is in layout G K/G (R S C/G)
+ aligned_vector<int8_t> Bint8(R * S * conv_p.G * IC_per_G * OC_per_G, 0);
+ aligned_vector<int8_t> Bint8_tr(R * S * G * IC_per_G * OC_per_G, 0);
+
+ aligned_vector<int32_t> Cint32_ref(conv_p.MB * OH * OW * OC, 0);
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
+
+ randFill<uint8_t>(Aint8, 0, 5);
+ int32_t Aint8_zero_point = a_symmetric ? 0 : 4;
+
+ randFill<int8_t>(Bint8, -4, 4);
+
+ // computing column offset
+ vector<int32_t> col_offsets(G * OC_per_G);
+
+ int ncols_per_quant_group = G * OC_per_G;
+ if (q_granularity == QuantizationGranularity::GROUP) {
+ ncols_per_quant_group = OC_per_G;
+ } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
+ ncols_per_quant_group = 1;
+ }
+
+ aligned_vector<int32_t> Bint8_zero_point(
+ G * OC_per_G / ncols_per_quant_group);
+ if (b_symmetric) {
+ // symmetric weights means every zero point is zero
+ randFill(Bint8_zero_point, 0, 0);
+ } else {
+ randFill(Bint8_zero_point, -3, 3);
+ }
+
+ // matrix dimensions after im2col for each GEMM.
+ // For each group, there is one GEMM of the following dimensions
+ int MDim = conv_p.MB * OH * OW;
+ int NDim = OC_per_G;
+ int KDim = R * S * IC_per_G;
+
+ vector<uint8_t> Aint8_im2col(MDim * KDim * G);
+ im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());
+
+ vector<int32_t> row_offsets(MDim);
+
+ aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+ randFill(C_multiplier, 0.1234f / 2, 0.1234f * 3 / 2);
+ int32_t C_zero_pt = 5;
+
+ // reference implementation
+ // conv_ref expects weights to be in G (R S C/G) K/G
+ transposeConvWeights(conv_p, Bint8.data(), Bint8_tr.data());
+ int8_t* rightBData = Bint8_tr.data();
+ for (int g = 0; g < G; ++g) {
+ col_offsets_with_zero_pt_s8acc32_ref(
+ R * S * IC_per_G,
+ OC_per_G,
+ OC_per_G,
+ rightBData + g * R * S * IC_per_G * OC_per_G,
+ Bint8_zero_point.data() + g * OC_per_G / ncols_per_quant_group,
+ col_offsets.data() + g * OC_per_G,
+ ncols_per_quant_group);
+ }
+ conv_ref(
+ conv_p, Aint8.data(), Aint8_zero_point, rightBData, Cint32_ref.data());
+
+ for (int g = 0; g < G; ++g) {
+ row_offsets_u8acc32_ref(
+ MDim,
+ KDim,
+ KDim * G,
+ Aint8_im2col.data() + g * KDim,
+ row_offsets.data());
+
+ requantize_u8acc32_ref(
+ MDim,
+ NDim,
+ G * NDim,
+ Cint32_ref.data() + g * NDim,
+ Cint8_ref.data() + g * NDim,
+ C_multiplier.data() + g * NDim / ncols_per_quant_group,
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
+ row_offsets.data(),
+ col_offsets.data() + g * NDim,
+ nullptr,
+ ncols_per_quant_group);
+ }
+
+ PackWeightsForConv<2> packedWeights(conv_p, Bint8.data());
+
+ // TODO: Uncomment once we support multiple threads in fbgemmGroupwiseConv
+ // #ifdef _OPENMP
+ // #pragma omp parallel
+ // #endif
+ {
+ vector<int32_t> row_offset_buf(rowOffsetBufferSizeGConv(conv_p));
+
+ DoNothing<> doNothingObj{};
+
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
+
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ ReQuantizeOutput<false, QuantizationGranularity::TENSOR> reqObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ nullptr, /* row offset buffer */
+ col_offsets.data(),
+ nullptr,
+ G * NDim,
+ G);
+
+ fbgemmConv(
+ conv_p,
+ Aint8.data(),
+ packedWeights,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ reqObj,
+ tid,
+ num_threads);
+
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ ReQuantizeOutput<false, QuantizationGranularity::GROUP> reqObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ nullptr, /* row offset buffer */
+ col_offsets.data(),
+ nullptr,
+ G * NDim,
+ G);
+
+ fbgemmConv(
+ conv_p,
+ Aint8.data(),
+ packedWeights,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ reqObj,
+ tid,
+ num_threads);
+
+ } else {
+ ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> reqObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ nullptr, /* row offset buffer */
+ col_offsets.data(),
+ nullptr,
+ G * NDim,
+ G);
+
+ fbgemmConv(
+ conv_p,
+ Aint8.data(),
+ packedWeights,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ reqObj,
+ tid,
+ num_threads);
+ }
+ } // omp parallel
+
+ compare_validate_buffers(
+ Cint8_ref.data(),
+ Cint8_fb.data(),
+ MDim,
+ NDim * G,
+ NDim * G,
+ static_cast<uint8_t>(0));
+ } // for each shape
+}
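
For readers tracing the reference path: the per-column requantization that requantize_u8acc32_ref checks can be summarized roughly as below. This is a sketch of the usual fbgemm-style requantization math, an assumption rather than a copy of the reference source; in particular it ignores the bias term (the test passes nullptr for it).

#include <algorithm>
#include <cmath>
#include <cstdint>

std::uint8_t requantizeOne(
    std::int32_t acc, // raw int32 accumulator C[i][j]
    std::int32_t a_zp, // activation zero point
    std::int32_t b_zp, // weight zero point of column j's quant group
    std::int32_t row_offset, // sum over activation row i
    std::int32_t col_offset, // sum over weight column j (zero point folded in)
    float multiplier, // scale of column j's quant group
    std::int32_t c_zp) { // output zero point
  // Remove the zero-point cross terms, scale to the output domain,
  // shift by the output zero point, and clamp to uint8.
  std::int32_t corrected = acc - a_zp * col_offset - b_zp * row_offset;
  std::int32_t rounded =
      static_cast<std::int32_t>(std::nearbyint(multiplier * corrected)) + c_zp;
  return static_cast<std::uint8_t>(std::min(255, std::max(0, rounded)));
}

With per-tensor granularity there is a single (multiplier, b_zp) pair; GROUP and OUT_CHANNEL simply index these arrays per quantization group, which is exactly what the g * NDim / ncols_per_quant_group offsets in the test compute.
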