Optimize PackAWithIm2Col for symmetric B quantization

Summary: Add an additional option, b_symmetric, and skip row offset computation when it is true. Reviewed By: jianyuh. Differential Revision: D14119128. fbshipit-source-id: fa079347562b7f75727b3a1414e9bdda3f9c65dd
author: Jongsoo Park <jongsoo@fb.com> 2019-02-20 19:50:37 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2019-02-20 19:59:51 +0300
commit: 9ebc998c2bf0da81ab7afa1478df395d79429cf8 (patch)
tree: c664731e03c05f239f28bc3eb6a8010b65696f93
parent: a5f1f5308130f21b376f5368e87eb74b071493fa (diff)
3 files changed, 101 insertions, 58 deletions
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h
index 2ac10f7..4f3c92e 100644
--- a/include/fbgemm/Fbgemm.h
+++ b/include/fbgemm/Fbgemm.h
@@ -524,13 +524,15 @@ class FBGEMM_API PackAWithIm2Col
    *                    buffer and owns it. Otherwise, this class doesn't own
    *                    the buffer. The buffer will be populated when pack
    *                    function is called.
+   * @params b_symmetric if true, row offset computation is skipped
    */
   PackAWithIm2Col(
       const conv_param_t<SPATIAL_DIM>& conv_param,
       const T* sdata,
       inpType* pmat = nullptr,
-      std::int32_t zero_pt = 0,
-      std::int32_t* row_offset = nullptr);
+      std::int32_t a_zero_pt = 0,
+      std::int32_t* row_offset = nullptr,
+      bool b_symmetric = false);
 
   /**
    * Activation matrices are not constant so cannot amortize the cost of
@@ -578,9 +580,9 @@ class FBGEMM_API PackAWithIm2Col
  private:
   const conv_param_t<SPATIAL_DIM> conv_p_;
   const T* sdata_;
-  std::int32_t zero_pt_;
-  std::int32_t* row_offset_;
-  bool rowOffsetAllocatedHere;
+  std::int32_t a_zero_pt_;
+  std::int32_t* row_offset_{nullptr};
+  bool rowOffsetAllocatedHere{false};
   std::int32_t row_interleave_B_;
 };
 
diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc
index f096d7a..b9e310f 100644
--- a/src/PackAWithIm2Col.cc
+++ b/src/PackAWithIm2Col.cc
@@ -21,8 +21,9 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
     const conv_param_t<SPATIAL_DIM>& conv_p,
     const T* sdata,
     inpType* pmat,
-    int32_t zero_pt,
-    int32_t* row_offset)
+    int32_t a_zero_pt,
+    int32_t* row_offset,
+    bool b_symmetric)
     : PackMatrix<PackAWithIm2Col<T, accT, SPATIAL_DIM>, T, accT>(
           conv_p.MB *
               std::accumulate(
@@ -40,7 +41,7 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
           conv_p.G),
       conv_p_(conv_p),
       sdata_(sdata),
-      zero_pt_(zero_pt) {
+      a_zero_pt_(a_zero_pt) {
   static_assert(
       SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "unsupported conv dimension ");
   if (cpuinfo_has_x86_avx512f()) {
@@ -70,13 +71,15 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
         fbgemmAlignedAlloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T)));
     // aligned_alloc(64, BaseType::brow_ * BaseType::bcol_ * sizeof(T)));
   }
-  if (row_offset) {
-    rowOffsetAllocatedHere = false;
-    row_offset_ = row_offset;
-  } else {
-    rowOffsetAllocatedHere = true;
-    row_offset_ = static_cast<int32_t*>(
-        fbgemmAlignedAlloc(64, BaseType::brow_ * sizeof(int32_t)));
+  if (!b_symmetric) {
+    if (row_offset) {
+      rowOffsetAllocatedHere = false;
+      row_offset_ = row_offset;
+    } else {
+      rowOffsetAllocatedHere = true;
+      row_offset_ = static_cast<int32_t*>(
+          fbgemmAlignedAlloc(64, BaseType::brow_ * sizeof(int32_t)));
+    }
   }
 }
 
@@ -115,20 +118,35 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
       "PackAWithIm2Col<T, accT>::pack only works for T == uint8_t");
   if (point_wise) {
     int32_t ld = this->numCols();
-    for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
-      int buf_idx = i - block.row_start;
-      memcpy(
-          out + buf_idx * BaseType::blockColSize(),
-          sdata_ + i * ld + block.col_start,
-          block.col_size * sizeof(T));
-      // zero fill
-      for (int j = block.col_size; j < block_p.col_size; ++j) {
-        out[buf_idx * BaseType::blockColSize() + j] = 0;
+    if (row_offset_buf) {
+      for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+        int buf_idx = i - block.row_start;
+        memcpy(
+            out + buf_idx * BaseType::blockColSize(),
+            sdata_ + i * ld + block.col_start,
+            block.col_size * sizeof(T));
+        // zero fill
+        for (int j = block.col_size; j < block_p.col_size; ++j) {
+          out[buf_idx * BaseType::blockColSize() + j] = 0;
+        }
+        int32_t row_sum =
+            row_offset_acc ? row_offset_buf[i - block.row_start] : 0;
+        row_sum +=
+            reduceAvx2(sdata_ + i * ld + block.col_start, block.col_size);
+        row_offset_buf[i - block.row_start] = row_sum;
+      }
+    } else {
+      for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
+        int buf_idx = i - block.row_start;
+        memcpy(
+            out + buf_idx * BaseType::blockColSize(),
+            sdata_ + i * ld + block.col_start,
+            block.col_size * sizeof(T));
+        // zero fill
+        for (int j = block.col_size; j < block_p.col_size; ++j) {
+          out[buf_idx * BaseType::blockColSize() + j] = 0;
+        }
       }
-      int32_t row_sum =
-          row_offset_acc ? row_offset_buf[i - block.row_start] : 0;
-      row_sum += reduceAvx2(sdata_ + i * ld + block.col_start, block.col_size);
-      row_offset_buf[i - block.row_start] = row_sum;
     }
 
     return;
@@ -168,7 +186,7 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
           std::memset(
               out + (i - block.row_start) * BaseType::blockColSize() +
                   (j_blk_start - block.col_start),
-              zero_pt_,
+              a_zero_pt_,
               sizeof(T) * (j_blk_end - j_blk_start));
         } else {
           std::memcpy(
@@ -220,7 +238,7 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
               &out
                   [(i - block.row_start) * BaseType::blockColSize() +
                    (j_blk_start - block.col_start)],
-              zero_pt_,
+              a_zero_pt_,
               sizeof(T) * (j_blk_end - j_blk_start));
         } else {
           std::memcpy(
@@ -252,11 +270,13 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
                (block.col_start + block.col_size)));
     }
 
-    // TODO: skip row_offset computation when B_zero_point is 0
-    int32_t row_sum = row_offset_acc ? row_offset_buf[i - block.row_start] : 0;
-    row_sum += reduceAvx2(
-        out + (i - block.row_start) * this->blockColSize(), block.col_size);
-    row_offset_buf[i - block.row_start] = row_sum;
+    if (row_offset_buf) {
+      int32_t row_sum =
+          row_offset_acc ? row_offset_buf[i - block.row_start] : 0;
+      row_sum += reduceAvx2(
+          out + (i - block.row_start) * this->blockColSize(), block.col_size);
+      row_offset_buf[i - block.row_start] = row_sum;
+    }
   } // for each i
 }
 
diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc
index feae002..1dfc756 100644
--- a/test/Im2ColFusedRequantizeTest.cc
+++ b/test/Im2ColFusedRequantizeTest.cc
@@ -30,13 +30,15 @@ vector<QuantizationGranularity> qGranularityVals{
 
 namespace {
 class fbgemmIm2colTest
-    : public testing::TestWithParam<QuantizationGranularity> {};
+    : public testing::TestWithParam<tuple<QuantizationGranularity, bool>> {};
 }; // namespace
 
 INSTANTIATE_TEST_CASE_P(
     InstantiationName,
     fbgemmIm2colTest,
-    ::testing::ValuesIn(qGranularityVals));
+    ::testing::Combine(
+        ::testing::ValuesIn(qGranularityVals),
+        ::testing::Bool()));
 
 // From Faster-RCNN with ShuffleNet
 static vector<conv_param_t<>> shapes = {
@@ -52,7 +54,7 @@ static vector<conv_param_t<>> shapes = {
 };
 
 template <typename ACC_T, QuantizationGranularity Q_GRAN>
-static void Im2colTest() {
+static void Im2colTest(bool b_symmetric) {
   for (auto conv_p : shapes) {
     for (int groups : {1, 4}) {
       if (conv_p.IC % groups != 0 || conv_p.OC % groups != 0) {
@@ -89,6 +91,9 @@ static void Im2colTest() {
         randFill<int8_t>(Bint8, -4, 4);
         randFill(Bint8_zero_point, -3, -1);
       }
+      if (b_symmetric) {
+        randFill(Bint8_zero_point, 0, 0);
+      }
 
       aligned_vector<float> C_multiplier(Bint8_zero_point.size());
       randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
@@ -169,7 +174,8 @@ static void Im2colTest() {
             Aint8.data(),
             nullptr,
             Aint8_zero_point,
-            row_offset_buf.data());
+            row_offset_buf.data(),
+            b_symmetric);
 
         DoNothing<> doNothingObj{};
         ReQuantizeOutput<false, Q_GRAN> outputProcObj(
@@ -223,24 +229,28 @@ static void Im2colTest() {
 }
 
 TEST_P(fbgemmIm2colTest, Acc32Test) {
-  QuantizationGranularity q_granularity = GetParam();
+  QuantizationGranularity q_granularity;
+  bool b_symmetric;
+  tie(q_granularity, b_symmetric) = GetParam();
   if (q_granularity == QuantizationGranularity::TENSOR) {
-    Im2colTest<int32_t, QuantizationGranularity::TENSOR>();
+    Im2colTest<int32_t, QuantizationGranularity::TENSOR>(b_symmetric);
   } else if (q_granularity == QuantizationGranularity::GROUP) {
-    Im2colTest<int32_t, QuantizationGranularity::GROUP>();
+    Im2colTest<int32_t, QuantizationGranularity::GROUP>(b_symmetric);
   } else {
-    Im2colTest<int32_t, QuantizationGranularity::OUT_CHANNEL>();
+    Im2colTest<int32_t, QuantizationGranularity::OUT_CHANNEL>(b_symmetric);
   }
 }
 
 TEST_P(fbgemmIm2colTest, Acc16Test) {
-  QuantizationGranularity q_granularity = GetParam();
+  QuantizationGranularity q_granularity;
+  bool b_symmetric;
+  tie(q_granularity, b_symmetric) = GetParam();
   if (q_granularity == QuantizationGranularity::TENSOR) {
-    Im2colTest<int16_t, QuantizationGranularity::TENSOR>();
+    Im2colTest<int16_t, QuantizationGranularity::TENSOR>(b_symmetric);
   } else if (q_granularity == QuantizationGranularity::GROUP) {
-    Im2colTest<int16_t, QuantizationGranularity::GROUP>();
+    Im2colTest<int16_t, QuantizationGranularity::GROUP>(b_symmetric);
   } else {
-    Im2colTest<int16_t, QuantizationGranularity::OUT_CHANNEL>();
+    Im2colTest<int16_t, QuantizationGranularity::OUT_CHANNEL>(b_symmetric);
   }
 }
 
@@ -453,7 +463,10 @@ void SConvTest() {
 }
 
 TEST_P(fbgemmIm2colTest, SConvTest) {
-  QuantizationGranularity q_granularity = GetParam();
+  QuantizationGranularity q_granularity;
+  bool b_symmetric;
+  tie(q_granularity, b_symmetric) = GetParam();
+  // b_symmetric ignored for now
   if (q_granularity == QuantizationGranularity::TENSOR) {
     SConvTest<QuantizationGranularity::TENSOR>();
   } else if (q_granularity == QuantizationGranularity::GROUP) {
@@ -539,7 +552,7 @@ static vector<conv_param_t<3>> shapes_3d = {
 };
 
 template <typename ACC_T, QuantizationGranularity Q_GRAN>
-static void Im2col3DTest() {
+static void Im2col3DTest(bool b_symmetric) {
   for (auto conv_p : shapes_3d) {
     for (int groups : {1, 4}) {
       if (conv_p.IC % groups != 0 || conv_p.OC % groups != 0) {
@@ -578,6 +591,9 @@ static void Im2col3DTest() {
         randFill<int8_t>(Bint8, -4, 4);
         randFill(Bint8_zero_point, -3, -1);
       }
+      if (b_symmetric) {
+        randFill(Bint8_zero_point, 0, 0);
+      }
 
       aligned_vector<float> C_multiplier(Bint8_zero_point.size());
       randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
@@ -659,7 +675,8 @@ static void Im2col3DTest() {
             Aint8.data(),
             nullptr,
             Aint8_zero_point,
-            row_offset_buf.data());
+            row_offset_buf.data(),
+            b_symmetric);
 
         DoNothing<> doNothingObj{};
         ReQuantizeOutput<false, Q_GRAN> outputProcObj(
@@ -719,23 +736,27 @@ static void Im2col3DTest() {
 }
 
 TEST_P(fbgemmIm2colTest, 3DAcc32Test) {
-  QuantizationGranularity q_granularity = GetParam();
+  QuantizationGranularity q_granularity;
+  bool b_symmetric;
+  tie(q_granularity, b_symmetric) = GetParam();
   if (q_granularity == QuantizationGranularity::TENSOR) {
-    Im2col3DTest<int32_t, QuantizationGranularity::TENSOR>();
+    Im2col3DTest<int32_t, QuantizationGranularity::TENSOR>(b_symmetric);
   } else if (q_granularity == QuantizationGranularity::GROUP) {
-    Im2col3DTest<int32_t, QuantizationGranularity::GROUP>();
+    Im2col3DTest<int32_t, QuantizationGranularity::GROUP>(b_symmetric);
   } else {
-    Im2col3DTest<int32_t, QuantizationGranularity::OUT_CHANNEL>();
+    Im2col3DTest<int32_t, QuantizationGranularity::OUT_CHANNEL>(b_symmetric);
   }
 }
 
 TEST_P(fbgemmIm2colTest, 3DAcc16Test) {
-  QuantizationGranularity q_granularity = GetParam();
+  QuantizationGranularity q_granularity;
+  bool b_symmetric;
+  tie(q_granularity, b_symmetric) = GetParam();
   if (q_granularity == QuantizationGranularity::TENSOR) {
-    Im2col3DTest<int16_t, QuantizationGranularity::TENSOR>();
+    Im2col3DTest<int16_t, QuantizationGranularity::TENSOR>(b_symmetric);
   } else if (q_granularity == QuantizationGranularity::GROUP) {
-    Im2col3DTest<int16_t, QuantizationGranularity::GROUP>();
+    Im2col3DTest<int16_t, QuantizationGranularity::GROUP>(b_symmetric);
   } else {
-    Im2col3DTest<int16_t, QuantizationGranularity::OUT_CHANNEL>();
+    Im2col3DTest<int16_t, QuantizationGranularity::OUT_CHANNEL>(b_symmetric);
   }
 }
author	Jongsoo Park <jongsoo@fb.com>	2019-02-20 19:50:37 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2019-02-20 19:59:51 +0300
commit	9ebc998c2bf0da81ab7afa1478df395d79429cf8 (patch)
tree	c664731e03c05f239f28bc3eb6a8010b65696f93
parent	a5f1f5308130f21b376f5368e87eb74b071493fa (diff)