Merge remote-tracking branch 'upstream/master' into youki/win-jit-debug-int8

Fix for Windows build errors
author: Young Jin Kim <youki@microsoft.com> 2019-09-25 19:40:48 +0300
committer: Young Jin Kim <youki@microsoft.com> 2019-09-25 19:40:48 +0300
commit: 7bd598c9e97871e42c19449fddf7bd317898eb58 (patch)
tree: a3b1fea18477b63add473037d0644a96b115e0da /bench
parent: 08763b198ef743741560ae42a9c10a3017c7c9ce (diff)
parent: 518d8a1832cf1eb1dda2feace1a278e9e4f302ba (diff)
8 files changed, 205 insertions, 125 deletions
diff --git a/bench/ConvUnifiedBenchmark.cc b/bench/ConvUnifiedBenchmark.cc
index b450beb..35a3b2a 100644
--- a/bench/ConvUnifiedBenchmark.cc
+++ b/bench/ConvUnifiedBenchmark.cc
@@ -24,33 +24,39 @@
 using namespace std;
 using namespace fbgemm;
 
+// clang-format off
 // 2D conv shapes
 vector<conv_param_t<2>> shapes_2d = {
-  // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w,
-  // pad_h_top, pad_w_left, pad_h_bottom, pad_w_right
-  // 2D convolutions
-  // regular
-  conv_param_t<>(1, 128, 128, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-  // groupwise
-  conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-  // DW
-  conv_param_t<>(1, 272, 272, {47, 125}, 272, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-  // Pointwise
-  conv_param_t<>(1, 128, 128, {56, 56}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0})
+    // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w,
+    // pad_h_top, pad_w_left, pad_h_bottom, pad_w_right
+    // 2D convolutions
+    // regular
+    conv_param_t<>(1, 128, 128, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // regular with dilation
+    conv_param_t<>(1, 128, 128, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}, {2, 2}),
+    // groupwise
+    conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // DW
+    conv_param_t<>(1, 272, 272, {47, 125}, 272, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+    // Pointwise
+    conv_param_t<>(1, 128, 128, {56, 56}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0})
 
 };
 
 // 3D conv shapes
 vector<conv_param_t<3>> shapes_3d = {
-  // MB, IC, OC, {IT, IH, IW}, G, {KT, KH, KW}, {stride_t, stride_h, stride_w},
-  // {pad_prev, pad_h_top, pad_w_left, pad_next, pad_h_bottom, pad_w_right}
-  // Regular
-  conv_param_t<3>(1, 64, 64, {8, 14, 14}, 1, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}),
-  // Depthwise
-  conv_param_t<3>(1, 64, 64, {8, 14, 14}, 64, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}),
-  // Pointwise
-  conv_param_t<3>(1, 128, 128, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0})
-};
+    // MB, IC, OC, {IT, IH, IW}, G, {KT, KH, KW}, {stride_t, stride_h,
+    // stride_w},
+    // {pad_prev, pad_h_top, pad_w_left, pad_next, pad_h_bottom, pad_w_right}
+    // Regular
+    conv_param_t<3>(1, 64, 64, {8, 14, 14}, 1, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}),
+    // With dilations
+    conv_param_t<3>(1, 64, 64, {8, 14, 14}, 1, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}, {2, 2, 2}),
+    // Depthwise
+    conv_param_t<3>(1, 64, 64, {8, 14, 14}, 64, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}),
+    // Pointwise
+    conv_param_t<3>(1, 128, 128, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0})};
+// clang-format on
 
 template <int SPATIAL_DIM, typename Acc_t>
 void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
@@ -81,6 +87,10 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
     header += "pad_t, ";
   }
   header += "pad_h, pad_w, ";
+  if (SPATIAL_DIM == 3) {
+    header += "dilation_t, ";
+  }
+  header += "dilation_h, dilation_w, ";
 
   header += "Type, M, N, K, ";
 
@@ -278,7 +288,9 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
     for (int i = 0; i < SPATIAL_DIM; ++i) {
       cout << conv_p.pad[i] << ", ";
     }
-
+    for (int i = 0; i < SPATIAL_DIM; ++i) {
+      cout << conv_p.dilation[i] << ", ";
+    }
     cout << setw(13) << runType << ", " << setw(5) << fixed << setw(5)
          << setw(6) << MDim << ", " << setw(6) << NDim << ", " << setw(6)
          << KDim << ", ";
diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc
index 0efdcac..ff2be6f 100644
--- a/bench/Depthwise3DBenchmark.cc
+++ b/bench/Depthwise3DBenchmark.cc
@@ -4,7 +4,6 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#include "test/I8DepthwiseTest.h"
 
 #include <algorithm>
 #include <chrono>
@@ -19,8 +18,8 @@
 
 #include "AlignedVec.h"
 #include "BenchUtils.h"
-#include "fbgemm/Utils.h"
 #include "fbgemm/FbgemmI8DepthwiseAvx2.h"
+#include "fbgemm/Utils.h"
 #include "src/RefImplementations.h"
 
 using namespace std;
@@ -35,6 +34,34 @@ int main() {
   }
 #endif
 
+  // From ResNeXt-3D-101
+  // clang-format off
+  vector<vector<int>> shapes_3d = {
+    // NOTE: clang-format wants to use a different formatting but the current
+    // formatting should be easier to read.
+    // N, K, T_in, H_in, W_in, stride
+    {   1,  64,   32,  56, 56, 1, },
+    {   1, 128,   16,  28, 28, 1, },
+    {   1, 256,    8,  14, 14, 1, },
+    {   1, 512,    4,   7,  7, 1, },
+
+    {   1, 128,   32,  56, 56, 2, },
+    {   1, 256,   16,  28, 28, 2, },
+    {   1, 512,    8,  14, 14, 2, },
+
+    {   5,  64,   32,  56, 56, 1, },
+    {   5, 128,   16,  28, 28, 1, },
+    {   5, 256,    8,  14, 14, 1, },
+    {   5, 512,    4,   7,  7, 1, },
+
+    {   5, 128,   32,  56, 56, 2, },
+    {   5, 256,   16,  28, 28, 2, },
+    {   5, 512,    8,  14, 14, 2, },
+
+    {   1,   8,    4,   4,  4, 1, },
+  };
+  // clang-format on
+
   // Depthwise is memory BW bound so we want to flush LLC.
   bool flush = true;
   std::vector<char> llc;
@@ -61,14 +88,28 @@ int main() {
     constexpr int K_T = 3, K_H = 3, K_W = 3;
     constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1,
                   PAD_R = 1;
-    int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1;
-    int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1;
-    int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1;
+
+    conv_param_t<3> conv_p(
+        N,
+        K,
+        K,
+        {T, H, W},
+        K,
+        {K_T, K_H, K_W},
+        {stride_t, stride_h, stride_w},
+        {PAD_P, PAD_T, PAD_L, PAD_N, PAD_B, PAD_R});
+    int T_OUT = conv_p.OUT_DIM[0];
+    int H_OUT = conv_p.OUT_DIM[1];
+    int W_OUT = conv_p.OUT_DIM[2];
+
+    int MDim = N * T_OUT * H_OUT * W_OUT;
+    int KDim = K_T * K_H * K_W * K;
+    int KDimPerGroup = KDim / conv_p.G;
 
     aligned_vector<uint8_t> A(N * T * H * W * K);
-    aligned_vector<int8_t> B(K * K_T * K_H * K_W);
-    aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K),
-        C(C_ref.size());
+    aligned_vector<int8_t> B(KDim);
+    aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size());
+    aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());
 
     randFill<uint8_t>(A, 0, 86);
     int32_t A_zero_point = 43;
@@ -76,52 +117,49 @@ int main() {
     randFill<int8_t>(B, -16, 16);
     int32_t B_zero_point = 5;
 
-    depthwise_3x3x3_pad_1_ref(
-        N,
-        T,
-        H,
-        W,
-        K,
-        stride_t,
-        stride_h,
-        stride_w,
-        A_zero_point,
-        A.data(),
-        B.data(),
-        C_ref.data());
-
-    int32_t minimum = *min_element(C_ref.begin(), C_ref.end());
-    int32_t maximum = *max_element(C_ref.begin(), C_ref.end());
+    aligned_vector<float> C_multiplier(1);
+    randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
+    int32_t C_zero_point = 5;
 
-    float C_multiplier = 255. / (maximum - minimum);
+    vector<int32_t> row_offsets(MDim);
+    // im2col to compute row offset later
+    vector<uint8_t> A_im2col(MDim * KDim);
+    im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data());
 
     aligned_vector<int32_t> col_offsets(K);
     aligned_vector<int32_t> bias(K);
     randFill(col_offsets, -100, 100);
     randFill(bias, -40, 40);
-    int32_t C_zero_point = 5;
 
-    aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());
-    depthwise_3x3x3_pad_1_ref(
-        N,
-        T,
-        H,
-        W,
-        K,
-        stride_t,
-        stride_h,
-        stride_w,
-        A_zero_point,
-        A.data(),
-        B_zero_point,
-        B.data(),
-        C_multiplier,
-        C_zero_point,
-        C_uint8_ref.data(),
-        col_offsets.data(),
-        bias.data());
-
-    Packed3x3x3ConvMatrix Bp(K, B.data());
+    conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data());
+
+    for (int g = 0; g < conv_p.G; ++g) {
+      // Compute row offset
+      row_offsets_u8acc32_ref(
+          MDim,
+          KDimPerGroup,
+          KDim,
+          A_im2col.data() + g * KDimPerGroup,
+          row_offsets.data());
+
+      // Requantization
+      requantize_u8acc32_ref(
+          MDim,
+          1,
+          conv_p.G,
+          C_ref.data() + g,
+          C_uint8_ref.data() + g,
+          C_multiplier.data(),
+          C_zero_point,
+          A_zero_point,
+          &B_zero_point,
+          row_offsets.data(),
+          col_offsets.data() + g,
+          bias.data() + g,
+          K);
+    }
+
+    PackedDepthWiseConvMatrix Bp(K, 3 * 3 * 3, B.data());
 
     double ttot = 0;
     double bytes = double(NITER) *
@@ -153,7 +191,7 @@ int main() {
             A.data(),
             B_zero_point,
             Bp,
-            C_multiplier,
+            C_multiplier[0],
             C_zero_point,
             C_uint8.data(),
             col_offsets.data(),
diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc
index 96921a1..6c2ee17 100644
--- a/bench/DepthwiseBenchmark.cc
+++ b/bench/DepthwiseBenchmark.cc
@@ -17,8 +17,8 @@
 
 #include "AlignedVec.h"
 #include "BenchUtils.h"
-#include "fbgemm/Utils.h"
 #include "fbgemm/FbgemmI8DepthwiseAvx2.h"
+#include "fbgemm/Utils.h"
 #include "src/RefImplementations.h"
 
 using namespace std;
@@ -34,10 +34,11 @@ int main() {
 #endif
 
   // From Xray OCR
+  // clang-format off
   vector<vector<int>> shapes = {
     // NOTE: clang-format wants to use a different formatting but the current
     // formatting should be easier to read.
-    // N,  G, H_in, W_in, stride
+    // N,  K, H_in, W_in, stride
     {   1,  272,  47, 125, 1, },
     {   1,  272,  64, 125, 1, },
     {   1,  272,  66, 125, 1, },
@@ -138,6 +139,7 @@ int main() {
     {  96,  544,  14,  14, 2, },
     { 100,  544,  14,  14, 2, },
   };
+  // clang-format on
 
   // Depthwise is memory BW bound so we want to flush LLC.
   bool flush = true;
@@ -155,19 +157,35 @@ int main() {
 
   for (auto shape : shapes) {
     int N = shape[0];
-    int G = shape[1];
+    int K = shape[1];
     int H = shape[2];
     int W = shape[3];
     int stride_h = shape[4];
     int stride_w = stride_h;
     constexpr int R = 3, S = 3;
-    constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1;
-    int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1;
-    int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1;
+    int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2,
+        PAD_R = (S - 1) / 2;
+
+    conv_param_t<2> conv_p(
+        N,
+        K,
+        K,
+        {H, W},
+        K,
+        {R, S},
+        {stride_h, stride_w},
+        {PAD_T, PAD_L, PAD_B, PAD_R});
+    int H_OUT = conv_p.OUT_DIM[0];
+    int W_OUT = conv_p.OUT_DIM[1];
+
+    int MDim = N * H_OUT * W_OUT;
+    int KDim = R * S * K;
+    int KDimPerGroup = KDim / conv_p.G;
 
-    aligned_vector<uint8_t> A(N * H * W * G);
-    aligned_vector<int8_t> B(G * R * S);
-    aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * G), C(C_ref.size());
+    aligned_vector<uint8_t> A(N * H * W * K);
+    aligned_vector<int8_t> B(KDim);
+    aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size());
+    aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());
 
     randFill<uint8_t>(A, 0, 86);
     int32_t A_zero_point = 43;
@@ -175,53 +193,54 @@ int main() {
     randFill<int8_t>(B, -16, 16);
     int32_t B_zero_point = 5;
 
-    depthwise_3x3_pad_1_ref(
-        N,
-        H,
-        W,
-        G,
-        stride_h,
-        stride_w,
-        A_zero_point,
-        A.data(),
-        B.data(),
-        C_ref.data());
-
-    int32_t minimum = *min_element(C_ref.begin(), C_ref.end());
-    int32_t maximum = *max_element(C_ref.begin(), C_ref.end());
+    aligned_vector<float> C_multiplier(1);
+    randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
+    int32_t C_zero_point = 5;
 
-    float C_multiplier = 255. / (maximum - minimum);
+    vector<int32_t> row_offsets(MDim);
+    // im2col to compute row offset later
+    vector<uint8_t> A_im2col(MDim * KDim);
+    im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data());
 
-    aligned_vector<int32_t> col_offsets(G);
-    aligned_vector<int32_t> bias(G);
+    aligned_vector<int32_t> col_offsets(K);
+    aligned_vector<int32_t> bias(K);
     randFill(col_offsets, -100, 100);
     randFill(bias, -40, 40);
-    int32_t C_zero_point = 5;
 
-    aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());
-    depthwise_3x3_pad_1_ref(
-        N,
-        H,
-        W,
-        G,
-        stride_h,
-        stride_w,
-        A_zero_point,
-        A.data(),
-        B_zero_point,
-        B.data(),
-        C_multiplier,
-        C_zero_point,
-        C_uint8_ref.data(),
-        col_offsets.data(),
-        bias.data());
+    conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data());
+
+    for (int g = 0; g < conv_p.G; ++g) {
+      // Compute row offset
+      row_offsets_u8acc32_ref(
+          MDim,
+          KDimPerGroup,
+          KDim,
+          A_im2col.data() + g * KDimPerGroup,
+          row_offsets.data());
+
+      // Requantization
+      requantize_u8acc32_ref(
+          MDim,
+          1,
+          conv_p.G,
+          C_ref.data() + g,
+          C_uint8_ref.data() + g,
+          C_multiplier.data(),
+          C_zero_point,
+          A_zero_point,
+          &B_zero_point,
+          row_offsets.data(),
+          col_offsets.data() + g,
+          bias.data() + g,
+          K);
+    }
 
-    Packed3x3ConvMatrix Bp(G, B.data());
+    PackedDepthWiseConvMatrix Bp(K, 3 * 3, B.data());
 
     double ttot = 0;
     double bytes = double(NITER) *
-        (G * (N * (2 * sizeof(int32_t) * H_OUT * W_OUT + H * W) + R * S));
-    double ops = double(NITER) * N * H_OUT * W_OUT * G * R * S * 2;
+        (K * (N * (2 * sizeof(int32_t) * H_OUT * W_OUT + H * W) + R * S));
+    double ops = double(NITER) * N * H_OUT * W_OUT * K * R * S * 2;
     chrono::time_point<chrono::system_clock> t_begin, t_end;
     for (int i = 0; i < NWARMUP + NITER; ++i) {
       llc_flush();
@@ -235,19 +254,20 @@ int main() {
             N,
             H,
             W,
-            G,
+            K,
             stride_h,
             stride_w,
             A_zero_point,
             A.data(),
             B_zero_point,
             Bp,
-            C_multiplier,
+            C_multiplier[0],
             C_zero_point,
             C_uint8.data(),
             col_offsets.data(),
             bias.data(),
             false, /* fuse_relu */
+            1.0f, /* act_scale * w_scale */
             tid,
             num_threads);
       }
@@ -262,10 +282,10 @@ int main() {
     for (int n = 0; n < N; ++n) {
       for (int h = 0; h < H_OUT; ++h) {
         for (int w = 0; w < W_OUT; ++w) {
-          for (int g = 0; g < G; ++g) {
+          for (int g = 0; g < K; ++g) {
             uint8_t expected =
-                C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * G + g];
-            uint8_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * G + g];
+                C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * K + g];
+            uint8_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * K + g];
             if (expected != actual) {
               cerr << "Depthwise 3x3 results differ at (" << n << ", " << h
                    << ", " << w << ", " << g << "). expected " << (int)expected
@@ -280,9 +300,9 @@ int main() {
 
     // Report performance
     printf(
-        "N = %d G = %d H = %d W = %d stride = %d with requantization fused\n",
+        "N = %d K = %d H = %d W = %d stride = %d with requantization fused\n",
         N,
-        G,
+        K,
         H,
         W,
         stride_h);
diff --git a/bench/GEMMsBenchmark.cc b/bench/GEMMsBenchmark.cc
index b404d8b..f493a96 100644
--- a/bench/GEMMsBenchmark.cc
+++ b/bench/GEMMsBenchmark.cc
@@ -28,6 +28,7 @@ using namespace std;
 using namespace fbgemm;
 
 void performance_test() {
+  // clang-format off
   static const vector<vector<int>> shapes = {
     // NOTE: clang-format wants to use a different formatting but the current
     // formatting should be easier to read.
@@ -39,6 +40,7 @@ void performance_test() {
     {256, 512, 256},
     {1024, 1024, 1024},
   };
+  // clang-format on
   bool flush = true;
   std::vector<char> llc;
 
diff --git a/bench/GEMMsTunableBenchmark.cc b/bench/GEMMsTunableBenchmark.cc
index a65b51f..2adc556 100644
--- a/bench/GEMMsTunableBenchmark.cc
+++ b/bench/GEMMsTunableBenchmark.cc
@@ -218,7 +218,8 @@ int main(int /* unused */, char** /* unused */) {
   }
 #endif
 
- vector<vector<int>> shapes = {
+  // clang-format off
+  vector<vector<int>> shapes = {
     // NOTE: clang-format wants to use a different formatting but the current
     // formatting should be easier to read.
     // m, n, k
@@ -266,7 +267,8 @@ int main(int /* unused */, char** /* unused */) {
     {128, 128, 128},
     {256, 512, 256},
     {1024, 1024, 1024},
-};
+  };
+  // clang-format on
 
   vector<int> MCBs;
   vector<int> NCBs;
diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc
index 66ca67e..dcea65c 100644
--- a/bench/PackedFloatInOutBenchmark.cc
+++ b/bench/PackedFloatInOutBenchmark.cc
@@ -28,6 +28,7 @@ using namespace std;
 using namespace fbgemm;
 
 void performance_test() {
+  // clang-format off
   vector<vector<int>> shapes = {
     // NOTE: clang-format wants to use a different formatting but the current
     // formatting should be easier to read.
@@ -66,6 +67,7 @@ void performance_test() {
     {1, 128, 2722},
     {16, 256, 512},
   };
+  // clang-format on
   bool flush = true;
   std::vector<char> llc;
 
diff --git a/bench/PackedRequantizeAcc16Benchmark.cc b/bench/PackedRequantizeAcc16Benchmark.cc
index 40ff662..c6e2869 100644
--- a/bench/PackedRequantizeAcc16Benchmark.cc
+++ b/bench/PackedRequantizeAcc16Benchmark.cc
@@ -37,6 +37,7 @@ enum class BenchmarkType {
 };
 
 void performance_test() {
+  // clang-format off
   vector<vector<int>> shapes = {
     // NOTE: clang-format wants to use a different formatting but the current
     // formatting should be easier to read.
@@ -67,6 +68,7 @@ void performance_test() {
     {392, 2048, 512},
     {392, 512, 2048},
   };
+  // clang-format on
   bool flush = true;
   std::vector<char> llc;
 
diff --git a/bench/PackedRequantizeAcc32Benchmark.cc b/bench/PackedRequantizeAcc32Benchmark.cc
index 2f04795..a61ef5a 100644
--- a/bench/PackedRequantizeAcc32Benchmark.cc
+++ b/bench/PackedRequantizeAcc32Benchmark.cc
@@ -28,6 +28,7 @@ using namespace std;
 using namespace fbgemm;
 
 void performance_test() {
+  // clang-format off
   vector<vector<int>> shapes = {
     // NOTE: clang-format wants to use a different formatting but the current
     // formatting should be easier to read.
@@ -70,6 +71,7 @@ void performance_test() {
     {1, 128, 2722},
     {16, 256, 512},
   };
+  // clang-format on
   bool flush = true;
   std::vector<char> llc;
author	Young Jin Kim <youki@microsoft.com>	2019-09-25 19:40:48 +0300
committer	Young Jin Kim <youki@microsoft.com>	2019-09-25 19:40:48 +0300
commit	7bd598c9e97871e42c19449fddf7bd317898eb58 (patch)
tree	a3b1fea18477b63add473037d0644a96b115e0da /bench
parent	08763b198ef743741560ae42a9c10a3017c7c9ce (diff)
parent	518d8a1832cf1eb1dda2feace1a278e9e4f302ba (diff)