diff options
author | Young Jin Kim <youki@microsoft.com> | 2019-09-25 19:40:48 +0300 |
---|---|---|
committer | Young Jin Kim <youki@microsoft.com> | 2019-09-25 19:40:48 +0300 |
commit | 7bd598c9e97871e42c19449fddf7bd317898eb58 (patch) | |
tree | a3b1fea18477b63add473037d0644a96b115e0da /bench | |
parent | 08763b198ef743741560ae42a9c10a3017c7c9ce (diff) | |
parent | 518d8a1832cf1eb1dda2feace1a278e9e4f302ba (diff) |
Merge remote-tracking branch 'upstream/master' into youki/win-jit-debug-int8
Fix for windows build errors
Diffstat (limited to 'bench')
-rw-r--r-- | bench/ConvUnifiedBenchmark.cc | 54 | ||||
-rw-r--r-- | bench/Depthwise3DBenchmark.cc | 134 | ||||
-rw-r--r-- | bench/DepthwiseBenchmark.cc | 128 | ||||
-rw-r--r-- | bench/GEMMsBenchmark.cc | 2 | ||||
-rw-r--r-- | bench/GEMMsTunableBenchmark.cc | 6 | ||||
-rw-r--r-- | bench/PackedFloatInOutBenchmark.cc | 2 | ||||
-rw-r--r-- | bench/PackedRequantizeAcc16Benchmark.cc | 2 | ||||
-rw-r--r-- | bench/PackedRequantizeAcc32Benchmark.cc | 2 |
8 files changed, 205 insertions, 125 deletions
diff --git a/bench/ConvUnifiedBenchmark.cc b/bench/ConvUnifiedBenchmark.cc index b450beb..35a3b2a 100644 --- a/bench/ConvUnifiedBenchmark.cc +++ b/bench/ConvUnifiedBenchmark.cc @@ -24,33 +24,39 @@ using namespace std; using namespace fbgemm; +// clang-format off // 2D conv shapes vector<conv_param_t<2>> shapes_2d = { - // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, - // pad_h_top, pad_w_left, pad_h_bottom, pad_w_right - // 2D convolutions - // regular - conv_param_t<>(1, 128, 128, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - // groupwise - conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), - // DW - conv_param_t<>(1, 272, 272, {47, 125}, 272, {3, 3}, {1, 1}, {1, 1, 1, 1}), - // Pointwise - conv_param_t<>(1, 128, 128, {56, 56}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}) + // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, + // pad_h_top, pad_w_left, pad_h_bottom, pad_w_right + // 2D convolutions + // regular + conv_param_t<>(1, 128, 128, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // regular with dilation + conv_param_t<>(1, 128, 128, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}, {2, 2}), + // groupwise + conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // DW + conv_param_t<>(1, 272, 272, {47, 125}, 272, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // Pointwise + conv_param_t<>(1, 128, 128, {56, 56}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}) }; // 3D conv shapes vector<conv_param_t<3>> shapes_3d = { - // MB, IC, OC, {IT, IH, IW}, G, {KT, KH, KW}, {stride_t, stride_h, stride_w}, - // {pad_prev, pad_h_top, pad_w_left, pad_next, pad_h_bottom, pad_w_right} - // Regular - conv_param_t<3>(1, 64, 64, {8, 14, 14}, 1, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}), - // Depthwise - conv_param_t<3>(1, 64, 64, {8, 14, 14}, 64, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}), - // Pointwise - conv_param_t<3>(1, 128, 128, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0}) -}; + // MB, IC, OC, {IT, IH, IW}, G, {KT, KH, KW}, {stride_t, stride_h, + // stride_w}, + // {pad_prev, pad_h_top, pad_w_left, pad_next, pad_h_bottom, pad_w_right} + // Regular + conv_param_t<3>(1, 64, 64, {8, 14, 14}, 1, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}), + //With dilations + conv_param_t<3>(1, 64, 64, {8, 14, 14}, 1, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}, {2, 2, 2}), + // Depthwise + conv_param_t<3>(1, 64, 64, {8, 14, 14}, 64, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}), + // Pointwise + conv_param_t<3>(1, 128, 128, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0})}; +// clang-format on template <int SPATIAL_DIM, typename Acc_t> void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) { @@ -81,6 +87,10 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) { header += "pad_t, "; } header += "pad_h, pad_w, "; + if (SPATIAL_DIM == 3) { + header += "dilation_t, "; + } + header += "dilation_h, dilation_w, "; header += "Type, M, N, K, "; @@ -278,7 +288,9 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) { for (int i = 0; i < SPATIAL_DIM; ++i) { cout << conv_p.pad[i] << ", "; } - + for (int i = 0; i < SPATIAL_DIM; ++i) { + cout << conv_p.dilation[i] << ", "; + } cout << setw(13) << runType << ", " << setw(5) << fixed << setw(5) << setw(6) << MDim << ", " << setw(6) << NDim << ", " << setw(6) << KDim << ", "; diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc index 0efdcac..ff2be6f 100644 --- a/bench/Depthwise3DBenchmark.cc +++ b/bench/Depthwise3DBenchmark.cc @@ -4,7 +4,6 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ -#include "test/I8DepthwiseTest.h" #include <algorithm> #include <chrono> @@ -19,8 +18,8 @@ #include "AlignedVec.h" #include "BenchUtils.h" -#include "fbgemm/Utils.h" #include "fbgemm/FbgemmI8DepthwiseAvx2.h" +#include "fbgemm/Utils.h" #include "src/RefImplementations.h" using namespace std; @@ -35,6 +34,34 @@ int main() { } #endif + // From ResNeXt-3D-101 + // clang-format off + vector<vector<int>> shapes_3d = { + // NOTE: clang-format wants to use a different formatting but the current + // formatting should be easier to read. + // N, K, T_in, H_in, W_in, stride + { 1, 64, 32, 56, 56, 1, }, + { 1, 128, 16, 28, 28, 1, }, + { 1, 256, 8, 14, 14, 1, }, + { 1, 512, 4, 7, 7, 1, }, + + { 1, 128, 32, 56, 56, 2, }, + { 1, 256, 16, 28, 28, 2, }, + { 1, 512, 8, 14, 14, 2, }, + + { 5, 64, 32, 56, 56, 1, }, + { 5, 128, 16, 28, 28, 1, }, + { 5, 256, 8, 14, 14, 1, }, + { 5, 512, 4, 7, 7, 1, }, + + { 5, 128, 32, 56, 56, 2, }, + { 5, 256, 16, 28, 28, 2, }, + { 5, 512, 8, 14, 14, 2, }, + + { 1, 8, 4, 4, 4, 1, }, + }; + // clang-format on + // Depthwise is memory BW bound so we want to flush LLC. bool flush = true; std::vector<char> llc; @@ -61,14 +88,28 @@ int main() { constexpr int K_T = 3, K_H = 3, K_W = 3; constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1; - int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; + + conv_param_t<3> conv_p( + N, + K, + K, + {T, H, W}, + K, + {K_T, K_H, K_W}, + {stride_t, stride_h, stride_w}, + {PAD_P, PAD_T, PAD_L, PAD_N, PAD_B, PAD_R}); + int T_OUT = conv_p.OUT_DIM[0]; + int H_OUT = conv_p.OUT_DIM[1]; + int W_OUT = conv_p.OUT_DIM[2]; + + int MDim = N * T_OUT * H_OUT * W_OUT; + int KDim = K_T * K_H * K_W * K; + int KDimPerGroup = KDim / conv_p.G; aligned_vector<uint8_t> A(N * T * H * W * K); - aligned_vector<int8_t> B(K * K_T * K_H * K_W); - aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K), - C(C_ref.size()); + aligned_vector<int8_t> B(KDim); + aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size()); + aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; @@ -76,52 +117,49 @@ int main() { randFill<int8_t>(B, -16, 16); int32_t B_zero_point = 5; - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A.data(), - B.data(), - C_ref.data()); - - int32_t minimum = *min_element(C_ref.begin(), C_ref.end()); - int32_t maximum = *max_element(C_ref.begin(), C_ref.end()); + aligned_vector<float> C_multiplier(1); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); + int32_t C_zero_point = 5; - float C_multiplier = 255. / (maximum - minimum); + vector<int32_t> row_offsets(MDim); + // im2col to compute row offset later + vector<uint8_t> A_im2col(MDim * KDim); + im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data()); aligned_vector<int32_t> col_offsets(K); aligned_vector<int32_t> bias(K); randFill(col_offsets, -100, 100); randFill(bias, -40, 40); - int32_t C_zero_point = 5; - aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A.data(), - B_zero_point, - B.data(), - C_multiplier, - C_zero_point, - C_uint8_ref.data(), - col_offsets.data(), - bias.data()); - - Packed3x3x3ConvMatrix Bp(K, B.data()); + conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data()); + + for (int g = 0; g < conv_p.G; ++g) { + // Compute row offset + row_offsets_u8acc32_ref( + MDim, + KDimPerGroup, + KDim, + A_im2col.data() + g * KDimPerGroup, + row_offsets.data()); + + // Requantization + requantize_u8acc32_ref( + MDim, + 1, + conv_p.G, + C_ref.data() + g, + C_uint8_ref.data() + g, + C_multiplier.data(), + C_zero_point, + A_zero_point, + &B_zero_point, + row_offsets.data(), + col_offsets.data() + g, + bias.data() + g, + K); + } + + PackedDepthWiseConvMatrix Bp(K, 3 * 3 * 3, B.data()); double ttot = 0; double bytes = double(NITER) * @@ -153,7 +191,7 @@ int main() { A.data(), B_zero_point, Bp, - C_multiplier, + C_multiplier[0], C_zero_point, C_uint8.data(), col_offsets.data(), diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc index 96921a1..6c2ee17 100644 --- a/bench/DepthwiseBenchmark.cc +++ b/bench/DepthwiseBenchmark.cc @@ -17,8 +17,8 @@ #include "AlignedVec.h" #include "BenchUtils.h" -#include "fbgemm/Utils.h" #include "fbgemm/FbgemmI8DepthwiseAvx2.h" +#include "fbgemm/Utils.h" #include "src/RefImplementations.h" using namespace std; @@ -34,10 +34,11 @@ int main() { #endif // From Xray OCR + // clang-format off vector<vector<int>> shapes = { // NOTE: clang-format wants to use a different formatting but the current // formatting should be easier to read. - // N, G, H_in, W_in, stride + // N, K, H_in, W_in, stride { 1, 272, 47, 125, 1, }, { 1, 272, 64, 125, 1, }, { 1, 272, 66, 125, 1, }, @@ -138,6 +139,7 @@ int main() { { 96, 544, 14, 14, 2, }, { 100, 544, 14, 14, 2, }, }; + // clang-format on // Depthwise is memory BW bound so we want to flush LLC. bool flush = true; @@ -155,19 +157,35 @@ int main() { for (auto shape : shapes) { int N = shape[0]; - int G = shape[1]; + int K = shape[1]; int H = shape[2]; int W = shape[3]; int stride_h = shape[4]; int stride_w = stride_h; constexpr int R = 3, S = 3; - constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; + int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2, + PAD_R = (S - 1) / 2; + + conv_param_t<2> conv_p( + N, + K, + K, + {H, W}, + K, + {R, S}, + {stride_h, stride_w}, + {PAD_T, PAD_L, PAD_B, PAD_R}); + int H_OUT = conv_p.OUT_DIM[0]; + int W_OUT = conv_p.OUT_DIM[1]; + + int MDim = N * H_OUT * W_OUT; + int KDim = R * S * K; + int KDimPerGroup = KDim / conv_p.G; - aligned_vector<uint8_t> A(N * H * W * G); - aligned_vector<int8_t> B(G * R * S); - aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * G), C(C_ref.size()); + aligned_vector<uint8_t> A(N * H * W * K); + aligned_vector<int8_t> B(KDim); + aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size()); + aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; @@ -175,53 +193,54 @@ int main() { randFill<int8_t>(B, -16, 16); int32_t B_zero_point = 5; - depthwise_3x3_pad_1_ref( - N, - H, - W, - G, - stride_h, - stride_w, - A_zero_point, - A.data(), - B.data(), - C_ref.data()); - - int32_t minimum = *min_element(C_ref.begin(), C_ref.end()); - int32_t maximum = *max_element(C_ref.begin(), C_ref.end()); + aligned_vector<float> C_multiplier(1); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); + int32_t C_zero_point = 5; - float C_multiplier = 255. / (maximum - minimum); + vector<int32_t> row_offsets(MDim); + // im2col to compute row offset later + vector<uint8_t> A_im2col(MDim * KDim); + im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data()); - aligned_vector<int32_t> col_offsets(G); - aligned_vector<int32_t> bias(G); + aligned_vector<int32_t> col_offsets(K); + aligned_vector<int32_t> bias(K); randFill(col_offsets, -100, 100); randFill(bias, -40, 40); - int32_t C_zero_point = 5; - aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); - depthwise_3x3_pad_1_ref( - N, - H, - W, - G, - stride_h, - stride_w, - A_zero_point, - A.data(), - B_zero_point, - B.data(), - C_multiplier, - C_zero_point, - C_uint8_ref.data(), - col_offsets.data(), - bias.data()); + conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data()); + + for (int g = 0; g < conv_p.G; ++g) { + // Compute row offset + row_offsets_u8acc32_ref( + MDim, + KDimPerGroup, + KDim, + A_im2col.data() + g * KDimPerGroup, + row_offsets.data()); + + // Requantization + requantize_u8acc32_ref( + MDim, + 1, + conv_p.G, + C_ref.data() + g, + C_uint8_ref.data() + g, + C_multiplier.data(), + C_zero_point, + A_zero_point, + &B_zero_point, + row_offsets.data(), + col_offsets.data() + g, + bias.data() + g, + K); + } - Packed3x3ConvMatrix Bp(G, B.data()); + PackedDepthWiseConvMatrix Bp(K, 3 * 3, B.data()); double ttot = 0; double bytes = double(NITER) * - (G * (N * (2 * sizeof(int32_t) * H_OUT * W_OUT + H * W) + R * S)); - double ops = double(NITER) * N * H_OUT * W_OUT * G * R * S * 2; + (K * (N * (2 * sizeof(int32_t) * H_OUT * W_OUT + H * W) + R * S)); + double ops = double(NITER) * N * H_OUT * W_OUT * K * R * S * 2; chrono::time_point<chrono::system_clock> t_begin, t_end; for (int i = 0; i < NWARMUP + NITER; ++i) { llc_flush(); @@ -235,19 +254,20 @@ int main() { N, H, W, - G, + K, stride_h, stride_w, A_zero_point, A.data(), B_zero_point, Bp, - C_multiplier, + C_multiplier[0], C_zero_point, C_uint8.data(), col_offsets.data(), bias.data(), false, /* fuse_relu */ + 1.0f, /* act_scale * w_scale */ tid, num_threads); } @@ -262,10 +282,10 @@ int main() { for (int n = 0; n < N; ++n) { for (int h = 0; h < H_OUT; ++h) { for (int w = 0; w < W_OUT; ++w) { - for (int g = 0; g < G; ++g) { + for (int g = 0; g < K; ++g) { uint8_t expected = - C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * G + g]; - uint8_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * G + g]; + C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * K + g]; + uint8_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * K + g]; if (expected != actual) { cerr << "Depthwise 3x3 results differ at (" << n << ", " << h << ", " << w << ", " << g << "). expected " << (int)expected @@ -280,9 +300,9 @@ int main() { // Report performance printf( - "N = %d G = %d H = %d W = %d stride = %d with requantization fused\n", + "N = %d K = %d H = %d W = %d stride = %d with requantization fused\n", N, - G, + K, H, W, stride_h); diff --git a/bench/GEMMsBenchmark.cc b/bench/GEMMsBenchmark.cc index b404d8b..f493a96 100644 --- a/bench/GEMMsBenchmark.cc +++ b/bench/GEMMsBenchmark.cc @@ -28,6 +28,7 @@ using namespace std; using namespace fbgemm; void performance_test() { + // clang-format off static const vector<vector<int>> shapes = { // NOTE: clang-format wants to use a different formatting but the current // formatting should be easier to read. @@ -39,6 +40,7 @@ void performance_test() { {256, 512, 256}, {1024, 1024, 1024}, }; + // clang-format on bool flush = true; std::vector<char> llc; diff --git a/bench/GEMMsTunableBenchmark.cc b/bench/GEMMsTunableBenchmark.cc index a65b51f..2adc556 100644 --- a/bench/GEMMsTunableBenchmark.cc +++ b/bench/GEMMsTunableBenchmark.cc @@ -218,7 +218,8 @@ int main(int /* unused */, char** /* unused */) { } #endif - vector<vector<int>> shapes = { + // clang-format off + vector<vector<int>> shapes = { // NOTE: clang-format wants to use a different formatting but the current // formatting should be easier to read. // m, n, k @@ -266,7 +267,8 @@ int main(int /* unused */, char** /* unused */) { {128, 128, 128}, {256, 512, 256}, {1024, 1024, 1024}, -}; + }; + // clang-format on vector<int> MCBs; vector<int> NCBs; diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc index 66ca67e..dcea65c 100644 --- a/bench/PackedFloatInOutBenchmark.cc +++ b/bench/PackedFloatInOutBenchmark.cc @@ -28,6 +28,7 @@ using namespace std; using namespace fbgemm; void performance_test() { + // clang-format off vector<vector<int>> shapes = { // NOTE: clang-format wants to use a different formatting but the current // formatting should be easier to read. @@ -66,6 +67,7 @@ void performance_test() { {1, 128, 2722}, {16, 256, 512}, }; + // clang-format on bool flush = true; std::vector<char> llc; diff --git a/bench/PackedRequantizeAcc16Benchmark.cc b/bench/PackedRequantizeAcc16Benchmark.cc index 40ff662..c6e2869 100644 --- a/bench/PackedRequantizeAcc16Benchmark.cc +++ b/bench/PackedRequantizeAcc16Benchmark.cc @@ -37,6 +37,7 @@ enum class BenchmarkType { }; void performance_test() { + // clang-format off vector<vector<int>> shapes = { // NOTE: clang-format wants to use a different formatting but the current // formatting should be easier to read. @@ -67,6 +68,7 @@ void performance_test() { {392, 2048, 512}, {392, 512, 2048}, }; + // clang-format on bool flush = true; std::vector<char> llc; diff --git a/bench/PackedRequantizeAcc32Benchmark.cc b/bench/PackedRequantizeAcc32Benchmark.cc index 2f04795..a61ef5a 100644 --- a/bench/PackedRequantizeAcc32Benchmark.cc +++ b/bench/PackedRequantizeAcc32Benchmark.cc @@ -28,6 +28,7 @@ using namespace std; using namespace fbgemm; void performance_test() { + // clang-format off vector<vector<int>> shapes = { // NOTE: clang-format wants to use a different formatting but the current // formatting should be easier to read. @@ -70,6 +71,7 @@ void performance_test() { {1, 128, 2722}, {16, 256, 512}, }; + // clang-format on bool flush = true; std::vector<char> llc; |