github.com/marian-nmt/FBGEMM.git
author     Jongsoo Park <jongsoo@fb.com>    2019-09-04 22:04:31 +0300
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>    2019-09-04 22:06:43 +0300
commit     624d098f6701a951d02f0ad651ddba106442eb7c (patch)
tree       67249705eaea0b8931051d4b5e199dafc74ceb66 /bench
parent     ab2d5278073fce692b4f3b95d164f8cb3bbb1f72 (diff)
remove dw conv refs and use conv_ref instead (#122)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/122

This prepares for depth-wise convolutions other than 3x3. The existing reference depth-wise convolution is limited to 3x3, so we reuse the conv_ref implementation instead for easier maintenance.

Reviewed By: dskhudia

Differential Revision: D17176591

fbshipit-source-id: 9f6f90a801a0ad95091f1d085e66861f86c3a8f1
Diffstat (limited to 'bench')
-rw-r--r--  bench/Depthwise3DBenchmark.cc  130
-rw-r--r--  bench/DepthwiseBenchmark.cc    125
2 files changed, 155 insertions, 100 deletions
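Both files below drop the fused depthwise_3x3_pad_1_ref / depthwise_3x3x3_pad_1_ref references in favor of the same three-step reference path: the generic conv_ref for the int32 output, im2col_ref once over the activations, then per-group row offsets and requantization to uint8. The following C++ sketch assembles that pattern from the 2D hunks below; the wrapper function reference_depthwise_2d, the include list, and the using-directive are illustrative assumptions, while the individual calls and their argument order come straight from the diff.

// A minimal sketch (2D case) of the reference path both benchmarks switch to
// in this commit. The wrapper function and includes are illustrative; the
// individual calls mirror the hunks in DepthwiseBenchmark.cc below.
#include <cstdint>
#include <vector>

#include "fbgemm/Utils.h"            // conv_param_t
#include "src/RefImplementations.h"  // conv_ref, im2col_ref, *_u8acc32_ref

using namespace fbgemm;  // conv_param_t and the *_ref helpers live here

void reference_depthwise_2d(
    int N, int K, int H, int W, int stride_h, int stride_w,
    const std::vector<std::uint8_t>& A, std::int32_t A_zero_point,
    const std::vector<std::int8_t>& B, std::int32_t B_zero_point,
    float C_multiplier, std::int32_t C_zero_point,
    const std::vector<std::int32_t>& col_offsets,
    const std::vector<std::int32_t>& bias,
    std::vector<std::uint8_t>& C_uint8_ref) {
  constexpr int R = 3, S = 3;
  int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2,
      PAD_R = (S - 1) / 2;

  // Depth-wise conv expressed as a grouped conv: G == K, one input and one
  // output channel per group.
  conv_param_t<2> conv_p(
      N, K, K, {H, W}, K, {R, S}, {stride_h, stride_w},
      {PAD_T, PAD_L, PAD_B, PAD_R});
  int MDim = N * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1];
  int KDim = R * S * K;
  int KDimPerGroup = KDim / conv_p.G;

  // 1) Generic int32 reference convolution (replaces the fused 3x3 reference).
  std::vector<std::int32_t> C_ref(MDim * K);
  conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data());

  // 2) im2col once so row offsets can be taken per group.
  std::vector<std::uint8_t> A_im2col(MDim * KDim);
  im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data());

  // 3) Per-group row offsets, then requantization into uint8.
  std::vector<std::int32_t> row_offsets(MDim);
  for (int g = 0; g < conv_p.G; ++g) {
    // Row offsets over this group's slice of the im2col buffer.
    row_offsets_u8acc32_ref(
        MDim, KDimPerGroup, KDim,
        A_im2col.data() + g * KDimPerGroup, row_offsets.data());
    // Requantize this group's column of the int32 output into uint8
    // (single per-tensor scale, passed by pointer as in the diff).
    requantize_u8acc32_ref(
        MDim, 1, conv_p.G,
        C_ref.data() + g, C_uint8_ref.data() + g,
        &C_multiplier, C_zero_point, A_zero_point, &B_zero_point,
        row_offsets.data(), col_offsets.data() + g, bias.data() + g, K);
  }
}

Compared with the old fused reference this costs an extra im2col pass and a row-offset sweep, but it generalizes to kernel sizes other than 3x3, which is the stated motivation for the change.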
diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc
index 0efdcac..cd31524 100644
--- a/bench/Depthwise3DBenchmark.cc
+++ b/bench/Depthwise3DBenchmark.cc
@@ -4,7 +4,6 @@
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
-#include "test/I8DepthwiseTest.h"
#include <algorithm>
#include <chrono>
@@ -19,8 +18,8 @@
#include "AlignedVec.h"
#include "BenchUtils.h"
-#include "fbgemm/Utils.h"
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"
+#include "fbgemm/Utils.h"
#include "src/RefImplementations.h"
using namespace std;
@@ -35,6 +34,34 @@ int main() {
}
#endif
+ // From ResNeXt-3D-101
+ // clang-format off
+ vector<vector<int>> shapes_3d = {
+ // NOTE: clang-format wants to use a different formatting but the current
+ // formatting should be easier to read.
+ // N, K, T_in, H_in, W_in, stride
+ { 1, 64, 32, 56, 56, 1, },
+ { 1, 128, 16, 28, 28, 1, },
+ { 1, 256, 8, 14, 14, 1, },
+ { 1, 512, 4, 7, 7, 1, },
+
+ { 1, 128, 32, 56, 56, 2, },
+ { 1, 256, 16, 28, 28, 2, },
+ { 1, 512, 8, 14, 14, 2, },
+
+ { 5, 64, 32, 56, 56, 1, },
+ { 5, 128, 16, 28, 28, 1, },
+ { 5, 256, 8, 14, 14, 1, },
+ { 5, 512, 4, 7, 7, 1, },
+
+ { 5, 128, 32, 56, 56, 2, },
+ { 5, 256, 16, 28, 28, 2, },
+ { 5, 512, 8, 14, 14, 2, },
+
+ { 1, 8, 4, 4, 4, 1, },
+ };
+ // clang-format on
+
// Depthwise is memory BW bound so we want to flush LLC.
bool flush = true;
std::vector<char> llc;
@@ -61,14 +88,28 @@ int main() {
constexpr int K_T = 3, K_H = 3, K_W = 3;
constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1,
PAD_R = 1;
- int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1;
- int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1;
- int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1;
+
+ conv_param_t<3> conv_p(
+ N,
+ K,
+ K,
+ {T, H, W},
+ K,
+ {K_T, K_H, K_W},
+ {stride_t, stride_h, stride_w},
+ {PAD_P, PAD_T, PAD_L, PAD_N, PAD_B, PAD_R});
+ int T_OUT = conv_p.OUT_DIM[0];
+ int H_OUT = conv_p.OUT_DIM[1];
+ int W_OUT = conv_p.OUT_DIM[2];
+
+ int MDim = N * T_OUT * H_OUT * W_OUT;
+ int KDim = K_T * K_H * K_W * K;
+ int KDimPerGroup = KDim / conv_p.G;
aligned_vector<uint8_t> A(N * T * H * W * K);
- aligned_vector<int8_t> B(K * K_T * K_H * K_W);
- aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K),
- C(C_ref.size());
+ aligned_vector<int8_t> B(KDim);
+ aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size());
+ aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());
randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
@@ -76,50 +117,47 @@ int main() {
randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;
- depthwise_3x3x3_pad_1_ref(
- N,
- T,
- H,
- W,
- K,
- stride_t,
- stride_h,
- stride_w,
- A_zero_point,
- A.data(),
- B.data(),
- C_ref.data());
-
- int32_t minimum = *min_element(C_ref.begin(), C_ref.end());
- int32_t maximum = *max_element(C_ref.begin(), C_ref.end());
+ aligned_vector<float> C_multiplier(1);
+ randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
+ int32_t C_zero_point = 5;
- float C_multiplier = 255. / (maximum - minimum);
+ vector<int32_t> row_offsets(MDim);
+ // im2col to compute row offset later
+ vector<uint8_t> A_im2col(MDim * KDim);
+ im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data());
aligned_vector<int32_t> col_offsets(K);
aligned_vector<int32_t> bias(K);
randFill(col_offsets, -100, 100);
randFill(bias, -40, 40);
- int32_t C_zero_point = 5;
- aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());
- depthwise_3x3x3_pad_1_ref(
- N,
- T,
- H,
- W,
- K,
- stride_t,
- stride_h,
- stride_w,
- A_zero_point,
- A.data(),
- B_zero_point,
- B.data(),
- C_multiplier,
- C_zero_point,
- C_uint8_ref.data(),
- col_offsets.data(),
- bias.data());
+ conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data());
+
+ for (int g = 0; g < conv_p.G; ++g) {
+ // Compute row offset
+ row_offsets_u8acc32_ref(
+ MDim,
+ KDimPerGroup,
+ KDim,
+ A_im2col.data() + g * KDimPerGroup,
+ row_offsets.data());
+
+ // Requantization
+ requantize_u8acc32_ref(
+ MDim,
+ 1,
+ conv_p.G,
+ C_ref.data() + g,
+ C_uint8_ref.data() + g,
+ C_multiplier.data(),
+ C_zero_point,
+ A_zero_point,
+ &B_zero_point,
+ row_offsets.data(),
+ col_offsets.data() + g,
+ bias.data() + g,
+ K);
+ }
Packed3x3x3ConvMatrix Bp(K, B.data());
@@ -153,7 +191,7 @@ int main() {
A.data(),
B_zero_point,
Bp,
- C_multiplier,
+ C_multiplier[0],
C_zero_point,
C_uint8.data(),
col_offsets.data(),
diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc
index e882890..4f36e6c 100644
--- a/bench/DepthwiseBenchmark.cc
+++ b/bench/DepthwiseBenchmark.cc
@@ -17,8 +17,8 @@
#include "AlignedVec.h"
#include "BenchUtils.h"
-#include "fbgemm/Utils.h"
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"
+#include "fbgemm/Utils.h"
#include "src/RefImplementations.h"
using namespace std;
@@ -38,7 +38,7 @@ int main() {
vector<vector<int>> shapes = {
// NOTE: clang-format wants to use a different formatting but the current
// formatting should be easier to read.
- // N, G, H_in, W_in, stride
+ // N, K, H_in, W_in, stride
{ 1, 272, 47, 125, 1, },
{ 1, 272, 64, 125, 1, },
{ 1, 272, 66, 125, 1, },
@@ -157,19 +157,35 @@ int main() {
for (auto shape : shapes) {
int N = shape[0];
- int G = shape[1];
+ int K = shape[1];
int H = shape[2];
int W = shape[3];
int stride_h = shape[4];
int stride_w = stride_h;
constexpr int R = 3, S = 3;
- constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1;
- int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1;
- int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1;
+ int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2,
+ PAD_R = (S - 1) / 2;
+
+ conv_param_t<2> conv_p(
+ N,
+ K,
+ K,
+ {H, W},
+ K,
+ {R, S},
+ {stride_h, stride_w},
+ {PAD_T, PAD_L, PAD_B, PAD_R});
+ int H_OUT = conv_p.OUT_DIM[0];
+ int W_OUT = conv_p.OUT_DIM[1];
+
+ int MDim = N * H_OUT * W_OUT;
+ int KDim = R * S * K;
+ int KDimPerGroup = KDim / conv_p.G;
- aligned_vector<uint8_t> A(N * H * W * G);
- aligned_vector<int8_t> B(G * R * S);
- aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * G), C(C_ref.size());
+ aligned_vector<uint8_t> A(N * H * W * K);
+ aligned_vector<int8_t> B(KDim);
+ aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size());
+ aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());
randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
@@ -177,53 +193,54 @@ int main() {
randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;
- depthwise_3x3_pad_1_ref(
- N,
- H,
- W,
- G,
- stride_h,
- stride_w,
- A_zero_point,
- A.data(),
- B.data(),
- C_ref.data());
-
- int32_t minimum = *min_element(C_ref.begin(), C_ref.end());
- int32_t maximum = *max_element(C_ref.begin(), C_ref.end());
+ aligned_vector<float> C_multiplier(1);
+ randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
+ int32_t C_zero_point = 5;
- float C_multiplier = 255. / (maximum - minimum);
+ vector<int32_t> row_offsets(MDim);
+ // im2col to compute row offset later
+ vector<uint8_t> A_im2col(MDim * KDim);
+ im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data());
- aligned_vector<int32_t> col_offsets(G);
- aligned_vector<int32_t> bias(G);
+ aligned_vector<int32_t> col_offsets(K);
+ aligned_vector<int32_t> bias(K);
randFill(col_offsets, -100, 100);
randFill(bias, -40, 40);
- int32_t C_zero_point = 5;
- aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size());
- depthwise_3x3_pad_1_ref(
- N,
- H,
- W,
- G,
- stride_h,
- stride_w,
- A_zero_point,
- A.data(),
- B_zero_point,
- B.data(),
- C_multiplier,
- C_zero_point,
- C_uint8_ref.data(),
- col_offsets.data(),
- bias.data());
+ conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data());
+
+ for (int g = 0; g < conv_p.G; ++g) {
+ // Compute row offset
+ row_offsets_u8acc32_ref(
+ MDim,
+ KDimPerGroup,
+ KDim,
+ A_im2col.data() + g * KDimPerGroup,
+ row_offsets.data());
+
+ // Requantization
+ requantize_u8acc32_ref(
+ MDim,
+ 1,
+ conv_p.G,
+ C_ref.data() + g,
+ C_uint8_ref.data() + g,
+ C_multiplier.data(),
+ C_zero_point,
+ A_zero_point,
+ &B_zero_point,
+ row_offsets.data(),
+ col_offsets.data() + g,
+ bias.data() + g,
+ K);
+ }
- Packed3x3ConvMatrix Bp(G, B.data());
+ Packed3x3ConvMatrix Bp(K, B.data());
double ttot = 0;
double bytes = double(NITER) *
- (G * (N * (2 * sizeof(int32_t) * H_OUT * W_OUT + H * W) + R * S));
- double ops = double(NITER) * N * H_OUT * W_OUT * G * R * S * 2;
+ (K * (N * (2 * sizeof(int32_t) * H_OUT * W_OUT + H * W) + R * S));
+ double ops = double(NITER) * N * H_OUT * W_OUT * K * R * S * 2;
chrono::time_point<chrono::system_clock> t_begin, t_end;
for (int i = 0; i < NWARMUP + NITER; ++i) {
llc_flush();
@@ -237,14 +254,14 @@ int main() {
N,
H,
W,
- G,
+ K,
stride_h,
stride_w,
A_zero_point,
A.data(),
B_zero_point,
Bp,
- C_multiplier,
+ C_multiplier[0],
C_zero_point,
C_uint8.data(),
col_offsets.data(),
@@ -264,10 +281,10 @@ int main() {
for (int n = 0; n < N; ++n) {
for (int h = 0; h < H_OUT; ++h) {
for (int w = 0; w < W_OUT; ++w) {
- for (int g = 0; g < G; ++g) {
+ for (int g = 0; g < K; ++g) {
uint8_t expected =
- C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * G + g];
- uint8_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * G + g];
+ C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * K + g];
+ uint8_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * K + g];
if (expected != actual) {
cerr << "Depthwise 3x3 results differ at (" << n << ", " << h
<< ", " << w << ", " << g << "). expected " << (int)expected
@@ -282,9 +299,9 @@ int main() {
// Report performance
printf(
- "N = %d G = %d H = %d W = %d stride = %d with requantization fused\n",
+ "N = %d K = %d H = %d W = %d stride = %d with requantization fused\n",
N,
- G,
+ K,
H,
W,
stride_h);
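Both benchmarks also stop hand-computing the output spatial sizes and read conv_p.OUT_DIM instead. A small self-contained check of the equivalence, using the first shape from the 2D table above; the assert-style packaging is illustrative, and it assumes conv_param_t's default dilation of 1.

#include <cassert>
#include "fbgemm/Utils.h"  // conv_param_t

using namespace fbgemm;

int main() {
  // First shape from the 2D table above: N, K, H_in, W_in, stride.
  int N = 1, K = 272, H = 47, W = 125, stride_h = 1, stride_w = 1;
  constexpr int R = 3, S = 3;
  int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2,
      PAD_R = (S - 1) / 2;

  conv_param_t<2> conv_p(
      N, K, K, {H, W}, K, {R, S}, {stride_h, stride_w},
      {PAD_T, PAD_L, PAD_B, PAD_R});

  // conv_param_t folds in the closed form the benchmark used to inline:
  //   OUT = (IN + pad_before + pad_after - kernel) / stride + 1
  assert(conv_p.OUT_DIM[0] == (H + PAD_T + PAD_B - R) / stride_h + 1);  // 47
  assert(conv_p.OUT_DIM[1] == (W + PAD_L + PAD_R - S) / stride_w + 1);  // 125
  return 0;
}

The same identity holds for the 3D benchmark with {T, H, W} and the additional front/back pads PAD_P and PAD_N.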