diff options
author | Jongsoo Park <jongsoo@fb.com> | 2019-09-04 22:04:31 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-09-04 22:06:43 +0300 |
commit | 624d098f6701a951d02f0ad651ddba106442eb7c (patch) | |
tree | 67249705eaea0b8931051d4b5e199dafc74ceb66 | |
parent | ab2d5278073fce692b4f3b95d164f8cb3bbb1f72 (diff) |
remove dw conv refs and use conv_ref instead (#122)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/122
To prepare for depth-wise convolutions other than 3x3.
The existing reference depth-wise convolution is limited to 3x3, so we should reuse the conv_ref implementation for easier maintenance.
Reviewed By: dskhudia
Differential Revision: D17176591
fbshipit-source-id: 9f6f90a801a0ad95091f1d085e66861f86c3a8f1
-rw-r--r-- | bench/Depthwise3DBenchmark.cc | 130 | ||||
-rw-r--r-- | bench/DepthwiseBenchmark.cc | 125 | ||||
-rw-r--r-- | include/fbgemm/FbgemmI8DepthwiseAvx2.h | 43 | ||||
-rw-r--r-- | src/RefImplementations.cc | 400 | ||||
-rw-r--r-- | src/RefImplementations.h | 120 | ||||
-rw-r--r-- | test/I8DepthwiseTest.cc | 426 | ||||
-rw-r--r-- | test/I8DepthwiseTest.h | 41 |
7 files changed, 403 insertions, 882 deletions
diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc index 0efdcac..cd31524 100644 --- a/bench/Depthwise3DBenchmark.cc +++ b/bench/Depthwise3DBenchmark.cc @@ -4,7 +4,6 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ -#include "test/I8DepthwiseTest.h" #include <algorithm> #include <chrono> @@ -19,8 +18,8 @@ #include "AlignedVec.h" #include "BenchUtils.h" -#include "fbgemm/Utils.h" #include "fbgemm/FbgemmI8DepthwiseAvx2.h" +#include "fbgemm/Utils.h" #include "src/RefImplementations.h" using namespace std; @@ -35,6 +34,34 @@ int main() { } #endif + // From ResNeXt-3D-101 + // clang-format off + vector<vector<int>> shapes_3d = { + // NOTE: clang-format wants to use a different formatting but the current + // formatting should be easier to read. + // N, K, T_in, H_in, W_in, stride + { 1, 64, 32, 56, 56, 1, }, + { 1, 128, 16, 28, 28, 1, }, + { 1, 256, 8, 14, 14, 1, }, + { 1, 512, 4, 7, 7, 1, }, + + { 1, 128, 32, 56, 56, 2, }, + { 1, 256, 16, 28, 28, 2, }, + { 1, 512, 8, 14, 14, 2, }, + + { 5, 64, 32, 56, 56, 1, }, + { 5, 128, 16, 28, 28, 1, }, + { 5, 256, 8, 14, 14, 1, }, + { 5, 512, 4, 7, 7, 1, }, + + { 5, 128, 32, 56, 56, 2, }, + { 5, 256, 16, 28, 28, 2, }, + { 5, 512, 8, 14, 14, 2, }, + + { 1, 8, 4, 4, 4, 1, }, + }; + // clang-format on + // Depthwise is memory BW bound so we want to flush LLC. 
bool flush = true; std::vector<char> llc; @@ -61,14 +88,28 @@ int main() { constexpr int K_T = 3, K_H = 3, K_W = 3; constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1; - int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; + + conv_param_t<3> conv_p( + N, + K, + K, + {T, H, W}, + K, + {K_T, K_H, K_W}, + {stride_t, stride_h, stride_w}, + {PAD_P, PAD_T, PAD_L, PAD_N, PAD_B, PAD_R}); + int T_OUT = conv_p.OUT_DIM[0]; + int H_OUT = conv_p.OUT_DIM[1]; + int W_OUT = conv_p.OUT_DIM[2]; + + int MDim = N * T_OUT * H_OUT * W_OUT; + int KDim = K_T * K_H * K_W * K; + int KDimPerGroup = KDim / conv_p.G; aligned_vector<uint8_t> A(N * T * H * W * K); - aligned_vector<int8_t> B(K * K_T * K_H * K_W); - aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K), - C(C_ref.size()); + aligned_vector<int8_t> B(KDim); + aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size()); + aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; @@ -76,50 +117,47 @@ int main() { randFill<int8_t>(B, -16, 16); int32_t B_zero_point = 5; - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A.data(), - B.data(), - C_ref.data()); - - int32_t minimum = *min_element(C_ref.begin(), C_ref.end()); - int32_t maximum = *max_element(C_ref.begin(), C_ref.end()); + aligned_vector<float> C_multiplier(1); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); + int32_t C_zero_point = 5; - float C_multiplier = 255. 
/ (maximum - minimum); + vector<int32_t> row_offsets(MDim); + // im2col to compute row offset later + vector<uint8_t> A_im2col(MDim * KDim); + im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data()); aligned_vector<int32_t> col_offsets(K); aligned_vector<int32_t> bias(K); randFill(col_offsets, -100, 100); randFill(bias, -40, 40); - int32_t C_zero_point = 5; - aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A.data(), - B_zero_point, - B.data(), - C_multiplier, - C_zero_point, - C_uint8_ref.data(), - col_offsets.data(), - bias.data()); + conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data()); + + for (int g = 0; g < conv_p.G; ++g) { + // Compute row offset + row_offsets_u8acc32_ref( + MDim, + KDimPerGroup, + KDim, + A_im2col.data() + g * KDimPerGroup, + row_offsets.data()); + + // Requantization + requantize_u8acc32_ref( + MDim, + 1, + conv_p.G, + C_ref.data() + g, + C_uint8_ref.data() + g, + C_multiplier.data(), + C_zero_point, + A_zero_point, + &B_zero_point, + row_offsets.data(), + col_offsets.data() + g, + bias.data() + g, + K); + } Packed3x3x3ConvMatrix Bp(K, B.data()); @@ -153,7 +191,7 @@ int main() { A.data(), B_zero_point, Bp, - C_multiplier, + C_multiplier[0], C_zero_point, C_uint8.data(), col_offsets.data(), diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc index e882890..4f36e6c 100644 --- a/bench/DepthwiseBenchmark.cc +++ b/bench/DepthwiseBenchmark.cc @@ -17,8 +17,8 @@ #include "AlignedVec.h" #include "BenchUtils.h" -#include "fbgemm/Utils.h" #include "fbgemm/FbgemmI8DepthwiseAvx2.h" +#include "fbgemm/Utils.h" #include "src/RefImplementations.h" using namespace std; @@ -38,7 +38,7 @@ int main() { vector<vector<int>> shapes = { // NOTE: clang-format wants to use a different formatting but the current // formatting should be easier to read. 
- // N, G, H_in, W_in, stride + // N, K, H_in, W_in, stride { 1, 272, 47, 125, 1, }, { 1, 272, 64, 125, 1, }, { 1, 272, 66, 125, 1, }, @@ -157,19 +157,35 @@ int main() { for (auto shape : shapes) { int N = shape[0]; - int G = shape[1]; + int K = shape[1]; int H = shape[2]; int W = shape[3]; int stride_h = shape[4]; int stride_w = stride_h; constexpr int R = 3, S = 3; - constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; + int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2, + PAD_R = (S - 1) / 2; + + conv_param_t<2> conv_p( + N, + K, + K, + {H, W}, + K, + {R, S}, + {stride_h, stride_w}, + {PAD_T, PAD_L, PAD_B, PAD_R}); + int H_OUT = conv_p.OUT_DIM[0]; + int W_OUT = conv_p.OUT_DIM[1]; + + int MDim = N * H_OUT * W_OUT; + int KDim = R * S * K; + int KDimPerGroup = KDim / conv_p.G; - aligned_vector<uint8_t> A(N * H * W * G); - aligned_vector<int8_t> B(G * R * S); - aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * G), C(C_ref.size()); + aligned_vector<uint8_t> A(N * H * W * K); + aligned_vector<int8_t> B(KDim); + aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size()); + aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; @@ -177,53 +193,54 @@ int main() { randFill<int8_t>(B, -16, 16); int32_t B_zero_point = 5; - depthwise_3x3_pad_1_ref( - N, - H, - W, - G, - stride_h, - stride_w, - A_zero_point, - A.data(), - B.data(), - C_ref.data()); - - int32_t minimum = *min_element(C_ref.begin(), C_ref.end()); - int32_t maximum = *max_element(C_ref.begin(), C_ref.end()); + aligned_vector<float> C_multiplier(1); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); + int32_t C_zero_point = 5; - float C_multiplier = 255. 
/ (maximum - minimum); + vector<int32_t> row_offsets(MDim); + // im2col to compute row offset later + vector<uint8_t> A_im2col(MDim * KDim); + im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data()); - aligned_vector<int32_t> col_offsets(G); - aligned_vector<int32_t> bias(G); + aligned_vector<int32_t> col_offsets(K); + aligned_vector<int32_t> bias(K); randFill(col_offsets, -100, 100); randFill(bias, -40, 40); - int32_t C_zero_point = 5; - aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); - depthwise_3x3_pad_1_ref( - N, - H, - W, - G, - stride_h, - stride_w, - A_zero_point, - A.data(), - B_zero_point, - B.data(), - C_multiplier, - C_zero_point, - C_uint8_ref.data(), - col_offsets.data(), - bias.data()); + conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data()); + + for (int g = 0; g < conv_p.G; ++g) { + // Compute row offset + row_offsets_u8acc32_ref( + MDim, + KDimPerGroup, + KDim, + A_im2col.data() + g * KDimPerGroup, + row_offsets.data()); + + // Requantization + requantize_u8acc32_ref( + MDim, + 1, + conv_p.G, + C_ref.data() + g, + C_uint8_ref.data() + g, + C_multiplier.data(), + C_zero_point, + A_zero_point, + &B_zero_point, + row_offsets.data(), + col_offsets.data() + g, + bias.data() + g, + K); + } - Packed3x3ConvMatrix Bp(G, B.data()); + Packed3x3ConvMatrix Bp(K, B.data()); double ttot = 0; double bytes = double(NITER) * - (G * (N * (2 * sizeof(int32_t) * H_OUT * W_OUT + H * W) + R * S)); - double ops = double(NITER) * N * H_OUT * W_OUT * G * R * S * 2; + (K * (N * (2 * sizeof(int32_t) * H_OUT * W_OUT + H * W) + R * S)); + double ops = double(NITER) * N * H_OUT * W_OUT * K * R * S * 2; chrono::time_point<chrono::system_clock> t_begin, t_end; for (int i = 0; i < NWARMUP + NITER; ++i) { llc_flush(); @@ -237,14 +254,14 @@ int main() { N, H, W, - G, + K, stride_h, stride_w, A_zero_point, A.data(), B_zero_point, Bp, - C_multiplier, + C_multiplier[0], C_zero_point, C_uint8.data(), col_offsets.data(), @@ -264,10 +281,10 @@ int 
main() { for (int n = 0; n < N; ++n) { for (int h = 0; h < H_OUT; ++h) { for (int w = 0; w < W_OUT; ++w) { - for (int g = 0; g < G; ++g) { + for (int g = 0; g < K; ++g) { uint8_t expected = - C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * G + g]; - uint8_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * G + g]; + C_uint8_ref[((n * H_OUT + h) * W_OUT + w) * K + g]; + uint8_t actual = C_uint8[((n * H_OUT + h) * W_OUT + w) * K + g]; if (expected != actual) { cerr << "Depthwise 3x3 results differ at (" << n << ", " << h << ", " << w << ", " << g << "). expected " << (int)expected @@ -282,9 +299,9 @@ int main() { // Report performance printf( - "N = %d G = %d H = %d W = %d stride = %d with requantization fused\n", + "N = %d K = %d H = %d W = %d stride = %d with requantization fused\n", N, - G, + K, H, W, stride_h); diff --git a/include/fbgemm/FbgemmI8DepthwiseAvx2.h b/include/fbgemm/FbgemmI8DepthwiseAvx2.h index e7b0ec4..98c4ed7 100644 --- a/include/fbgemm/FbgemmI8DepthwiseAvx2.h +++ b/include/fbgemm/FbgemmI8DepthwiseAvx2.h @@ -51,27 +51,8 @@ using Packed10ConvMatrix = PackedDepthWiseConvMatrix<10>; using Packed11ConvMatrix = PackedDepthWiseConvMatrix<11>; /** - * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 - * @params A The input image in NHWK layout - * @params Bp The pre-packed filter - */ -FBGEMM_API void depthwise_3x3_pad_1( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - const Packed3x3ConvMatrix& Bp, - std::int32_t* C, - int thread_id = 0, - int num_threads = 1); - -/** - * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 - * This version is fused with requantization. + * Depth-wise 3x3 convolution with pad=1 and K a multiple of 8, fused with + * requantization. 
* * @col_offsets nullptr if col_offsets are folded into bias */ @@ -96,8 +77,8 @@ FBGEMM_API void depthwise_3x3_pad_1( int num_threads = 1); /** - * Depth-wise 3x3 convolution with pad=1 and stride=1 and K a multiple of 8 - * This version is fused with requantization and uses per-channel quantization. + * Depth-wise 3x3 convolution with pad=1 and K a multiple of 8, fused with + * requantization, and using per-channel quantization. * * @col_offsets nullptr if col_offsets are folded into bias */ @@ -121,22 +102,6 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1( int thread_id = 0, int num_threads = 1); -FBGEMM_API void depthwise_3x3x3_pad_1( - int N, - int T, - int H, - int W, - int K, - int stride_t, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - const Packed3x3x3ConvMatrix& Bp, - std::int32_t* C, - int thread_id = 0, - int num_threads = 1); - /** * @col_offsets nullptr if col_offsets are folded into bias */ diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc index da58eba..dc40d44 100644 --- a/src/RefImplementations.cc +++ b/src/RefImplementations.cc @@ -600,406 +600,6 @@ void transposeConvWeights( } } -void depthwise_3x3_pad_1_ref( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - int32_t A_zero_point, - const uint8_t* A, - const int8_t* B, - int32_t* C) { - constexpr int R = 3, S = 3; - constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; - - for (int n = 0; n < N; ++n) { - for (int h = 0; h < H_OUT; ++h) { - for (int w = 0; w < W_OUT; ++w) { - for (int k = 0; k < K; ++k) { - int sum = 0; - for (int r = 0; r < R; ++r) { - int h_in = -PAD_T + h * stride_h + r; - for (int s = 0; s < S; ++s) { - int w_in = -PAD_L + w * stride_w + s; - int a = h_in < 0 || h_in >= H || w_in < 0 || w_in >= W - ? 
A_zero_point - : A[((n * H + h_in) * W + w_in) * K + k]; - int b = B[(k * R + r) * S + s]; - sum += a * b; - } - } - C[((n * H_OUT + h) * W_OUT + w) * K + k] = sum; - } - } - } - } // for each n -}; - -void depthwise_3x3_pad_1_ref( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - int32_t A_zero_point, - const uint8_t* A, - int32_t B_zero_point, - const int8_t* B, - float C_multiplier, - int32_t C_zero_point, - uint8_t* C, - const int32_t* col_offsets, - const int32_t* bias) { - constexpr int R = 3, S = 3; - constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; - - vector<int32_t> C_int32(N * H_OUT * W_OUT * K); - depthwise_3x3_pad_1_ref( - N, H, W, K, stride_h, stride_w, A_zero_point, A, B, C_int32.data()); - - vector<int32_t> row_offsets(N * H_OUT * W_OUT * K); - for (int n = 0; n < N; ++n) { - for (int h = 0; h < H_OUT; ++h) { - for (int w = 0; w < W_OUT; ++w) { - for (int k = 0; k < K; ++k) { - int sum = 0; - for (int r = 0; r < R; ++r) { - int h_in = -PAD_T + h * stride_h + r; - for (int s = 0; s < S; ++s) { - int w_in = -PAD_L + w * stride_w + s; - int a = h_in < 0 || h_in >= H || w_in < 0 || w_in >= W - ? A_zero_point - : A[((n * H + h_in) * W + w_in) * K + k]; - sum += a; - } - } - row_offsets[((n * H_OUT + h) * W_OUT + w) * K + k] = sum; - } - } - } - } // for each n - - for (int i = 0; i < N * H_OUT * W_OUT; ++i) { - for (int k = 0; k < K; ++k) { - requantize_u8acc32_ref( - 1, - 1, - 1, - C_int32.data() + i * K + k, - C + i * K + k, - &C_multiplier, - C_zero_point, - A_zero_point, - &B_zero_point, - &row_offsets[i * K + k], - col_offsets + k, - bias ? 
bias + k : nullptr, - 1); - } - } -}; - -void depthwise_3x3_per_channel_quantization_pad_1_ref( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - int32_t A_zero_point, - const uint8_t* A, - const int32_t* B_zero_point, - const int8_t* B, - const float* C_multiplier, - int32_t C_zero_point, - uint8_t* C, - const int32_t* col_offsets, - const int32_t* bias) { - constexpr int R = 3, S = 3; - constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; - - vector<int32_t> C_int32(N * H_OUT * W_OUT * K); - depthwise_3x3_pad_1_ref( - N, H, W, K, stride_h, stride_w, A_zero_point, A, B, C_int32.data()); - - vector<int32_t> row_offsets(N * H_OUT * W_OUT * K); - for (int n = 0; n < N; ++n) { - for (int h = 0; h < H_OUT; ++h) { - for (int w = 0; w < W_OUT; ++w) { - for (int k = 0; k < K; ++k) { - int sum = 0; - for (int r = 0; r < R; ++r) { - int h_in = -PAD_T + h * stride_h + r; - for (int s = 0; s < S; ++s) { - int w_in = -PAD_L + w * stride_w + s; - int a = h_in < 0 || h_in >= H || w_in < 0 || w_in >= W - ? A_zero_point - : A[((n * H + h_in) * W + w_in) * K + k]; - sum += a; - } - } - row_offsets[((n * H_OUT + h) * W_OUT + w) * K + k] = sum; - } - } - } - } // for each n - - for (int i = 0; i < N * H_OUT * W_OUT; ++i) { - for (int k = 0; k < K; ++k) { - requantize_u8acc32_ref( - 1, - 1, - 1, - C_int32.data() + i * K + k, - C + i * K + k, - &C_multiplier[k], - C_zero_point, - A_zero_point, - &B_zero_point[k], - &row_offsets[i * K + k], - col_offsets + k, - bias ? 
bias + k : nullptr, - 1); - } - } -}; - -void depthwise_3x3x3_pad_1_ref( - int N, - int T, - int H, - int W, - int K, - int stride_t, - int stride_h, - int stride_w, - int32_t A_zero_point, - const uint8_t* A, - const int8_t* B, - int32_t* C) { - constexpr int K_T = 3, K_H = 3, K_W = 3; - constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, - PAD_R = 1; - int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1; - int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; - - for (int n = 0; n < N; ++n) { - for (int t = 0; t < T_OUT; ++t) { - for (int h = 0; h < H_OUT; ++h) { - for (int w = 0; w < W_OUT; ++w) { - for (int k = 0; k < K; ++k) { - int sum = 0; - for (int k_t = 0; k_t < K_T; ++k_t) { - int t_in = -PAD_P + t * stride_t + k_t; - for (int k_h = 0; k_h < K_H; ++k_h) { - int h_in = -PAD_T + h * stride_h + k_h; - for (int k_w = 0; k_w < K_W; ++k_w) { - int w_in = -PAD_L + w * stride_w + k_w; - int a = t_in < 0 || t_in >= T || h_in < 0 || h_in >= H || - w_in < 0 || w_in >= W - ? 
A_zero_point - : A[(((n * T + t_in) * H + h_in) * W + w_in) * K + k]; - int b = B[((k * K_T + k_t) * K_H + k_h) * K_W + k_w]; - sum += a * b; - } - } - } - C[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + k] = sum; - } - } // w - } // h - } // t - } // for each n -}; - -void depthwise_3x3x3_pad_1_ref( - int N, - int T, - int H, - int W, - int K, - int stride_t, - int stride_h, - int stride_w, - int32_t A_zero_point, - const uint8_t* A, - int32_t B_zero_point, - const int8_t* B, - float C_multiplier, - int32_t C_zero_point, - uint8_t* C, - const int32_t* col_offsets, - const int32_t* bias) { - constexpr int K_T = 3, K_H = 3, K_W = 3; - constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, - PAD_R = 1; - int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1; - int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; - - vector<int32_t> C_int32(N * T_OUT * H_OUT * W_OUT * K); - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A, - B, - C_int32.data()); - - vector<int32_t> row_offsets(N * T_OUT * H_OUT * W_OUT * K); - for (int n = 0; n < N; ++n) { - for (int t = 0; t < T_OUT; ++t) { - for (int h = 0; h < H_OUT; ++h) { - for (int w = 0; w < W_OUT; ++w) { - for (int k = 0; k < K; ++k) { - int sum = 0; - for (int k_t = 0; k_t < K_T; ++k_t) { - int t_in = -PAD_P + t * stride_t + k_t; - for (int k_h = 0; k_h < K_H; ++k_h) { - int h_in = -PAD_T + h * stride_h + k_h; - for (int k_w = 0; k_w < K_W; ++k_w) { - int w_in = -PAD_L + w * stride_w + k_w; - int a = t_in < 0 || t_in >= T || h_in < 0 || h_in >= H || - w_in < 0 || w_in >= W - ? 
A_zero_point - : A[(((n * T + t_in) * H + h_in) * W + w_in) * K + k]; - sum += a; - } - } - } - row_offsets[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + k] = - sum; - } - } // w - } // h - } // t - } // for each n - - for (int i = 0; i < N * T_OUT * H_OUT * W_OUT; ++i) { - for (int k = 0; k < K; ++k) { - requantize_u8acc32_ref( - 1, - 1, - 1, - C_int32.data() + i * K + k, - C + i * K + k, - &C_multiplier, - C_zero_point, - A_zero_point, - &B_zero_point, - &row_offsets[i * K + k], - col_offsets + k, - bias ? bias + k : nullptr, - 1); - } - } -}; - -void depthwise_3x3x3_per_channel_quantization_pad_1_ref( - int N, - int T, - int H, - int W, - int K, - int stride_t, - int stride_h, - int stride_w, - int32_t A_zero_point, - const uint8_t* A, - const int32_t* B_zero_point, - const int8_t* B, - const float* C_multiplier, - int32_t C_zero_point, - uint8_t* C, - const int32_t* col_offsets, - const int32_t* bias) { - constexpr int K_T = 3, K_H = 3, K_W = 3; - constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, - PAD_R = 1; - int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1; - int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; - - vector<int32_t> C_int32(N * T_OUT * H_OUT * W_OUT * K); - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A, - B, - C_int32.data()); - - vector<int32_t> row_offsets(N * T_OUT * H_OUT * W_OUT * K); - for (int n = 0; n < N; ++n) { - for (int t = 0; t < T_OUT; ++t) { - for (int h = 0; h < H_OUT; ++h) { - for (int w = 0; w < W_OUT; ++w) { - for (int k = 0; k < K; ++k) { - int sum = 0; - for (int k_t = 0; k_t < K_T; ++k_t) { - int t_in = -PAD_P + t * stride_t + k_t; - for (int k_h = 0; k_h < K_H; ++k_h) { - int h_in = -PAD_T + h * stride_h + k_h; - for (int k_w = 0; k_w < K_W; ++k_w) { - int w_in = -PAD_L + w * stride_w + k_w; - int a = t_in < 0 || t_in >= T || h_in < 0 || h_in >= H || - w_in < 0 || w_in >= W - ? 
A_zero_point - : A[(((n * T + t_in) * H + h_in) * W + w_in) * K + k]; - sum += a; - } - } - } - row_offsets[(((n * T_OUT + t) * H_OUT + h) * W_OUT + w) * K + k] = - sum; - } - } // w - } // h - } // t - } // for each n - - for (int i = 0; i < N * T_OUT * H_OUT * W_OUT; ++i) { - for (int k = 0; k < K; ++k) { - requantize_u8acc32_ref( - 1, - 1, - 1, - C_int32.data() + i * K + k, - C + i * K + k, - &C_multiplier[k], - C_zero_point, - A_zero_point, - &B_zero_point[k], - &row_offsets[i * K + k], - col_offsets + k, - bias ? bias + k : nullptr, - 1); - } - } -}; - template void transposeConvWeights( const conv_param_t<2>& conv_p, const std::int8_t* src, diff --git a/src/RefImplementations.h b/src/RefImplementations.h index 082bdf1..a20e348 100644 --- a/src/RefImplementations.h +++ b/src/RefImplementations.h @@ -215,124 +215,4 @@ FBGEMM_API void im2col_ref( std::int32_t A_zero_point, std::uint8_t* Ao); -/* - * @brief Reference implementation of depthwise convolution with a 3x3 filter - * and padding size 1. - */ -FBGEMM_API void depthwise_3x3_pad_1_ref( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - const std::int8_t* B, - std::int32_t* C); - -/* - * @brief Reference implementation of depthwise convolution with a 3x3 filter - * and padding size 1, followed by requantization. (the same scaling factors and - * zero points for each channel). - */ -FBGEMM_API void depthwise_3x3_pad_1_ref( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - std::int32_t B_zero_point, - const std::int8_t* B, - float C_multiplier, - std::int32_t C_zero_point, - std::uint8_t* C, - const std::int32_t* col_offsets, - const std::int32_t* bias); - -/* - * @brief Reference implementation of depthwise convolution with a 3x3 filter - * and padding size 1, followed by requantization. (different scaling factors - * and zero points for each channel). 
- */ -FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1_ref( - int N, - int H, - int W, - int K, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - const std::int32_t* B_zero_point, - const std::int8_t* B, - const float* C_multiplier, - std::int32_t C_zero_point, - std::uint8_t* C, - const std::int32_t* col_offsets, - const std::int32_t* bias); - -/* - * @brief Reference implementation of 3D depthwise convolution with a 3x3x3 - * filter and padding size 1. - */ -FBGEMM_API void depthwise_3x3x3_pad_1_ref( - int N, - int T, - int H, - int W, - int K, - int stride_t, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - const std::int8_t* B, - std::int32_t* C); - -/* - * @brief Reference implementation of 3D depthwise convolution with a 3x3x3 - * filter and padding size 1, followed by requantization. - */ -FBGEMM_API void depthwise_3x3x3_pad_1_ref( - int N, - int T, - int H, - int W, - int K, - int stride_t, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - std::int32_t B_zero_point, - const std::int8_t* B, - float C_multiplier, - std::int32_t C_zero_point, - std::uint8_t* C, - const std::int32_t* col_offsets, - const std::int32_t* bias); - -FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1_ref( - int N, - int T, - int H, - int W, - int K, - int stride_t, - int stride_h, - int stride_w, - std::int32_t A_zero_point, - const std::uint8_t* A, - const std::int32_t* B_zero_point, - const std::int8_t* B, - const float* C_multiplier, - std::int32_t C_zero_point, - std::uint8_t* C, - const std::int32_t* col_offsets, - const std::int32_t* bias); - } // namespace fbgemm diff --git a/test/I8DepthwiseTest.cc b/test/I8DepthwiseTest.cc index ad74ede..00c39c0 100644 --- a/test/I8DepthwiseTest.cc +++ b/test/I8DepthwiseTest.cc @@ -4,7 +4,6 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of 
this source tree. */ -#include "I8DepthwiseTest.h" #include <cmath> #include <cstdio> @@ -68,6 +67,16 @@ static vector<vector<int>> shapes = { { 1, 8, 4, 4, 1, }, }; + +static vector<vector<int>> shapes_3d = { + // NOTE: clang-format wants to use a different formatting but the current + // formatting should be easier to read. + // N, K, T_in, H_in, W_in, stride + { 1, 32, 16, 28, 28, 1, }, + { 1, 128, 8, 14, 14, 2, }, + { 5, 16, 32, 56, 56, 1, }, + { 1, 8, 4, 4, 4, 1, }, +}; // clang-format on namespace { @@ -107,13 +116,29 @@ TEST_P(FBGemmDepthWiseTest, Test3x3) { int stride_h = shape[4]; int stride_w = stride_h; constexpr int R = 3, S = 3; - constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; + int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2, + PAD_R = (S - 1) / 2; + + conv_param_t<2> conv_p( + N, + K, + K, + {H, W}, + K, + {R, S}, + {stride_h, stride_w}, + {PAD_T, PAD_L, PAD_B, PAD_R}); + int H_OUT = conv_p.OUT_DIM[0]; + int W_OUT = conv_p.OUT_DIM[1]; + + int MDim = N * H_OUT * W_OUT; + int KDim = R * S * K; + int KDimPerGroup = KDim / conv_p.G; aligned_vector<uint8_t> A(N * H * W * K); - aligned_vector<int8_t> B(K * R * S); - aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * K), C(C_ref.size()); + aligned_vector<int8_t> B(KDim); + aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size()); + aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = a_symmetric ? 0 : 43; @@ -121,46 +146,52 @@ TEST_P(FBGemmDepthWiseTest, Test3x3) { randFill<int8_t>(B, -16, 16); int32_t B_zero_point = b_symmetric ? 
0 : 5; - depthwise_3x3_pad_1_ref( - N, - H, - W, - K, - stride_h, - stride_w, - A_zero_point, - A.data(), - B.data(), - C_ref.data()); - - int32_t minimum = *min_element(C_ref.begin(), C_ref.end()); - int32_t maximum = *max_element(C_ref.begin(), C_ref.end()); - - float C_multiplier = 255. / (maximum - minimum); + aligned_vector<float> C_multiplier(1); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); + int32_t C_zero_point = 5; aligned_vector<int32_t> col_offsets(K); aligned_vector<int32_t> bias(K); randFill(col_offsets, -100, 100); randFill(bias, -40, 40); - int32_t C_zero_point = 5; - aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); - depthwise_3x3_pad_1_ref( - N, - H, - W, - K, - stride_h, - stride_w, - A_zero_point, - A.data(), - B_zero_point, - B.data(), - C_multiplier, - C_zero_point, - C_uint8_ref.data(), - col_offsets.data(), - bias.data()); + vector<int32_t> row_offsets(MDim); + // im2col to compute row offset later + vector<uint8_t> A_im2col; + if (!b_symmetric) { + A_im2col.resize(MDim * KDim); + im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data()); + } + + conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data()); + + for (int g = 0; g < conv_p.G; ++g) { + // Compute row offset + if (!b_symmetric) { + row_offsets_u8acc32_ref( + MDim, + KDimPerGroup, + KDim, + A_im2col.data() + g * KDimPerGroup, + row_offsets.data()); + } + + // Requantization + requantize_u8acc32_ref( + MDim, + 1, + conv_p.G, + C_ref.data() + g, + C_uint8_ref.data() + g, + C_multiplier.data(), + C_zero_point, + A_zero_point, + &B_zero_point, + row_offsets.data(), + col_offsets.data() + g, + bias.data() + g, + K); + } Packed3x3ConvMatrix Bp(K, B.data()); @@ -175,7 +206,7 @@ TEST_P(FBGemmDepthWiseTest, Test3x3) { A.data(), B_zero_point, Bp, - C_multiplier, + C_multiplier[0], C_zero_point, C_uint8.data(), a_symmetric ? 
nullptr : col_offsets.data(), @@ -222,65 +253,81 @@ TEST_P(FBGemmDepthWiseTest, Test3x3x3) { constexpr int K_T = 3, K_H = 3, K_W = 3; constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1; - int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; + + conv_param_t<3> conv_p( + N, + K, + K, + {T, H, W}, + K, + {K_T, K_H, K_W}, + {stride_t, stride_h, stride_w}, + {PAD_P, PAD_T, PAD_L, PAD_N, PAD_B, PAD_R}); + int T_OUT = conv_p.OUT_DIM[0]; + int H_OUT = conv_p.OUT_DIM[1]; + int W_OUT = conv_p.OUT_DIM[2]; + + int MDim = N * T_OUT * H_OUT * W_OUT; + int KDim = K_T * K_H * K_W * K; + int KDimPerGroup = KDim / conv_p.G; aligned_vector<uint8_t> A(N * T * H * W * K); - aligned_vector<int8_t> B(K * K_T * K_H * K_W); - aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K), - C(C_ref.size()); + aligned_vector<int8_t> B(KDim); + aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size()); + aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = a_symmetric ? 0 : 43; randFill<int8_t>(B, -16, 16); - int32_t B_zero_point = 5; - - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A.data(), - B.data(), - C_ref.data()); - - int32_t minimum = *min_element(C_ref.begin(), C_ref.end()); - int32_t maximum = *max_element(C_ref.begin(), C_ref.end()); + int32_t B_zero_point = b_symmetric ? 0 : 5; - float C_multiplier = 255. 
/ (maximum - minimum); + aligned_vector<float> C_multiplier(1); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); + int32_t C_zero_point = 5; aligned_vector<int32_t> col_offsets(K); aligned_vector<int32_t> bias(K); randFill(col_offsets, -100, 100); randFill(bias, -40, 40); - int32_t C_zero_point = 5; - aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A.data(), - B_zero_point, - B.data(), - C_multiplier, - C_zero_point, - C_uint8_ref.data(), - col_offsets.data(), - bias.data()); + vector<int32_t> row_offsets(MDim); + // im2col to compute row offset later + vector<uint8_t> A_im2col; + if (!b_symmetric) { + A_im2col.resize(MDim * KDim); + im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data()); + } + + conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data()); + + for (int g = 0; g < conv_p.G; ++g) { + // Compute row offset + if (!b_symmetric) { + row_offsets_u8acc32_ref( + MDim, + KDimPerGroup, + KDim, + A_im2col.data() + g * KDimPerGroup, + row_offsets.data()); + } + + // Requantization + requantize_u8acc32_ref( + MDim, + 1, + conv_p.G, + C_ref.data() + g, + C_uint8_ref.data() + g, + C_multiplier.data(), + C_zero_point, + A_zero_point, + &B_zero_point, + row_offsets.data(), + col_offsets.data() + g, + bias.data() + g, + K); + } Packed3x3x3ConvMatrix Bp(K, B.data()); @@ -297,10 +344,10 @@ TEST_P(FBGemmDepthWiseTest, Test3x3x3) { A.data(), B_zero_point, Bp, - C_multiplier, + C_multiplier[0], C_zero_point, C_uint8.data(), - col_offsets.data(), + a_symmetric ? 
nullptr : col_offsets.data(), bias.data(), false, /* fuse_relu */ 0, @@ -336,14 +383,29 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) { int stride_h = shape[4]; int stride_w = stride_h; constexpr int R = 3, S = 3; - constexpr int PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int H_OUT = (H + PAD_T + PAD_B - R) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; + int PAD_T = (R - 1) / 2, PAD_B = (R - 1) / 2, PAD_L = (S - 1) / 2, + PAD_R = (S - 1) / 2; + + conv_param_t<2> conv_p( + N, + K, + K, + {H, W}, + K, + {R, S}, + {stride_h, stride_w}, + {PAD_T, PAD_L, PAD_B, PAD_R}); + int H_OUT = conv_p.OUT_DIM[0]; + int W_OUT = conv_p.OUT_DIM[1]; + + int MDim = N * H_OUT * W_OUT; + int KDim = R * S * K; + int KDimPerGroup = KDim / conv_p.G; aligned_vector<uint8_t> A(N * H * W * K); - aligned_vector<int8_t> B(K * R * S); - int32_t C_num_rows = N * H_OUT * W_OUT; - aligned_vector<int32_t> C_ref(C_num_rows * K), C(C_ref.size()); + aligned_vector<int8_t> B(KDim); + aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size()); + aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; @@ -359,28 +421,8 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) { B_zero_point[k] = 5 + k; } - depthwise_3x3_pad_1_ref( - N, - H, - W, - K, - stride_h, - stride_w, - A_zero_point, - A.data(), - B.data(), - C_ref.data()); - - aligned_vector<int32_t> C_ref_transpose(C_ref); - transpose_matrix(C_ref.data(), C_num_rows, K); - vector<float> C_multiplier(K); - for (auto k = 0; k < K; ++k) { - auto C_ref_k_begin = C_ref_transpose.begin() + k * C_num_rows; - auto C_ref_k_end = C_ref_k_begin + C_num_rows; - int32_t minimum = *min_element(C_ref_k_begin, C_ref_k_end); - int32_t maximum = *max_element(C_ref_k_begin, C_ref_k_end); - C_multiplier[k] = 255. 
/ (maximum - minimum); - } + aligned_vector<float> C_multiplier(K); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); int32_t C_zero_point = 5; aligned_vector<int32_t> col_offsets(K); @@ -388,23 +430,38 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) { randFill(col_offsets, -100, 100); randFill(bias, -40, 40); - aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); - depthwise_3x3_per_channel_quantization_pad_1_ref( - N, - H, - W, - K, - stride_h, - stride_w, - A_zero_point, - A.data(), - B_zero_point.data(), - B.data(), - C_multiplier.data(), - C_zero_point, - C_uint8_ref.data(), - col_offsets.data(), - bias.data()); + // im2col to compute row offset later + vector<int32_t> row_offsets(MDim); + vector<uint8_t> A_im2col(MDim * KDim); + im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data()); + + conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data()); + + for (int g = 0; g < conv_p.G; ++g) { + // Compute row offset + row_offsets_u8acc32_ref( + MDim, + KDimPerGroup, + KDim, + A_im2col.data() + g * KDimPerGroup, + row_offsets.data()); + + // Requantization + requantize_u8acc32_ref( + MDim, + 1, + conv_p.G, + C_ref.data() + g, + C_uint8_ref.data() + g, + C_multiplier.data() + g, + C_zero_point, + A_zero_point, + B_zero_point.data() + g, + row_offsets.data(), + col_offsets.data() + g, + bias.data() + g, + K); + } Packed3x3ConvMatrix Bp(K, B.data()); @@ -459,14 +516,28 @@ TEST(FBGemmDepthWiseTest, Test3x3x3PerChannelQuantization) { constexpr int K_T = 3, K_H = 3, K_W = 3; constexpr int PAD_P = 1, PAD_N = 1, PAD_T = 1, PAD_B = 1, PAD_L = 1, PAD_R = 1; - int T_OUT = (T + PAD_P + PAD_N - K_T) / stride_t + 1; - int H_OUT = (H + PAD_T + PAD_B - K_H) / stride_h + 1; - int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; + + conv_param_t<3> conv_p( + N, + K, + K, + {T, H, W}, + K, + {K_T, K_H, K_W}, + {stride_t, stride_h, stride_w}, + {PAD_P, PAD_T, PAD_L, PAD_N, PAD_B, PAD_R}); + int T_OUT = conv_p.OUT_DIM[0]; + int 
H_OUT = conv_p.OUT_DIM[1]; + int W_OUT = conv_p.OUT_DIM[2]; + + int MDim = N * T_OUT * H_OUT * W_OUT; + int KDim = K_T * K_H * K_W * K; + int KDimPerGroup = KDim / conv_p.G; aligned_vector<uint8_t> A(N * T * H * W * K); - aligned_vector<int8_t> B(K * K_T * K_H * K_W); - int32_t C_num_rows = N * T_OUT * H_OUT * W_OUT; - aligned_vector<int32_t> C_ref(C_num_rows * K), C(C_ref.size()); + aligned_vector<int8_t> B(KDim); + aligned_vector<int32_t> C_ref(MDim * K), C(C_ref.size()); + aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; @@ -482,30 +553,8 @@ TEST(FBGemmDepthWiseTest, Test3x3x3PerChannelQuantization) { B_zero_point[k] = 5 + k; } - depthwise_3x3x3_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A.data(), - B.data(), - C_ref.data()); - - aligned_vector<int32_t> C_ref_transpose(C_ref); - transpose_matrix(C_ref.data(), C_num_rows, K); - vector<float> C_multiplier(K); - for (auto k = 0; k < K; ++k) { - auto C_ref_k_begin = C_ref_transpose.begin() + k * C_num_rows; - auto C_ref_k_end = C_ref_k_begin + C_num_rows; - int32_t minimum = *min_element(C_ref_k_begin, C_ref_k_end); - int32_t maximum = *max_element(C_ref_k_begin, C_ref_k_end); - C_multiplier[k] = 255. 
/ (maximum - minimum); - } + aligned_vector<float> C_multiplier(K); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); int32_t C_zero_point = 5; aligned_vector<int32_t> col_offsets(K); @@ -513,25 +562,38 @@ TEST(FBGemmDepthWiseTest, Test3x3x3PerChannelQuantization) { randFill(col_offsets, -100, 100); randFill(bias, -40, 40); - aligned_vector<uint8_t> C_uint8_ref(C_ref.size()), C_uint8(C_ref.size()); - depthwise_3x3x3_per_channel_quantization_pad_1_ref( - N, - T, - H, - W, - K, - stride_t, - stride_h, - stride_w, - A_zero_point, - A.data(), - B_zero_point.data(), - B.data(), - C_multiplier.data(), - C_zero_point, - C_uint8_ref.data(), - col_offsets.data(), - bias.data()); + vector<int32_t> row_offsets(MDim); + // im2col to compute row offset later + vector<uint8_t> A_im2col(MDim * KDim); + im2col_ref(conv_p, A.data(), A_zero_point, A_im2col.data()); + + conv_ref(conv_p, A.data(), A_zero_point, B.data(), C_ref.data()); + + for (int g = 0; g < conv_p.G; ++g) { + // Compute row offset + row_offsets_u8acc32_ref( + MDim, + KDimPerGroup, + KDim, + A_im2col.data() + g * KDimPerGroup, + row_offsets.data()); + + // Requantization + requantize_u8acc32_ref( + MDim, + 1, + conv_p.G, + C_ref.data() + g, + C_uint8_ref.data() + g, + C_multiplier.data() + g, + C_zero_point, + A_zero_point, + B_zero_point.data() + g, + row_offsets.data(), + col_offsets.data() + g, + bias.data() + g, + K); + } Packed3x3x3ConvMatrix Bp(K, B.data()); diff --git a/test/I8DepthwiseTest.h b/test/I8DepthwiseTest.h deleted file mode 100644 index cc8da9d..0000000 --- a/test/I8DepthwiseTest.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ -#pragma once - -#include <vector> - -namespace fbgemm { - -// From ResNeXt-3D-101 -// clang-format off -static std::vector<std::vector<int>> shapes_3d = { - // NOTE: clang-format wants to use a different formatting but the current - // formatting should be easier to read. - // N, K, T_in, H_in, W_in, stride - { 1, 64, 32, 56, 56, 1, }, - { 1, 128, 16, 28, 28, 1, }, - { 1, 256, 8, 14, 14, 1, }, - { 1, 512, 4, 7, 7, 1, }, - - { 1, 128, 32, 56, 56, 2, }, - { 1, 256, 16, 28, 28, 2, }, - { 1, 512, 8, 14, 14, 2, }, - - { 5, 64, 32, 56, 56, 1, }, - { 5, 128, 16, 28, 28, 1, }, - { 5, 256, 8, 14, 14, 1, }, - { 5, 512, 4, 7, 7, 1, }, - - { 5, 128, 32, 56, 56, 2, }, - { 5, 256, 16, 28, 28, 2, }, - { 5, 512, 8, 14, 14, 2, }, - - { 1, 8, 4, 4, 4, 1, }, -}; -// clang-format on - -} // namespace fbgemm |