diff options
author | Jianyu Huang <jianyuhuang@fb.com> | 2018-12-04 20:16:31 +0300 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2018-12-04 20:21:13 +0300 |
commit | 04b2e99cade50f8978a39792404d57b7c654dc79 (patch) | |
tree | c63c7ecea7b7c740865171ee928f2d01b400c53b /bench | |
parent | 4fcc1f1f4ac3ea1fa0d7eb92bcf6a6735ee5e818 (diff) |
Fix the group issue in the benchmark and use ResNext101 conv shapes (#32)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/32
- Fix the group convolution issues in the benchmark
- Add the convolution shapes from ResNext101
Result:
(The following results were measured and collected on my devserver.)
ResNext101: int16 Accumulation: batch_size:1; 1x1 convolutions removed:
P60395456
ResNext101: int32 Accumulation: batch_size:1; 1x1 convolutions removed:
P60395457
ResNext101: int16 Accumulation: batch_size:1
P60394563
ResNext101: int32 Accumulation: batch_size:1
P60394565
ResNext101: int16 Accumulation: batch_size:50
P60394548
ResNext101: int32 Accumulation: batch_size:50
P60394552
Xray OCR: int16 Accumulation:
P60394527
Xray OCR: int32 Accumulation:
P60394534
Reviewed By: jspark1105
Differential Revision: D13286215
fbshipit-source-id: e78b691999006c25e92a746783b8bd1b87703a38
Diffstat (limited to 'bench')
-rw-r--r-- | bench/Im2ColFusedRequantizeAcc16Benchmark.cc | 362 | ||||
-rw-r--r-- | bench/Im2ColFusedRequantizeBenchmark.cc (renamed from bench/Im2ColFusedRequantizeAcc32Benchmark.cc) | 107 |
2 files changed, 51 insertions, 418 deletions
diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc deleted file mode 100644 index cb2edf5..0000000 --- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ -#include <algorithm> -#include <chrono> -#include <cmath> -#include <iomanip> -#include <iostream> -#include <random> -#include <vector> - -#ifdef _OPENMP -#include <omp.h> -#endif - -#include "BenchUtils.h" -#include "fbgemm/Fbgemm.h" -#include "src/RefImplementations.h" - -using namespace std; -using namespace fbgemm; - -void performance_test() { - vector<conv_param_t<>> shapes = { - // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, pad_h, pad_w - conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}), - conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}), - conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, 
{1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 8, 8, 
{4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - }; - - bool flush = true; - std::vector<char> llc; - - if (flush) { - llc.resize(128 * 1024 * 1024, 1.0); - } - - constexpr int NWARMUP = 4; - constexpr int NITER = 10; - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - cout << "WARNING: the timer may be inaccurate when used by multiple threads." - << endl; - cout << "MB, " - << "IC, " - << "OC, " - << "IH, " - << "IW, " - << "KH, " - << "KW, " - << "stride_h, " - << "stride_w, " - << "pad_h, " - << "pad_w, " - << "Type, " - << "M, " - << "N, " - << "K, " - << "Im2Col (ms), " - << "Packing (ms), " - << "Kernel (ms), " - << "Postprocessing (ms), " - << "fbgemmPacked (ms), " - << "Total (ms), " - << "GOPS" << endl; -#else - cout << setw(8) << "MB, " - << "IC, " - << "OC, " - << "IH, " - << "IW, " - << "KH, " - << "KW, " - << "stride_h, " - << "stride_w, " - << "pad_h, " - << "pad_w, " - << "Type, " - << "M, " - << "N, " - << "K, " << setw(5) << "GOPS" << endl; -#endif - - chrono::time_point<chrono::high_resolution_clock> begin, end; - for (auto conv_p : shapes) { - aligned_vector<uint8_t> Aint8( - conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC); - aligned_vector<uint8_t> Aint8_out( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] * - conv_p.K[1] * conv_p.IC); - - aligned_vector<int8_t> Bint8( - conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC); - - aligned_vector<int32_t> Cint32_ref( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC); - aligned_vector<int32_t> Cint32_fb(Cint32_ref.size()); - aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size()); - - // A matrix (input activations) - randFill<uint8_t>(Aint8, 0, 5); - int32_t Aint8_zero_point = 4; - aligned_vector<float> Afp32(Aint8.begin(), Aint8.end()); - - // B matrix (weights) - randFill<int8_t>(Bint8, -4, 4); - // int32_t Bint8_zero_point = -3; - aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end()); - - // reference implementation - conv_ref( - conv_p, - Aint8.data(), - 
Aint8_zero_point, - Bint8.data(), - Cint32_ref.data()); - - // matrix dimensions after im2col - int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1]; - int NDim = conv_p.OC; - int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC; - - // printMatrix(matrix_op_t::NoTranspose, Bint8.data(), KDim, NDim, NDim, - // "B unpacked"); - // packedB.printPackedMatrix("B Packed"); - - double nops = 2.0 * static_cast<double>(NITER) * MDim * NDim * KDim; - double ttot = 0.0; - string runType; - - vector<int32_t> row_offset_buf( - PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize()); - - PackAWithIm2Col<uint8_t, int16_t> packA( - conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); - - PackBMatrix<int8_t, int16_t> packedB( - matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); - - // no-op output process objects - DoNothing<int32_t, int32_t> doNothing32BitObj; - memCopy<> memcopyObj(doNothing32BitObj); - - runType = "FusedIm2Col"; - ttot = 0; -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - double im2col_time = 0.0; - double total_im2col_time = 0.0; - double total_packing_time = 0.0; - double total_computing_time = 0.0; - double total_kernel_time = 0.0; - double total_postprocessing_time = 0.0; - double total_run_time = 0.0; -#endif - for (auto i = 0; i < NWARMUP + NITER; ++i) { -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - packing_time = 0.0; - computing_time = 0.0; - kernel_time = 0.0; - postprocessing_time = 0.0; - run_time = 0.0; -#endif - llc_flush(llc); - begin = chrono::high_resolution_clock::now(); - fbgemmPacked( - packA, - packedB, - Cint32_fb.data(), - Cint32_fb.data(), - NDim, - memcopyObj, - 0, - 1); - end = chrono::high_resolution_clock::now(); - - if (i >= NWARMUP) { - auto dur = chrono::duration_cast<chrono::nanoseconds>(end - begin); - ttot += dur.count(); -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - total_packing_time += packing_time; - total_computing_time += computing_time; - total_kernel_time += kernel_time; - total_postprocessing_time += 
postprocessing_time; - total_run_time += run_time; -#endif - } - } - - cout << setw(4) << conv_p.MB << ", " << conv_p.IC << ", " << conv_p.OC - << ", " << conv_p.IN_DIM[0] << ", " << conv_p.IN_DIM[1] << ", " - << conv_p.G << ", " << conv_p.K[0] << ", " << conv_p.K[1] << ", " - << conv_p.stride[0] << ", " << conv_p.stride[1] << ", " - << conv_p.pad[0] << ", " << conv_p.pad[1] << ", "; - - cout << setw(13) << runType << ", " << setw(5) << fixed << setw(5) - << setw(6) << MDim << ", " << setw(6) << NDim << ", " << setw(6) - << KDim << ", "; -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - cout << fixed << setprecision(6) << setw(8) << 0 << ", " - << total_packing_time / (double)NITER / 1e6 << ", " - << total_kernel_time / (double)NITER / 1e6 << ", " - << total_postprocessing_time / (double)NITER / 1e6 << ", " - << total_run_time / (double)NITER / 1e6 << ", " - << ttot / (double)NITER / 1e6 << ", "; -#endif - cout << setprecision(2) << nops / ttot << endl; - - compare_buffers(Cint32_ref.data(), Cint32_fb.data(), MDim, NDim, NDim, 5); - - runType = "UnfusedIm2Col"; - row_offset_buf.resize( - PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); - ttot = 0; -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - total_im2col_time = 0.0; - total_packing_time = 0.0; - total_computing_time = 0.0; - total_kernel_time = 0.0; - total_postprocessing_time = 0.0; - total_run_time = 0.0; -#endif - for (auto i = 0; i < NWARMUP + NITER; ++i) { -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - im2col_time = 0.0; - packing_time = 0.0; - computing_time = 0.0; - kernel_time = 0.0; - postprocessing_time = 0.0; - run_time = 0.0; -#endif - llc_flush(llc); - begin = chrono::high_resolution_clock::now(); - - im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_out.data()); - -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - end = chrono::high_resolution_clock::now(); - im2col_time = - chrono::duration_cast<chrono::nanoseconds>(end - begin).count(); -#endif - - // printMatrix(matrix_op_t::NoTranspose, Aint8_out.data(), MDim, 
KDim, - // KDim, "A_out after im2col unpacked"); - - PackAWithRowOffset<uint8_t, int16_t> packAN( - matrix_op_t::NoTranspose, - MDim, - KDim, - Aint8_out.data(), - KDim, - nullptr, - 1, - row_offset_buf.data()); - - fbgemmPacked( - packAN, - packedB, - Cint32_fb2.data(), - Cint32_fb2.data(), - NDim, - memcopyObj, - 0, - 1); - end = chrono::high_resolution_clock::now(); - - if (i >= NWARMUP) { - auto dur = chrono::duration_cast<chrono::nanoseconds>(end - begin); - ttot += dur.count(); -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - total_im2col_time += im2col_time; - total_packing_time += packing_time; - total_computing_time += computing_time; - total_kernel_time += kernel_time; - total_postprocessing_time += postprocessing_time; - total_run_time += run_time; -#endif - } - } - - ((volatile char*)(llc.data())); - - // packedB.printPackedMatrix("bench B Packed"); - // printMatrix(matrix_op_t::NoTranspose, Cint32_fb.data(), MDim, NDim, NDim, - // "C fb fp32"); - // printMatrix(matrix_op_t::NoTranspose, Cint32_fb2.data(), - // MDim, NDim, NDim, "C fb2 fp32"); - // printMatrix(matrix_op_t::NoTranspose, - // Cint32_ref.data(), MDim, NDim, NDim, "C ref fp32"); - - cout << setw(4) << conv_p.MB << ", " << conv_p.IC << ", " << conv_p.OC - << ", " << conv_p.IN_DIM[0] << ", " << conv_p.IN_DIM[1] << ", " - << conv_p.G << ", " << conv_p.K[0] << ", " << conv_p.K[1] << ", " - << conv_p.stride[0] << ", " << conv_p.stride[1] << ", " - << conv_p.pad[0] << ", " << conv_p.pad[1] << ", "; - - cout << setw(13) << runType << ", " << setw(5) << fixed << setw(5) - << setw(6) << MDim << ", " << setw(6) << NDim << ", " << setw(6) - << KDim << ", "; -#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN - cout << fixed << setprecision(6) << setw(8) - << total_im2col_time / (double)NITER / 1e6 << ", " - << total_packing_time / (double)NITER / 1e6 << ", " - << total_kernel_time / (double)NITER / 1e6 << ", " - << total_postprocessing_time / (double)NITER / 1e6 << ", " - << total_run_time / (double)NITER / 1e6 << ", " - << 
ttot / (double)NITER / 1e6 << ", "; -#endif - cout << setprecision(2) << nops / ttot << endl; - - compare_buffers(Cint32_ref.data(), Cint32_fb2.data(), MDim, NDim, NDim, 5); - } // shapes -} - -int main() { -#ifdef _OPENMP - omp_set_num_threads(1); -#endif - performance_test(); - return 0; -} diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeBenchmark.cc index 8e112d8..d564ad4 100644 --- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc +++ b/bench/Im2ColFusedRequantizeBenchmark.cc @@ -23,51 +23,37 @@ using namespace std; using namespace fbgemm; +template <typename Acc_t> void performance_test() { vector<conv_param_t<>> shapes = { - // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, pad_h, pad_w - conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}), - conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}), - conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - 
conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), - conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, + // pad_h_top, pad_w_left, pad_h_bottom, pad_w_right + // ResNext 101 + // 
Batch size = 1 + conv_param_t<>(1, 3, 64, {224, 224}, 1, {7, 7}, {2, 2}, {3, 3, 3, 3}), + conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 256, 256, {56, 56}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 256, 256, {28, 28}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 512, 512, {28, 28}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 512, 512, {14, 14}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 512, 512, {14, 14}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 1024, 1024, {14, 14}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 1024, 1024, {7, 7}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 1024, 1024, {7, 7}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // Batch size = 50 + conv_param_t<>(50, 3, 64, {224, 224}, 1, {7, 7}, {2, 2}, {3, 3, 3, 3}), + conv_param_t<>(50, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(50, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(50, 256, 256, {56, 56}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(50, 256, 256, {28, 28}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(50, 512, 512, {28, 28}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(50, 512, 512, {14, 14}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(50, 512, 512, {14, 14}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>( + 50, 1024, 1024, {14, 14}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(50, 1024, 1024, {7, 7}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(50, 1024, 1024, {7, 7}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}), }; bool flush = true; @@ -88,6 +74,7 @@ void performance_test() { << "OC, " << "IH, " << "IW, " + << "G, " << "KH, " << "KW, " << "stride_h, " @@ -111,6 +98,7 @@ void performance_test() { << "OC, " << "IH, " << "IW, " + << "G, " << "KH, " << "KW, " << "stride_h, " @@ -125,6 +113,9 @@ void 
performance_test() { chrono::time_point<chrono::high_resolution_clock> begin, end; for (auto conv_p : shapes) { + if (conv_p.IC % conv_p.G != 0 || conv_p.OC % conv_p.G != 0) { + continue; + } aligned_vector<uint8_t> Aint8( conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC); @@ -140,12 +131,9 @@ void performance_test() { aligned_vector<int32_t> Cint32_fb(Cint32_ref.size()); aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size()); - // cout << conv_p.toString() << endl; - // A matrix (input activations) randFill<uint8_t>(Aint8, 0, 5); int32_t Aint8_zero_point = 4; - aligned_vector<float> Apf32(Aint8.begin(), Aint8.end()); // B matrix (weights) randFill<int8_t>(Bint8, -4, 4); @@ -162,7 +150,7 @@ void performance_test() { // matrix dimensions after im2col int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1]; - int NDim = conv_p.OC; + int NDim = conv_p.OC / conv_p.G; int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC; // printMatrix(matrix_op_t::NoTranspose, Bint8.data(), KDim, NDim, NDim, @@ -174,13 +162,19 @@ void performance_test() { string runType; vector<int32_t> row_offset_buf( - PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize()); + PackAWithIm2Col<uint8_t, Acc_t>::rowOffsetBufferSize()); - PackAWithIm2Col<uint8_t, int32_t> packA( + PackAWithIm2Col<uint8_t, Acc_t> packA( conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); - PackBMatrix<int8_t, int32_t> packedB( - matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); + PackBMatrix<int8_t, Acc_t> packedB( + matrix_op_t::NoTranspose, + KDim, + NDim, + Bint8.data(), + NDim, + nullptr, + conv_p.G); // no-op output process objects DoNothing<int32_t, int32_t> doNothing32BitObj; @@ -212,7 +206,7 @@ void performance_test() { packedB, Cint32_fb.data(), Cint32_fb.data(), - NDim, + conv_p.G * NDim, memcopyObj, 0, 1); @@ -285,14 +279,14 @@ void performance_test() { // printMatrix(matrix_op_t::NoTranspose, Aint8_out.data(), MDim, KDim, // KDim, "A_out after im2col unpacked"); - 
PackAWithRowOffset<uint8_t, int32_t> packAN( + PackAWithRowOffset<uint8_t, Acc_t> packAN( matrix_op_t::NoTranspose, MDim, KDim, Aint8_out.data(), KDim, nullptr, - 1, + conv_p.G, row_offset_buf.data()); fbgemmPacked( @@ -300,7 +294,7 @@ void performance_test() { packedB, Cint32_fb2.data(), Cint32_fb2.data(), - NDim, + conv_p.G * NDim, memcopyObj, 0, 1); @@ -358,6 +352,7 @@ int main() { #ifdef _OPENMP omp_set_num_threads(1); #endif - performance_test(); + performance_test<int16_t>(); + performance_test<int32_t>(); return 0; } |