Fix the group issue in the benchmark and use ResNext101 conv shapes (#32)

Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/32. Changes: (1) Fix the group convolution issues in the benchmark; (2) Add the convolution shapes in ResNext101. Result (the following results are tested and collected on my devserver): ResNext101, int16 Accumulation, batch_size:1, remove 1x1 convolutions: P60395456; ResNext101, int32 Accumulation, batch_size:1, remove 1x1 convolutions: P60395457; ResNext101, int16 Accumulation, batch_size:1: P60394563; ResNext101, int32 Accumulation, batch_size:1: P60394565; ResNext101, int16 Accumulation, batch_size:50: P60394548; ResNext101, int32 Accumulation, batch_size:50: P60394552; Xray OCR, int16 Accumulation: P60394527; Xray OCR, int32 Accumulation: P60394534. Reviewed By: jspark1105. Differential Revision: D13286215. fbshipit-source-id: e78b691999006c25e92a746783b8bd1b87703a38
author: Jianyu Huang <jianyuhuang@fb.com> 2018-12-04 20:16:31 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com> 2018-12-04 20:21:13 +0300
commit: 04b2e99cade50f8978a39792404d57b7c654dc79 (patch)
tree: c63c7ecea7b7c740865171ee928f2d01b400c53b /bench
parent: 4fcc1f1f4ac3ea1fa0d7eb92bcf6a6735ee5e818 (diff)
2 files changed, 51 insertions, 418 deletions
diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
deleted file mode 100644
index cb2edf5..0000000
--- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <algorithm>
-#include <chrono>
-#include <cmath>
-#include <iomanip>
-#include <iostream>
-#include <random>
-#include <vector>
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#include "BenchUtils.h"
-#include "fbgemm/Fbgemm.h"
-#include "src/RefImplementations.h"
-
-using namespace std;
-using namespace fbgemm;
-
-void performance_test() {
-  vector<conv_param_t<>> shapes = {
-      // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, pad_h, pad_w
-      conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}),
-      conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}),
-      conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-  };
-
-  bool flush = true;
-  std::vector<char> llc;
-
-  if (flush) {
-    llc.resize(128 * 1024 * 1024, 1.0);
-  }
-
-  constexpr int NWARMUP = 4;
-  constexpr int NITER = 10;
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
-  cout << "WARNING: the timer may be inaccurate when used by multiple threads."
-       << endl;
-  cout << "MB, "
-       << "IC, "
-       << "OC, "
-       << "IH, "
-       << "IW, "
-       << "KH, "
-       << "KW, "
-       << "stride_h, "
-       << "stride_w, "
-       << "pad_h, "
-       << "pad_w, "
-       << "Type, "
-       << "M, "
-       << "N, "
-       << "K, "
-       << "Im2Col (ms), "
-       << "Packing (ms), "
-       << "Kernel (ms), "
-       << "Postprocessing (ms), "
-       << "fbgemmPacked (ms), "
-       << "Total (ms), "
-       << "GOPS" << endl;
-#else
-  cout << setw(8) << "MB, "
-       << "IC, "
-       << "OC, "
-       << "IH, "
-       << "IW, "
-       << "KH, "
-       << "KW, "
-       << "stride_h, "
-       << "stride_w, "
-       << "pad_h, "
-       << "pad_w, "
-       << "Type, "
-       << "M, "
-       << "N, "
-       << "K, " << setw(5) << "GOPS" << endl;
-#endif
-
-  chrono::time_point<chrono::high_resolution_clock> begin, end;
-  for (auto conv_p : shapes) {
-    aligned_vector<uint8_t> Aint8(
-        conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
-    aligned_vector<uint8_t> Aint8_out(
-        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] *
-        conv_p.K[1] * conv_p.IC);
-
-    aligned_vector<int8_t> Bint8(
-        conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
-
-    aligned_vector<int32_t> Cint32_ref(
-        conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
-    aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
-    aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());
-
-    // A matrix (input activations)
-    randFill<uint8_t>(Aint8, 0, 5);
-    int32_t Aint8_zero_point = 4;
-    aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
-
-    // B matrix (weights)
-    randFill<int8_t>(Bint8, -4, 4);
-    // int32_t Bint8_zero_point = -3;
-    aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
-
-    // reference implementation
-    conv_ref(
-        conv_p,
-        Aint8.data(),
-        Aint8_zero_point,
-        Bint8.data(),
-        Cint32_ref.data());
-
-    // matrix dimensions after im2col
-    int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1];
-    int NDim = conv_p.OC;
-    int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC;
-
-    // printMatrix(matrix_op_t::NoTranspose, Bint8.data(), KDim, NDim, NDim,
-    // "B unpacked");
-    // packedB.printPackedMatrix("B Packed");
-
-    double nops = 2.0 * static_cast<double>(NITER) * MDim * NDim * KDim;
-    double ttot = 0.0;
-    string runType;
-
-    vector<int32_t> row_offset_buf(
-        PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize());
-
-    PackAWithIm2Col<uint8_t, int16_t> packA(
-        conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data());
-
-    PackBMatrix<int8_t, int16_t> packedB(
-        matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
-
-    // no-op output process objects
-    DoNothing<int32_t, int32_t> doNothing32BitObj;
-    memCopy<> memcopyObj(doNothing32BitObj);
-
-    runType = "FusedIm2Col";
-    ttot = 0;
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
-    double im2col_time = 0.0;
-    double total_im2col_time = 0.0;
-    double total_packing_time = 0.0;
-    double total_computing_time = 0.0;
-    double total_kernel_time = 0.0;
-    double total_postprocessing_time = 0.0;
-    double total_run_time = 0.0;
-#endif
-    for (auto i = 0; i < NWARMUP + NITER; ++i) {
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
-      packing_time = 0.0;
-      computing_time = 0.0;
-      kernel_time = 0.0;
-      postprocessing_time = 0.0;
-      run_time = 0.0;
-#endif
-      llc_flush(llc);
-      begin = chrono::high_resolution_clock::now();
-      fbgemmPacked(
-          packA,
-          packedB,
-          Cint32_fb.data(),
-          Cint32_fb.data(),
-          NDim,
-          memcopyObj,
-          0,
-          1);
-      end = chrono::high_resolution_clock::now();
-
-      if (i >= NWARMUP) {
-        auto dur = chrono::duration_cast<chrono::nanoseconds>(end - begin);
-        ttot += dur.count();
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
-        total_packing_time += packing_time;
-        total_computing_time += computing_time;
-        total_kernel_time += kernel_time;
-        total_postprocessing_time += postprocessing_time;
-        total_run_time += run_time;
-#endif
-      }
-    }
-
-    cout << setw(4) << conv_p.MB << ", " << conv_p.IC << ", " << conv_p.OC
-         << ", " << conv_p.IN_DIM[0] << ", " << conv_p.IN_DIM[1] << ", "
-         << conv_p.G << ", " << conv_p.K[0] << ", " << conv_p.K[1] << ", "
-         << conv_p.stride[0] << ", " << conv_p.stride[1] << ", "
-         << conv_p.pad[0] << ", " << conv_p.pad[1] << ", ";
-
-    cout << setw(13) << runType << ", " << setw(5) << fixed << setw(5)
-         << setw(6) << MDim << ", " << setw(6) << NDim << ", " << setw(6)
-         << KDim << ", ";
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
-    cout << fixed << setprecision(6) << setw(8) << 0 << ", "
-         << total_packing_time / (double)NITER / 1e6 << ", "
-         << total_kernel_time / (double)NITER / 1e6 << ", "
-         << total_postprocessing_time / (double)NITER / 1e6 << ", "
-         << total_run_time / (double)NITER / 1e6 << ", "
-         << ttot / (double)NITER / 1e6 << ", ";
-#endif
-    cout << setprecision(2) << nops / ttot << endl;
-
-    compare_buffers(Cint32_ref.data(), Cint32_fb.data(), MDim, NDim, NDim, 5);
-
-    runType = "UnfusedIm2Col";
-    row_offset_buf.resize(
-        PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
-    ttot = 0;
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
-    total_im2col_time = 0.0;
-    total_packing_time = 0.0;
-    total_computing_time = 0.0;
-    total_kernel_time = 0.0;
-    total_postprocessing_time = 0.0;
-    total_run_time = 0.0;
-#endif
-    for (auto i = 0; i < NWARMUP + NITER; ++i) {
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
-      im2col_time = 0.0;
-      packing_time = 0.0;
-      computing_time = 0.0;
-      kernel_time = 0.0;
-      postprocessing_time = 0.0;
-      run_time = 0.0;
-#endif
-      llc_flush(llc);
-      begin = chrono::high_resolution_clock::now();
-
-      im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_out.data());
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
-      end = chrono::high_resolution_clock::now();
-      im2col_time =
-          chrono::duration_cast<chrono::nanoseconds>(end - begin).count();
-#endif
-
-      // printMatrix(matrix_op_t::NoTranspose, Aint8_out.data(), MDim, KDim,
-      // KDim, "A_out after im2col unpacked");
-
-      PackAWithRowOffset<uint8_t, int16_t> packAN(
-          matrix_op_t::NoTranspose,
-          MDim,
-          KDim,
-          Aint8_out.data(),
-          KDim,
-          nullptr,
-          1,
-          row_offset_buf.data());
-
-      fbgemmPacked(
-          packAN,
-          packedB,
-          Cint32_fb2.data(),
-          Cint32_fb2.data(),
-          NDim,
-          memcopyObj,
-          0,
-          1);
-      end = chrono::high_resolution_clock::now();
-
-      if (i >= NWARMUP) {
-        auto dur = chrono::duration_cast<chrono::nanoseconds>(end - begin);
-        ttot += dur.count();
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
-        total_im2col_time += im2col_time;
-        total_packing_time += packing_time;
-        total_computing_time += computing_time;
-        total_kernel_time += kernel_time;
-        total_postprocessing_time += postprocessing_time;
-        total_run_time += run_time;
-#endif
-      }
-    }
-
-    ((volatile char*)(llc.data()));
-
-    // packedB.printPackedMatrix("bench B Packed");
-    // printMatrix(matrix_op_t::NoTranspose, Cint32_fb.data(), MDim, NDim, NDim,
-    // "C fb fp32");
-    // printMatrix(matrix_op_t::NoTranspose, Cint32_fb2.data(),
-    // MDim, NDim, NDim, "C fb2 fp32");
-    // printMatrix(matrix_op_t::NoTranspose,
-    // Cint32_ref.data(), MDim, NDim, NDim, "C ref fp32");
-
-    cout << setw(4) << conv_p.MB << ", " << conv_p.IC << ", " << conv_p.OC
-         << ", " << conv_p.IN_DIM[0] << ", " << conv_p.IN_DIM[1] << ", "
-         << conv_p.G << ", " << conv_p.K[0] << ", " << conv_p.K[1] << ", "
-         << conv_p.stride[0] << ", " << conv_p.stride[1] << ", "
-         << conv_p.pad[0] << ", " << conv_p.pad[1] << ", ";
-
-    cout << setw(13) << runType << ", " << setw(5) << fixed << setw(5)
-         << setw(6) << MDim << ", " << setw(6) << NDim << ", " << setw(6)
-         << KDim << ", ";
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
-    cout << fixed << setprecision(6) << setw(8)
-         << total_im2col_time / (double)NITER / 1e6 << ", "
-         << total_packing_time / (double)NITER / 1e6 << ", "
-         << total_kernel_time / (double)NITER / 1e6 << ", "
-         << total_postprocessing_time / (double)NITER / 1e6 << ", "
-         << total_run_time / (double)NITER / 1e6 << ", "
-         << ttot / (double)NITER / 1e6 << ", ";
-#endif
-    cout << setprecision(2) << nops / ttot << endl;
-
-    compare_buffers(Cint32_ref.data(), Cint32_fb2.data(), MDim, NDim, NDim, 5);
-  } // shapes
-}
-
-int main() {
-#ifdef _OPENMP
-  omp_set_num_threads(1);
-#endif
-  performance_test();
-  return 0;
-}
diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeBenchmark.cc
index 8e112d8..d564ad4 100644
--- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeBenchmark.cc
@@ -23,51 +23,37 @@
 using namespace std;
 using namespace fbgemm;
 
+template <typename Acc_t>
 void performance_test() {
   vector<conv_param_t<>> shapes = {
-      // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, pad_h, pad_w
-      conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}),
-      conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}),
-      conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
-      conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w,
+      // pad_h_top, pad_w_left, pad_h_bottom, pad_w_right
+      // ResNext 101
+      // Batch size = 1
+      conv_param_t<>(1, 3, 64, {224, 224}, 1, {7, 7}, {2, 2}, {3, 3, 3, 3}),
+      conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 256, 256, {56, 56}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 256, 256, {28, 28}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 512, 512, {28, 28}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 512, 512, {14, 14}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 512, 512, {14, 14}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 1024, 1024, {14, 14}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 1024, 1024, {7, 7}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(1, 1024, 1024, {7, 7}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      // Batch size = 50
+      conv_param_t<>(50, 3, 64, {224, 224}, 1, {7, 7}, {2, 2}, {3, 3, 3, 3}),
+      conv_param_t<>(50, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(50, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(50, 256, 256, {56, 56}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(50, 256, 256, {28, 28}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(50, 512, 512, {28, 28}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(50, 512, 512, {14, 14}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(50, 512, 512, {14, 14}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(
+          50, 1024, 1024, {14, 14}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+      conv_param_t<>(50, 1024, 1024, {7, 7}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+      conv_param_t<>(50, 1024, 1024, {7, 7}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
   };
 
   bool flush = true;
@@ -88,6 +74,7 @@ void performance_test() {
        << "OC, "
        << "IH, "
        << "IW, "
+       << "G, "
        << "KH, "
        << "KW, "
        << "stride_h, "
@@ -111,6 +98,7 @@ void performance_test() {
        << "OC, "
        << "IH, "
        << "IW, "
+       << "G, "
        << "KH, "
        << "KW, "
        << "stride_h, "
@@ -125,6 +113,9 @@ void performance_test() {
 
   chrono::time_point<chrono::high_resolution_clock> begin, end;
   for (auto conv_p : shapes) {
+    if (conv_p.IC % conv_p.G != 0 || conv_p.OC % conv_p.G != 0) {
+      continue;
+    }
     aligned_vector<uint8_t> Aint8(
         conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
 
@@ -140,12 +131,9 @@ void performance_test() {
     aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
     aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());
 
-    // cout << conv_p.toString() << endl;
-
     // A matrix (input activations)
     randFill<uint8_t>(Aint8, 0, 5);
     int32_t Aint8_zero_point = 4;
-    aligned_vector<float> Apf32(Aint8.begin(), Aint8.end());
 
     // B matrix (weights)
     randFill<int8_t>(Bint8, -4, 4);
@@ -162,7 +150,7 @@ void performance_test() {
 
     // matrix dimensions after im2col
     int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1];
-    int NDim = conv_p.OC;
+    int NDim = conv_p.OC / conv_p.G;
     int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC;
 
     // printMatrix(matrix_op_t::NoTranspose, Bint8.data(), KDim, NDim, NDim,
@@ -174,13 +162,19 @@ void performance_test() {
     string runType;
 
     vector<int32_t> row_offset_buf(
-        PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize());
+        PackAWithIm2Col<uint8_t, Acc_t>::rowOffsetBufferSize());
 
-    PackAWithIm2Col<uint8_t, int32_t> packA(
+    PackAWithIm2Col<uint8_t, Acc_t> packA(
         conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data());
 
-    PackBMatrix<int8_t, int32_t> packedB(
-        matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
+    PackBMatrix<int8_t, Acc_t> packedB(
+        matrix_op_t::NoTranspose,
+        KDim,
+        NDim,
+        Bint8.data(),
+        NDim,
+        nullptr,
+        conv_p.G);
 
     // no-op output process objects
     DoNothing<int32_t, int32_t> doNothing32BitObj;
@@ -212,7 +206,7 @@ void performance_test() {
           packedB,
           Cint32_fb.data(),
           Cint32_fb.data(),
-          NDim,
+          conv_p.G * NDim,
           memcopyObj,
           0,
           1);
@@ -285,14 +279,14 @@ void performance_test() {
       // printMatrix(matrix_op_t::NoTranspose, Aint8_out.data(), MDim, KDim,
       // KDim, "A_out after im2col unpacked");
 
-      PackAWithRowOffset<uint8_t, int32_t> packAN(
+      PackAWithRowOffset<uint8_t, Acc_t> packAN(
           matrix_op_t::NoTranspose,
           MDim,
           KDim,
           Aint8_out.data(),
           KDim,
           nullptr,
-          1,
+          conv_p.G,
           row_offset_buf.data());
 
       fbgemmPacked(
@@ -300,7 +294,7 @@ void performance_test() {
           packedB,
           Cint32_fb2.data(),
           Cint32_fb2.data(),
-          NDim,
+          conv_p.G * NDim,
           memcopyObj,
           0,
           1);
@@ -358,6 +352,7 @@ int main() {
 #ifdef _OPENMP
   omp_set_num_threads(1);
 #endif
-  performance_test();
+  performance_test<int16_t>();
+  performance_test<int32_t>();
   return 0;
 }
author	Jianyu Huang <jianyuhuang@fb.com>	2018-12-04 20:16:31 +0300
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>	2018-12-04 20:21:13 +0300
commit	04b2e99cade50f8978a39792404d57b7c654dc79 (patch)
tree	c63c7ecea7b7c740865171ee928f2d01b400c53b /bench
parent	4fcc1f1f4ac3ea1fa0d7eb92bcf6a6735ee5e818 (diff)