Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/FBGEMM.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/bench
diff options
context:
space:
mode:
author: Jianyu Huang <jianyuhuang@fb.com>, 2018-12-04 20:16:31 +0300
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com>, 2018-12-04 20:21:13 +0300
commit: 04b2e99cade50f8978a39792404d57b7c654dc79 (patch)
tree: c63c7ecea7b7c740865171ee928f2d01b400c53b /bench
parent: 4fcc1f1f4ac3ea1fa0d7eb92bcf6a6735ee5e818 (diff)
Fix the group issue in the benchmark and use ResNext101 conv shapes (#32)
Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/32

- Fix the group convolution issues in the benchmark
- Add the convolution shapes in ResNext101

Result:
(The following results were tested and collected on my devserver.)
- ResNext101: int16 Accumulation: batch_size:1; remove 1x1 convolutions: P60395456
- ResNext101: int32 Accumulation: batch_size:1; remove 1x1 convolutions: P60395457
- ResNext101: int16 Accumulation: batch_size:1: P60394563
- ResNext101: int32 Accumulation: batch_size:1: P60394565
- ResNext101: int16 Accumulation: batch_size:50: P60394548
- ResNext101: int32 Accumulation: batch_size:50: P60394552
- Xray OCR: int16 Accumulation: P60394527
- Xray OCR: int32 Accumulation: P60394534

Reviewed By: jspark1105

Differential Revision: D13286215

fbshipit-source-id: e78b691999006c25e92a746783b8bd1b87703a38
Diffstat (limited to 'bench')
-rw-r--r-- bench/Im2ColFusedRequantizeAcc16Benchmark.cc | 362
-rw-r--r-- bench/Im2ColFusedRequantizeBenchmark.cc (renamed from bench/Im2ColFusedRequantizeAcc32Benchmark.cc) | 107
2 files changed, 51 insertions, 418 deletions
diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
deleted file mode 100644
index cb2edf5..0000000
--- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- * All rights reserved.
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-#include <algorithm>
-#include <chrono>
-#include <cmath>
-#include <iomanip>
-#include <iostream>
-#include <random>
-#include <vector>
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#include "BenchUtils.h"
-#include "fbgemm/Fbgemm.h"
-#include "src/RefImplementations.h"
-
-using namespace std;
-using namespace fbgemm;
-
-void performance_test() {
- vector<conv_param_t<>> shapes = {
- // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, pad_h, pad_w
- conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}),
- conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}),
- conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- };
-
- bool flush = true;
- std::vector<char> llc;
-
- if (flush) {
- llc.resize(128 * 1024 * 1024, 1.0);
- }
-
- constexpr int NWARMUP = 4;
- constexpr int NITER = 10;
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- cout << "WARNING: the timer may be inaccurate when used by multiple threads."
- << endl;
- cout << "MB, "
- << "IC, "
- << "OC, "
- << "IH, "
- << "IW, "
- << "KH, "
- << "KW, "
- << "stride_h, "
- << "stride_w, "
- << "pad_h, "
- << "pad_w, "
- << "Type, "
- << "M, "
- << "N, "
- << "K, "
- << "Im2Col (ms), "
- << "Packing (ms), "
- << "Kernel (ms), "
- << "Postprocessing (ms), "
- << "fbgemmPacked (ms), "
- << "Total (ms), "
- << "GOPS" << endl;
-#else
- cout << setw(8) << "MB, "
- << "IC, "
- << "OC, "
- << "IH, "
- << "IW, "
- << "KH, "
- << "KW, "
- << "stride_h, "
- << "stride_w, "
- << "pad_h, "
- << "pad_w, "
- << "Type, "
- << "M, "
- << "N, "
- << "K, " << setw(5) << "GOPS" << endl;
-#endif
-
- chrono::time_point<chrono::high_resolution_clock> begin, end;
- for (auto conv_p : shapes) {
- aligned_vector<uint8_t> Aint8(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
- aligned_vector<uint8_t> Aint8_out(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] *
- conv_p.K[1] * conv_p.IC);
-
- aligned_vector<int8_t> Bint8(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
-
- aligned_vector<int32_t> Cint32_ref(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
- aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
- aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());
-
- // A matrix (input activations)
- randFill<uint8_t>(Aint8, 0, 5);
- int32_t Aint8_zero_point = 4;
- aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
-
- // B matrix (weights)
- randFill<int8_t>(Bint8, -4, 4);
- // int32_t Bint8_zero_point = -3;
- aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
-
- // reference implementation
- conv_ref(
- conv_p,
- Aint8.data(),
- Aint8_zero_point,
- Bint8.data(),
- Cint32_ref.data());
-
- // matrix dimensions after im2col
- int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1];
- int NDim = conv_p.OC;
- int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC;
-
- // printMatrix(matrix_op_t::NoTranspose, Bint8.data(), KDim, NDim, NDim,
- // "B unpacked");
- // packedB.printPackedMatrix("B Packed");
-
- double nops = 2.0 * static_cast<double>(NITER) * MDim * NDim * KDim;
- double ttot = 0.0;
- string runType;
-
- vector<int32_t> row_offset_buf(
- PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize());
-
- PackAWithIm2Col<uint8_t, int16_t> packA(
- conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data());
-
- PackBMatrix<int8_t, int16_t> packedB(
- matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
-
- // no-op output process objects
- DoNothing<int32_t, int32_t> doNothing32BitObj;
- memCopy<> memcopyObj(doNothing32BitObj);
-
- runType = "FusedIm2Col";
- ttot = 0;
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- double im2col_time = 0.0;
- double total_im2col_time = 0.0;
- double total_packing_time = 0.0;
- double total_computing_time = 0.0;
- double total_kernel_time = 0.0;
- double total_postprocessing_time = 0.0;
- double total_run_time = 0.0;
-#endif
- for (auto i = 0; i < NWARMUP + NITER; ++i) {
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- packing_time = 0.0;
- computing_time = 0.0;
- kernel_time = 0.0;
- postprocessing_time = 0.0;
- run_time = 0.0;
-#endif
- llc_flush(llc);
- begin = chrono::high_resolution_clock::now();
- fbgemmPacked(
- packA,
- packedB,
- Cint32_fb.data(),
- Cint32_fb.data(),
- NDim,
- memcopyObj,
- 0,
- 1);
- end = chrono::high_resolution_clock::now();
-
- if (i >= NWARMUP) {
- auto dur = chrono::duration_cast<chrono::nanoseconds>(end - begin);
- ttot += dur.count();
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- total_packing_time += packing_time;
- total_computing_time += computing_time;
- total_kernel_time += kernel_time;
- total_postprocessing_time += postprocessing_time;
- total_run_time += run_time;
-#endif
- }
- }
-
- cout << setw(4) << conv_p.MB << ", " << conv_p.IC << ", " << conv_p.OC
- << ", " << conv_p.IN_DIM[0] << ", " << conv_p.IN_DIM[1] << ", "
- << conv_p.G << ", " << conv_p.K[0] << ", " << conv_p.K[1] << ", "
- << conv_p.stride[0] << ", " << conv_p.stride[1] << ", "
- << conv_p.pad[0] << ", " << conv_p.pad[1] << ", ";
-
- cout << setw(13) << runType << ", " << setw(5) << fixed << setw(5)
- << setw(6) << MDim << ", " << setw(6) << NDim << ", " << setw(6)
- << KDim << ", ";
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- cout << fixed << setprecision(6) << setw(8) << 0 << ", "
- << total_packing_time / (double)NITER / 1e6 << ", "
- << total_kernel_time / (double)NITER / 1e6 << ", "
- << total_postprocessing_time / (double)NITER / 1e6 << ", "
- << total_run_time / (double)NITER / 1e6 << ", "
- << ttot / (double)NITER / 1e6 << ", ";
-#endif
- cout << setprecision(2) << nops / ttot << endl;
-
- compare_buffers(Cint32_ref.data(), Cint32_fb.data(), MDim, NDim, NDim, 5);
-
- runType = "UnfusedIm2Col";
- row_offset_buf.resize(
- PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
- ttot = 0;
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- total_im2col_time = 0.0;
- total_packing_time = 0.0;
- total_computing_time = 0.0;
- total_kernel_time = 0.0;
- total_postprocessing_time = 0.0;
- total_run_time = 0.0;
-#endif
- for (auto i = 0; i < NWARMUP + NITER; ++i) {
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- im2col_time = 0.0;
- packing_time = 0.0;
- computing_time = 0.0;
- kernel_time = 0.0;
- postprocessing_time = 0.0;
- run_time = 0.0;
-#endif
- llc_flush(llc);
- begin = chrono::high_resolution_clock::now();
-
- im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_out.data());
-
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- end = chrono::high_resolution_clock::now();
- im2col_time =
- chrono::duration_cast<chrono::nanoseconds>(end - begin).count();
-#endif
-
- // printMatrix(matrix_op_t::NoTranspose, Aint8_out.data(), MDim, KDim,
- // KDim, "A_out after im2col unpacked");
-
- PackAWithRowOffset<uint8_t, int16_t> packAN(
- matrix_op_t::NoTranspose,
- MDim,
- KDim,
- Aint8_out.data(),
- KDim,
- nullptr,
- 1,
- row_offset_buf.data());
-
- fbgemmPacked(
- packAN,
- packedB,
- Cint32_fb2.data(),
- Cint32_fb2.data(),
- NDim,
- memcopyObj,
- 0,
- 1);
- end = chrono::high_resolution_clock::now();
-
- if (i >= NWARMUP) {
- auto dur = chrono::duration_cast<chrono::nanoseconds>(end - begin);
- ttot += dur.count();
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- total_im2col_time += im2col_time;
- total_packing_time += packing_time;
- total_computing_time += computing_time;
- total_kernel_time += kernel_time;
- total_postprocessing_time += postprocessing_time;
- total_run_time += run_time;
-#endif
- }
- }
-
- ((volatile char*)(llc.data()));
-
- // packedB.printPackedMatrix("bench B Packed");
- // printMatrix(matrix_op_t::NoTranspose, Cint32_fb.data(), MDim, NDim, NDim,
- // "C fb fp32");
- // printMatrix(matrix_op_t::NoTranspose, Cint32_fb2.data(),
- // MDim, NDim, NDim, "C fb2 fp32");
- // printMatrix(matrix_op_t::NoTranspose,
- // Cint32_ref.data(), MDim, NDim, NDim, "C ref fp32");
-
- cout << setw(4) << conv_p.MB << ", " << conv_p.IC << ", " << conv_p.OC
- << ", " << conv_p.IN_DIM[0] << ", " << conv_p.IN_DIM[1] << ", "
- << conv_p.G << ", " << conv_p.K[0] << ", " << conv_p.K[1] << ", "
- << conv_p.stride[0] << ", " << conv_p.stride[1] << ", "
- << conv_p.pad[0] << ", " << conv_p.pad[1] << ", ";
-
- cout << setw(13) << runType << ", " << setw(5) << fixed << setw(5)
- << setw(6) << MDim << ", " << setw(6) << NDim << ", " << setw(6)
- << KDim << ", ";
-#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
- cout << fixed << setprecision(6) << setw(8)
- << total_im2col_time / (double)NITER / 1e6 << ", "
- << total_packing_time / (double)NITER / 1e6 << ", "
- << total_kernel_time / (double)NITER / 1e6 << ", "
- << total_postprocessing_time / (double)NITER / 1e6 << ", "
- << total_run_time / (double)NITER / 1e6 << ", "
- << ttot / (double)NITER / 1e6 << ", ";
-#endif
- cout << setprecision(2) << nops / ttot << endl;
-
- compare_buffers(Cint32_ref.data(), Cint32_fb2.data(), MDim, NDim, NDim, 5);
- } // shapes
-}
-
-int main() {
-#ifdef _OPENMP
- omp_set_num_threads(1);
-#endif
- performance_test();
- return 0;
-}
diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeBenchmark.cc
index 8e112d8..d564ad4 100644
--- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeBenchmark.cc
@@ -23,51 +23,37 @@
using namespace std;
using namespace fbgemm;
+template <typename Acc_t>
void performance_test() {
vector<conv_param_t<>> shapes = {
- // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, pad_h, pad_w
- conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}),
- conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}),
- conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}),
- conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w,
+ // pad_h_top, pad_w_left, pad_h_bottom, pad_w_right
+ // ResNext 101
+ // Batch size = 1
+ conv_param_t<>(1, 3, 64, {224, 224}, 1, {7, 7}, {2, 2}, {3, 3, 3, 3}),
+ conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 256, 256, {56, 56}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 256, 256, {28, 28}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 512, 512, {28, 28}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 512, 512, {14, 14}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 512, 512, {14, 14}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 1024, 1024, {14, 14}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 1024, 1024, {7, 7}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(1, 1024, 1024, {7, 7}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ // Batch size = 50
+ conv_param_t<>(50, 3, 64, {224, 224}, 1, {7, 7}, {2, 2}, {3, 3, 3, 3}),
+ conv_param_t<>(50, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(50, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(50, 256, 256, {56, 56}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+ conv_param_t<>(50, 256, 256, {28, 28}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(50, 512, 512, {28, 28}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+ conv_param_t<>(50, 512, 512, {14, 14}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(50, 512, 512, {14, 14}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(
+ 50, 1024, 1024, {14, 14}, 32, {3, 3}, {2, 2}, {1, 1, 1, 1}),
+ conv_param_t<>(50, 1024, 1024, {7, 7}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
+ conv_param_t<>(50, 1024, 1024, {7, 7}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
};
bool flush = true;
@@ -88,6 +74,7 @@ void performance_test() {
<< "OC, "
<< "IH, "
<< "IW, "
+ << "G, "
<< "KH, "
<< "KW, "
<< "stride_h, "
@@ -111,6 +98,7 @@ void performance_test() {
<< "OC, "
<< "IH, "
<< "IW, "
+ << "G, "
<< "KH, "
<< "KW, "
<< "stride_h, "
@@ -125,6 +113,9 @@ void performance_test() {
chrono::time_point<chrono::high_resolution_clock> begin, end;
for (auto conv_p : shapes) {
+ if (conv_p.IC % conv_p.G != 0 || conv_p.OC % conv_p.G != 0) {
+ continue;
+ }
aligned_vector<uint8_t> Aint8(
conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
@@ -140,12 +131,9 @@ void performance_test() {
aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());
- // cout << conv_p.toString() << endl;
-
// A matrix (input activations)
randFill<uint8_t>(Aint8, 0, 5);
int32_t Aint8_zero_point = 4;
- aligned_vector<float> Apf32(Aint8.begin(), Aint8.end());
// B matrix (weights)
randFill<int8_t>(Bint8, -4, 4);
@@ -162,7 +150,7 @@ void performance_test() {
// matrix dimensions after im2col
int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1];
- int NDim = conv_p.OC;
+ int NDim = conv_p.OC / conv_p.G;
int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC;
// printMatrix(matrix_op_t::NoTranspose, Bint8.data(), KDim, NDim, NDim,
@@ -174,13 +162,19 @@ void performance_test() {
string runType;
vector<int32_t> row_offset_buf(
- PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize());
+ PackAWithIm2Col<uint8_t, Acc_t>::rowOffsetBufferSize());
- PackAWithIm2Col<uint8_t, int32_t> packA(
+ PackAWithIm2Col<uint8_t, Acc_t> packA(
conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data());
- PackBMatrix<int8_t, int32_t> packedB(
- matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim);
+ PackBMatrix<int8_t, Acc_t> packedB(
+ matrix_op_t::NoTranspose,
+ KDim,
+ NDim,
+ Bint8.data(),
+ NDim,
+ nullptr,
+ conv_p.G);
// no-op output process objects
DoNothing<int32_t, int32_t> doNothing32BitObj;
@@ -212,7 +206,7 @@ void performance_test() {
packedB,
Cint32_fb.data(),
Cint32_fb.data(),
- NDim,
+ conv_p.G * NDim,
memcopyObj,
0,
1);
@@ -285,14 +279,14 @@ void performance_test() {
// printMatrix(matrix_op_t::NoTranspose, Aint8_out.data(), MDim, KDim,
// KDim, "A_out after im2col unpacked");
- PackAWithRowOffset<uint8_t, int32_t> packAN(
+ PackAWithRowOffset<uint8_t, Acc_t> packAN(
matrix_op_t::NoTranspose,
MDim,
KDim,
Aint8_out.data(),
KDim,
nullptr,
- 1,
+ conv_p.G,
row_offset_buf.data());
fbgemmPacked(
@@ -300,7 +294,7 @@ void performance_test() {
packedB,
Cint32_fb2.data(),
Cint32_fb2.data(),
- NDim,
+ conv_p.G * NDim,
memcopyObj,
0,
1);
@@ -358,6 +352,7 @@ int main() {
#ifdef _OPENMP
omp_set_num_threads(1);
#endif
- performance_test();
+ performance_test<int16_t>();
+ performance_test<int32_t>();
return 0;
}