From 428a0b6cede232eb5c4e9c3bbd8e9d74d8e34500 Mon Sep 17 00:00:00 2001 From: Jianyu Huang Date: Thu, 8 Nov 2018 11:09:04 -0800 Subject: Sync with internal copy: Asymmetric padding; fbgemm2 -> fbgemm --- bench/BenchUtils.cc | 4 +- bench/BenchUtils.h | 4 +- bench/Depthwise3DBenchmark.cc | 2 +- bench/DepthwiseBenchmark.cc | 2 +- bench/FP16Benchmark.cc | 2 +- bench/I8SpmdmBenchmark.cc | 4 +- bench/Im2ColFusedRequantizeAcc16Benchmark.cc | 86 ++--- bench/Im2ColFusedRequantizeAcc32Benchmark.cc | 86 ++--- bench/PackedFloatInOutBenchmark.cc | 2 +- bench/PackedRequantizeAcc16Benchmark.cc | 2 +- bench/PackedRequantizeAcc32Benchmark.cc | 2 +- include/fbgemm/ConvUtils.h | 20 +- include/fbgemm/Fbgemm.h | 4 +- include/fbgemm/FbgemmFP16.h | 2 +- include/fbgemm/FbgemmI8Spmdm.h | 4 +- include/fbgemm/Types.h | 2 +- include/fbgemm/Utils.h | 4 +- src/ExecuteKernel.cc | 2 +- src/ExecuteKernelGeneric.h | 4 +- src/ExecuteKernelU8S8.cc | 28 +- src/ExecuteKernelU8S8.h | 4 +- src/Fbgemm.cc | 46 ++- src/FbgemmFP16.cc | 2 +- src/FbgemmFP16UKernels.cc | 4 +- src/FbgemmFP16UKernels.h | 4 +- src/FbgemmI8Depthwise.cc | 5 +- src/FbgemmI8Depthwise.h | 4 +- src/FbgemmI8Spmdm.cc | 4 +- src/GenerateKernel.h | 4 +- src/GenerateKernelU8S8S32ACC16.cc | 4 +- src/GenerateKernelU8S8S32ACC16_avx512.cc | 4 +- src/GenerateKernelU8S8S32ACC32.cc | 4 +- src/GenerateKernelU8S8S32ACC32_avx512.cc | 4 +- src/PackAMatrix.cc | 4 +- src/PackAWithIm2Col.cc | 211 +++++++++---- src/PackBMatrix.cc | 18 +- src/PackMatrix.cc | 4 +- src/PackWithQuantRowOffset.cc | 4 +- src/PackWithRowOffset.cc | 4 +- src/RefImplementations.cc | 6 +- src/RefImplementations.h | 4 +- src/Utils.cc | 4 +- src/Utils_avx512.cc | 4 +- src/codegen_fp16fp32.cc | 2 +- test/FP16Test.cc | 4 +- test/I8DepthwiseTest.cc | 4 +- test/I8DepthwiseTest.h | 4 +- test/I8SpmdmTest.cc | 2 +- test/Im2ColFusedRequantizeTest.cc | 448 ++++++++++++++++++++------- test/PackedRequantizeAcc16Test.cc | 2 +- test/PackedRequantizeTest.cc | 2 +- test/QuantizationHelpers.cc | 4 +- test/QuantizationHelpers.h | 4 +- test/TestUtils.cc | 4 +- test/TestUtils.h | 4 +- 55 files changed, 734 insertions(+), 372 deletions(-) diff --git a/bench/BenchUtils.cc b/bench/BenchUtils.cc index 5dade2a..7b4cde4 100644 --- a/bench/BenchUtils.cc +++ b/bench/BenchUtils.cc @@ -7,7 +7,7 @@ #include "BenchUtils.h" #include -namespace fbgemm2 { +namespace fbgemm { std::default_random_engine eng; @@ -41,4 +41,4 @@ void llc_flush(std::vector& llc) { } } -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h index 5dd452e..5c16a06 100644 --- a/bench/BenchUtils.h +++ b/bench/BenchUtils.h @@ -8,11 +8,11 @@ #include #include "bench/AlignedVec.h" -namespace fbgemm2 { +namespace fbgemm { template void randFill(aligned_vector &vec, const int low, const int high); void llc_flush(std::vector& llc); -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc index b7c7d44..417ddd1 100644 --- a/bench/Depthwise3DBenchmark.cc +++ b/bench/Depthwise3DBenchmark.cc @@ -24,7 +24,7 @@ #include "BenchUtils.h" using namespace std; -using namespace fbgemm2; +using namespace fbgemm; int main() { // Depthwise is memory BW bound so we want to flush LLC. diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc index de08ff7..0bf2d73 100644 --- a/bench/DepthwiseBenchmark.cc +++ b/bench/DepthwiseBenchmark.cc @@ -22,7 +22,7 @@ #include "BenchUtils.h" using namespace std; -using namespace fbgemm2; +using namespace fbgemm; int main() { // From Xray OCR diff --git a/bench/FP16Benchmark.cc b/bench/FP16Benchmark.cc index f5ec10f..8fbe878 100644 --- a/bench/FP16Benchmark.cc +++ b/bench/FP16Benchmark.cc @@ -21,7 +21,7 @@ #include "AlignedVec.h" using namespace std; -using namespace fbgemm2; +using namespace fbgemm; void performance_test() { // cache flush diff --git a/bench/I8SpmdmBenchmark.cc b/bench/I8SpmdmBenchmark.cc index f97d152..d361bb5 100644 --- a/bench/I8SpmdmBenchmark.cc +++ b/bench/I8SpmdmBenchmark.cc @@ -22,7 +22,7 @@ #include "BenchUtils.h" using namespace std; -using namespace fbgemm2; +using namespace fbgemm; int main() { const vector> shapes = { @@ -79,7 +79,7 @@ int main() { aligned_vector A(M * K); randFill(A, 0, 255); - fbgemm2::CompressedSparseColumn B_csc(K, N); + fbgemm::CompressedSparseColumn B_csc(K, N); vector C(M * N); vector C_ref(C.size()); diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc index ca27278..48b744a 100644 --- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc +++ b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc @@ -21,53 +21,53 @@ #include "BenchUtils.h" using namespace std; -using namespace fbgemm2; +using namespace fbgemm; void performance_test() { vector> shapes = { // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, pad_h, pad_w - conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0}), - conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0}), - conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1}), + conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}), + conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}), + conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), }; bool flush = true; diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc index 8cce235..9201e52 100644 --- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc +++ b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc @@ -21,53 +21,53 @@ #include "BenchUtils.h" using namespace std; -using namespace fbgemm2; +using namespace fbgemm; void performance_test() { vector> shapes = { // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, pad_h, pad_w - conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0}), - conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0}), - conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1}), + conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}), + conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {2, 2}, {0, 0, 0, 0}), + conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), }; bool flush = true; diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc index 4a2eda4..29a7547 100644 --- a/bench/PackedFloatInOutBenchmark.cc +++ b/bench/PackedFloatInOutBenchmark.cc @@ -25,7 +25,7 @@ #include "BenchUtils.h" using namespace std; -using namespace fbgemm2; +using namespace fbgemm; void performance_test() { vector> shapes = { diff --git a/bench/PackedRequantizeAcc16Benchmark.cc b/bench/PackedRequantizeAcc16Benchmark.cc index a758f55..4974ba2 100644 --- a/bench/PackedRequantizeAcc16Benchmark.cc +++ b/bench/PackedRequantizeAcc16Benchmark.cc @@ -25,7 +25,7 @@ #include "BenchUtils.h" using namespace std; -using namespace fbgemm2; +using namespace fbgemm; enum class BenchmarkType { BARE_BONE, // no row-offset in input packing, and no output processing diff --git a/bench/PackedRequantizeAcc32Benchmark.cc b/bench/PackedRequantizeAcc32Benchmark.cc index 27f1433..94bb899 100644 --- a/bench/PackedRequantizeAcc32Benchmark.cc +++ b/bench/PackedRequantizeAcc32Benchmark.cc @@ -25,7 +25,7 @@ #include "BenchUtils.h" using namespace std; -using namespace fbgemm2; +using namespace fbgemm; void performance_test() { vector> shapes = { diff --git a/include/fbgemm/ConvUtils.h b/include/fbgemm/ConvUtils.h index 438807f..667998c 100644 --- a/include/fbgemm/ConvUtils.h +++ b/include/fbgemm/ConvUtils.h @@ -9,7 +9,7 @@ #include #include -namespace fbgemm2 { +namespace fbgemm { /** * @brief A struct to conveniently store all convolution parameters. @@ -23,7 +23,9 @@ struct conv_param_t { int G; ///< Number of Groups std::array K; ///< Filter (Kernel) dimensions std::array stride; //< Strides - std::array pad; //< Padding (assume symmetric padding) + std::array + pad; //< Padding (first SPATIAL_DIM is for prev/top/left padding, second + //SPATIAL_DIM is for next/bottom/right padding) std::array dilation; //< Kernel dilation // The following are derived parameters @@ -42,7 +44,7 @@ struct conv_param_t { int g, std::array k, std::array strd, - std::array pd) + std::array pd) : MB(mb), IC(ic), OC(oc), @@ -53,7 +55,7 @@ struct conv_param_t { pad(pd) { for (int d = 0; d < SPATIAL_DIM; ++d) { dilation[d] = 1; - IN_DIMP[d] = IN_DIM[d] + 2 * pad[d]; + IN_DIMP[d] = IN_DIM[d] + pad[d] + pad[SPATIAL_DIM + d]; OUT_DIM[d] = (IN_DIMP[d] - K[d]) / stride[d] + 1; } } @@ -89,10 +91,10 @@ struct conv_param_t { out += "stride_" + dim_string[3 - SPATIAL_DIM + d] + ":" + std::to_string(stride[d]) + ", "; } - for (int d = 0; d < SPATIAL_DIM; ++d) { - out += "pad_" + dim_string[3 - SPATIAL_DIM + d] + ":" + + for (int d = 0; d < SPATIAL_DIM * 2; ++d) { + out += "pad_" + dim_string[3 - (SPATIAL_DIM % 3) + d] + ":" + std::to_string(pad[d]); - if (d < SPATIAL_DIM - 1) { + if (d < SPATIAL_DIM * 2 - 1) { out += ", "; } } @@ -106,7 +108,7 @@ struct conv_param_t { } for (int d = 0; d < SPATIAL_DIM; ++d) { out += "pad_" + std::to_string(d) + ":" + std::to_string(pad[d]); - if (d < SPATIAL_DIM - 1) { + if (d < SPATIAL_DIM * 2 - 1) { out += ", "; } } @@ -115,4 +117,4 @@ struct conv_param_t { } }; -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index 081c03b..824a966 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -32,7 +32,7 @@ extern double postprocessing_time; extern double run_time; #endif -namespace fbgemm2 { +namespace fbgemm { /** * @brief Templatized struct for packing parameters for A and B matrices. @@ -969,4 +969,4 @@ static void* fbgemmAlignedAlloc(size_t __align, size_t __size) { return aligned_mem; } -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/include/fbgemm/FbgemmFP16.h b/include/fbgemm/FbgemmFP16.h index 0428d93..1083fa3 100644 --- a/include/fbgemm/FbgemmFP16.h +++ b/include/fbgemm/FbgemmFP16.h @@ -17,7 +17,7 @@ #include "Types.h" #include "Utils.h" -namespace fbgemm2 { +namespace fbgemm { /// class that performs packing of matrix in /// row-major format into diff --git a/include/fbgemm/FbgemmI8Spmdm.h b/include/fbgemm/FbgemmI8Spmdm.h index 264b70e..3e040ad 100644 --- a/include/fbgemm/FbgemmI8Spmdm.h +++ b/include/fbgemm/FbgemmI8Spmdm.h @@ -23,7 +23,7 @@ extern double spmdm_transpose_Nx32_time; extern double spmdm_run_time; #endif -namespace fbgemm2 { +namespace fbgemm { /** * @brief A class to represent a matrix in Compressed Sparse Column (CSC) @@ -98,4 +98,4 @@ class CompressedSparseColumn { mutable std::int32_t old_nnz_; }; -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/include/fbgemm/Types.h b/include/fbgemm/Types.h index c5c62dd..0b25c22 100644 --- a/include/fbgemm/Types.h +++ b/include/fbgemm/Types.h @@ -10,7 +10,7 @@ #include #include -namespace fbgemm2 { +namespace fbgemm { typedef struct __attribute__((aligned(2))) __f16 { uint16_t x; diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h index 22e5a16..7283932 100644 --- a/include/fbgemm/Utils.h +++ b/include/fbgemm/Utils.h @@ -8,7 +8,7 @@ #include #include -namespace fbgemm2 { +namespace fbgemm { /** * @brief Helper struct to type specialize for uint8 and int8 together. @@ -120,4 +120,4 @@ void transpose_16x16( float* dst, int ld_dst); -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/ExecuteKernel.cc b/src/ExecuteKernel.cc index 0e3d122..3bc7e36 100644 --- a/src/ExecuteKernel.cc +++ b/src/ExecuteKernel.cc @@ -9,4 +9,4 @@ #include "fbgemm/Fbgemm.h" #include "fbgemm/Utils.h" -namespace fbgemm2 {} // namespace fbgemm2 +namespace fbgemm {} // namespace fbgemm diff --git a/src/ExecuteKernelGeneric.h b/src/ExecuteKernelGeneric.h index e83e943..4649912 100644 --- a/src/ExecuteKernelGeneric.h +++ b/src/ExecuteKernelGeneric.h @@ -9,7 +9,7 @@ #include "fbgemm/Fbgemm.h" #include "GenerateKernel.h" -namespace fbgemm2 { +namespace fbgemm { /** * @brief Execute Engine for the macro-kernel and output processing. @@ -61,4 +61,4 @@ class ExecuteKernel : public CodeGenBase< ///< the C tile in the macro-kernel. }; -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index e091a87..b3f8c15 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -14,7 +14,7 @@ double kernel_time = 0.0; double postprocessing_time = 0.0; #endif -namespace fbgemm2 { +namespace fbgemm { template ExecuteKernel< @@ -327,6 +327,18 @@ template class ExecuteKernel< int32_t, memCopy<>>; +template class ExecuteKernel< + PackAWithIm2Col, + PackBMatrix, + uint8_t, + ReQuantizeOutput>; + +template class ExecuteKernel< + PackAWithIm2Col, + PackBMatrix, + uint8_t, + ReQuantizeOutput>; + template class ExecuteKernel< PackAWithRowOffset, PackBMatrix, @@ -345,6 +357,18 @@ template class ExecuteKernel< int32_t, memCopy<>>; +template class ExecuteKernel< + PackAWithIm2Col, + PackBMatrix, + uint8_t, + ReQuantizeOutput>; + +template class ExecuteKernel< + PackAWithIm2Col, + PackBMatrix, + uint8_t, + ReQuantizeOutput>; + template class ExecuteKernel< PackAWithQuantRowOffset, PackBMatrix, @@ -363,4 +387,4 @@ template class ExecuteKernel< int32_t, DoNothing>; -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/ExecuteKernelU8S8.h b/src/ExecuteKernelU8S8.h index 0bd7fc5..dfa6577 100644 --- a/src/ExecuteKernelU8S8.h +++ b/src/ExecuteKernelU8S8.h @@ -7,7 +7,7 @@ #pragma once #include "ExecuteKernel.h" -namespace fbgemm2 { +namespace fbgemm { /** * @brief Execute Engine of uint 8 and int8 matrix @@ -70,4 +70,4 @@ class ExecuteKernel< int nbSize_; ///< block size in the n dimension. }; -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index 9195a05..f8f0d34 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -15,9 +15,9 @@ double computing_time = 0.0; double run_time = 0.0; #endif -using namespace fbgemm2; +using namespace fbgemm; -namespace fbgemm2 { +namespace fbgemm { template < typename packingAMatrix, @@ -245,6 +245,26 @@ template void fbgemmPacked( int thread_id, int num_threads); +template void fbgemmPacked( + PackMatrix, uint8_t, int32_t>& packA, + PackMatrix, int8_t, int32_t>& packB, + uint8_t* C, + int32_t* C_buffer, + uint32_t ldc, + const ReQuantizeOutput& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix, uint8_t, int32_t>& packA, + PackMatrix, int8_t, int32_t>& packB, + uint8_t* C, + int32_t* C_buffer, + uint32_t ldc, + const ReQuantizeOutput& outProcess, + int thread_id, + int num_threads); + template void fbgemmPacked( PackMatrix, uint8_t, int32_t>& packA, @@ -360,6 +380,26 @@ template void fbgemmPacked( int thread_id, int num_threads); +template void fbgemmPacked( + PackMatrix, uint8_t, int16_t>& packA, + PackMatrix, int8_t, int16_t>& packB, + uint8_t* C, + int32_t* C_buffer, + uint32_t ldc, + const ReQuantizeOutput& outProcess, + int thread_id, + int num_threads); + +template void fbgemmPacked( + PackMatrix, uint8_t, int16_t>& packA, + PackMatrix, int8_t, int16_t>& packB, + uint8_t* C, + int32_t* C_buffer, + uint32_t ldc, + const ReQuantizeOutput& outProcess, + int thread_id, + int num_threads); + template void fbgemmPacked( PackMatrix, uint8_t, int16_t>& packA, PackMatrix, int8_t, int16_t>& packB, @@ -380,4 +420,4 @@ template void fbgemmPacked( int thread_id, int num_threads); -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/FbgemmFP16.cc b/src/FbgemmFP16.cc index eff173f..f54feb8 100644 --- a/src/FbgemmFP16.cc +++ b/src/FbgemmFP16.cc @@ -14,7 +14,7 @@ using namespace std; -namespace fbgemm2 { +namespace fbgemm { /// class that performs packing of matrix in /// row-major or col-major format into diff --git a/src/FbgemmFP16UKernels.cc b/src/FbgemmFP16UKernels.cc index ec1b297..cc1273e 100644 --- a/src/FbgemmFP16UKernels.cc +++ b/src/FbgemmFP16UKernels.cc @@ -6,7 +6,7 @@ */ #include "FbgemmFP16UKernels.h" -namespace fbgemm2 { +namespace fbgemm { void __attribute__ ((noinline)) gemmkernel_1x1_AVX2_fA0fB0fC0(GemmParams *gp) { @@ -2200,4 +2200,4 @@ asm volatile ); } -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/FbgemmFP16UKernels.h b/src/FbgemmFP16UKernels.h index bf7f247..88a136b 100644 --- a/src/FbgemmFP16UKernels.h +++ b/src/FbgemmFP16UKernels.h @@ -11,7 +11,7 @@ #include #include "fbgemm/Types.h" -namespace fbgemm2 { +namespace fbgemm { using fp16 = float16; using fp32 = float; @@ -35,6 +35,6 @@ void __attribute__ ((noinline)) gemmkernel_14x1_AVX2_fA0fB0fC0(GemmParams *gp); typedef void (* funcptr_fp16) (GemmParams *gp); ; -} // namespace fbgemm2 +} // namespace fbgemm #endif diff --git a/src/FbgemmI8Depthwise.cc b/src/FbgemmI8Depthwise.cc index 551e98e..ed64859 100644 --- a/src/FbgemmI8Depthwise.cc +++ b/src/FbgemmI8Depthwise.cc @@ -18,7 +18,8 @@ using namespace std; -namespace fbgemm2 { +namespace fbgemm +{ static array, 8> masks = {{ { 0, 0, 0, 0, 0, 0, 0, 0, }, @@ -2767,4 +2768,4 @@ void depthwise_3x3_per_channel_quantization_pad_1( } } -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/FbgemmI8Depthwise.h b/src/FbgemmI8Depthwise.h index bc62c84..a60cb58 100644 --- a/src/FbgemmI8Depthwise.h +++ b/src/FbgemmI8Depthwise.h @@ -8,7 +8,7 @@ #include -namespace fbgemm2 +namespace fbgemm { // KERNEL_PROD is the product of all kernels. @@ -102,4 +102,4 @@ void depthwise_3x3x3_pad_1( const std::int32_t* col_offsets, const std::int32_t* bias, bool fuse_relu = false, int thread_id = 0, int num_threads = 1); -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/FbgemmI8Spmdm.cc b/src/FbgemmI8Spmdm.cc index 723a467..12e1cb2 100644 --- a/src/FbgemmI8Spmdm.cc +++ b/src/FbgemmI8Spmdm.cc @@ -25,7 +25,7 @@ double spmdm_run_time = 0.0; using namespace std; -namespace fbgemm2 { +namespace fbgemm { CompressedSparseColumn::CompressedSparseColumn(int num_of_rows, int num_of_cols) : num_rows_(num_of_rows), @@ -505,4 +505,4 @@ void CompressedSparseColumn::SpMDM( #endif } -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/GenerateKernel.h b/src/GenerateKernel.h index 30160d1..5a75c33 100644 --- a/src/GenerateKernel.h +++ b/src/GenerateKernel.h @@ -11,7 +11,7 @@ #include #include "fbgemm/Fbgemm.h" -namespace fbgemm2 { +namespace fbgemm { namespace x86 = asmjit::x86; @@ -151,4 +151,4 @@ thread_local std::map< typename CodeGenBase::jit_micro_kernel_fp> CodeGenBase::codeCache_; -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/GenerateKernelU8S8S32ACC16.cc b/src/GenerateKernelU8S8S32ACC16.cc index 451592a..b9ab727 100644 --- a/src/GenerateKernelU8S8S32ACC16.cc +++ b/src/GenerateKernelU8S8S32ACC16.cc @@ -7,7 +7,7 @@ #include #include "GenerateKernel.h" -namespace fbgemm2 { +namespace fbgemm { namespace x86 = asmjit::x86; @@ -295,4 +295,4 @@ CodeGenBase::getOrCreate( return fn; } -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/GenerateKernelU8S8S32ACC16_avx512.cc b/src/GenerateKernelU8S8S32ACC16_avx512.cc index cab43ed..eeeaea0 100644 --- a/src/GenerateKernelU8S8S32ACC16_avx512.cc +++ b/src/GenerateKernelU8S8S32ACC16_avx512.cc @@ -7,7 +7,7 @@ #include #include "GenerateKernel.h" -namespace fbgemm2 { +namespace fbgemm { namespace x86 = asmjit::x86; @@ -298,4 +298,4 @@ CodeGenBase::getOrCreate( return fn; } -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/GenerateKernelU8S8S32ACC32.cc b/src/GenerateKernelU8S8S32ACC32.cc index 9529f5d..31c9996 100644 --- a/src/GenerateKernelU8S8S32ACC32.cc +++ b/src/GenerateKernelU8S8S32ACC32.cc @@ -7,7 +7,7 @@ #include #include "GenerateKernel.h" -namespace fbgemm2 { +namespace fbgemm { namespace x86 = asmjit::x86; @@ -312,4 +312,4 @@ CodeGenBase::getOrCreate( return fn; } -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/GenerateKernelU8S8S32ACC32_avx512.cc b/src/GenerateKernelU8S8S32ACC32_avx512.cc index 251a8b8..0621bb0 100644 --- a/src/GenerateKernelU8S8S32ACC32_avx512.cc +++ b/src/GenerateKernelU8S8S32ACC32_avx512.cc @@ -7,7 +7,7 @@ #include #include "GenerateKernel.h" -namespace fbgemm2 { +namespace fbgemm { namespace x86 = asmjit::x86; @@ -314,4 +314,4 @@ CodeGenBase::getOrCreate( return fn; } -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc index 8f260ba..cd991ca 100644 --- a/src/PackAMatrix.cc +++ b/src/PackAMatrix.cc @@ -10,7 +10,7 @@ #include #include "fbgemm/Fbgemm.h" -namespace fbgemm2 { +namespace fbgemm { template PackAMatrix::PackAMatrix( @@ -162,4 +162,4 @@ void PackAMatrix::printPackedMatrix(std::string name) { template class PackAMatrix; template class PackAMatrix; -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc index 8dde696..71efced 100644 --- a/src/PackAWithIm2Col.cc +++ b/src/PackAWithIm2Col.cc @@ -13,7 +13,7 @@ #include "fbgemm/Fbgemm.h" -namespace fbgemm2 { +namespace fbgemm { template PackAWithIm2Col::PackAWithIm2Col( @@ -82,9 +82,122 @@ void PackAWithIm2Col::pack(const block_type_t& block) { row_interleave_B_ * row_interleave_B_}; BaseType::packedBlock(block_p); T* out = BaseType::getBuf(); + // accumulate into row offset? + bool row_offset_acc = (block.col_start != 0); + int32_t* row_offset_buf = getRowOffsetBuffer(); - if (SPATIAL_DIM == 3) { // static if + bool point_wise = true; + for (int d = 0; d < SPATIAL_DIM; ++d) { + if (conv_p_.K[d] != 1 || conv_p_.pad[d] != 0 || conv_p_.stride[d] != 1 || + conv_p_.dilation[d] != 1) { + point_wise = false; + break; + } + } + for (int d = SPATIAL_DIM; d < SPATIAL_DIM * 2; ++d) { + if (conv_p_.pad[d] != 0) { + point_wise = false; + break; + } + } + + if (point_wise) { + int32_t ld = this->numCols(); for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { + int buf_idx = i - block.row_start; + memcpy( + out + buf_idx * BaseType::blockColSize(), + sdata_ + i * ld + block.col_start, + block.col_size * sizeof(T)); + // zero fill + for (int j = block.col_size; j < block_p.col_size; ++j) { + out[buf_idx * BaseType::blockColSize() + j] = 0; + } + int32_t row_sum = + row_offset_acc ? row_offset_buf[i - block.row_start] : 0; + __m256i sum_v = _mm256_setzero_si256(); + __m256i one_epi16_v = _mm256_set1_epi16(1); + __m256i one_epi8_v = _mm256_set1_epi8(1); + for (int j = block.col_start; + j < block.col_start + block.col_size / 32 * 32; + j += 32) { + __m256i src_v = _mm256_loadu_si256( + reinterpret_cast<__m256i const*>(sdata_ + i * ld + j)); + sum_v = _mm256_add_epi32( + sum_v, + _mm256_madd_epi16( + _mm256_maddubs_epi16(src_v, one_epi8_v), one_epi16_v)); + } + for (int j = block.col_start + block.col_size / 32 * 32; + j < block.col_start + block.col_size; + ++j) { + row_sum += sdata_[i * ld + j]; + } + // alignas(64) std::array temp; + alignas(64) std::int32_t temp[8]; + //_mm256_store_si256(reinterpret_cast<__m256i*>(temp.data()), sum_v); + _mm256_store_si256(reinterpret_cast<__m256i*>(temp), sum_v); + for (int k = 0; k < 8; ++k) { + row_sum += temp[k]; + } + row_offset_buf[i - block.row_start] = row_sum; + } + + return; + } + + if (SPATIAL_DIM != 2 && SPATIAL_DIM != 3) { + assert(false && "unsupported conv dimension"); + } + + for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { + if (SPATIAL_DIM == 2) { // static if + int n = i / (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]); + int hw = i % (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]); + int w = hw % conv_p_.OUT_DIM[1]; + int h = hw / conv_p_.OUT_DIM[1]; + for (int j = block.col_start; + j < block.col_start + block.col_size + conv_p_.IC - 1; + j += conv_p_.IC) { + int j_blk_id = j / conv_p_.IC; + // max( j_blk_id * IC, START) -> min( END, (j_blk_id + 1) * IC ) + int j_blk_start = std::max(j_blk_id * conv_p_.IC, block.col_start); + int j_blk_end = std::min( + (j_blk_id + 1) * conv_p_.IC, block.col_start + block.col_size); + if (j_blk_start >= j_blk_end) { + break; + } + + int rs = j / conv_p_.IC; + int s = rs % conv_p_.K[1]; + int r = rs / conv_p_.K[1]; + + int h_in = -conv_p_.pad[0] + h * conv_p_.stride[0] + r; + int w_in = -conv_p_.pad[1] + w * conv_p_.stride[1] + s; + + if (h_in < 0 || h_in >= conv_p_.IN_DIM[0] || w_in < 0 || + w_in >= conv_p_.IN_DIM[1]) { + // Please note that padding for convolution should be filled with + // zero_pt + std::memset( + &out + [(i - block.row_start) * BaseType::blockColSize() + + (j_blk_start - block.col_start)], + BaseType::zeroPoint(), + sizeof(T) * (j_blk_end - j_blk_start)); + } else { + std::memcpy( + &out + [(i - block.row_start) * BaseType::blockColSize() + + j_blk_start - block.col_start], + &sdata_ + [((n * conv_p_.IN_DIM[0] + h_in) * conv_p_.IN_DIM[1] + w_in) * + conv_p_.IC + + (j_blk_start % conv_p_.IC)], + sizeof(T) * (j_blk_end - j_blk_start)); + } + } + } else if (SPATIAL_DIM == 3) { // static if int n = i / (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1] * conv_p_.OUT_DIM[2]); int thw = @@ -139,72 +252,8 @@ void PackAWithIm2Col::pack(const block_type_t& block) { sizeof(T) * (j_blk_end - j_blk_start)); } } - // zero fill - // Please see the comment in PackAMatrix.cc for zero vs zero_pt fill. - if ((block_p.col_start + block_p.col_size) - - (block.col_start + block.col_size) > - 0) { - std::memset( - &out - [(i - block.row_start) * BaseType::blockColSize() + - (block.col_size)], - 0, - sizeof(T) * - ((block_p.col_start + block_p.col_size) - - (block.col_start + block.col_size))); - } } - return; - } - - assert(SPATIAL_DIM == 2 && "unsupported conv dimension"); - for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { - int n = i / (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]); - int hw = i % (conv_p_.OUT_DIM[0] * conv_p_.OUT_DIM[1]); - int w = hw % conv_p_.OUT_DIM[1]; - int h = hw / conv_p_.OUT_DIM[1]; - for (int j = block.col_start; - j < block.col_start + block.col_size + conv_p_.IC - 1; - j += conv_p_.IC) { - int j_blk_id = j / conv_p_.IC; - // max( j_blk_id * IC, START) -> min( END, (j_blk_id + 1) * IC ) - int j_blk_start = std::max(j_blk_id * conv_p_.IC, block.col_start); - int j_blk_end = std::min( - (j_blk_id + 1) * conv_p_.IC, block.col_start + block.col_size); - if (j_blk_start >= j_blk_end) { - break; - } - - int rs = j / conv_p_.IC; - int s = rs % conv_p_.K[1]; - int r = rs / conv_p_.K[1]; - - int h_in = -conv_p_.pad[0] + h * conv_p_.stride[0] + r; - int w_in = -conv_p_.pad[1] + w * conv_p_.stride[1] + s; - - if (h_in < 0 || h_in >= conv_p_.IN_DIM[0] || w_in < 0 || - w_in >= conv_p_.IN_DIM[1]) { - // Please note that padding for convolution should be filled with - // zero_pt - std::memset( - &out - [(i - block.row_start) * BaseType::blockColSize() + - (j_blk_start - block.col_start)], - BaseType::zeroPoint(), - sizeof(T) * (j_blk_end - j_blk_start)); - } else { - std::memcpy( - &out - [(i - block.row_start) * BaseType::blockColSize() + - j_blk_start - block.col_start], - &sdata_ - [((n * conv_p_.IN_DIM[0] + h_in) * conv_p_.IN_DIM[1] + w_in) * - conv_p_.IC + - (j_blk_start % conv_p_.IC)], - sizeof(T) * (j_blk_end - j_blk_start)); - } - } // zero fill // Please see the comment in PackAMatrix.cc for zero vs zero_pt fill. if ((block_p.col_start + block_p.col_size) - @@ -219,7 +268,33 @@ void PackAWithIm2Col::pack(const block_type_t& block) { ((block_p.col_start + block_p.col_size) - (block.col_start + block.col_size))); } - } + + // TODO: skip row_offset computation when B_zero_point is 0 + int32_t row_sum = + row_offset_acc ? row_offset_buf[i - block.row_start] : 0; + + __m256i sum_v = _mm256_setzero_si256(); + __m256i one_epi16_v = _mm256_set1_epi16(1); + __m256i one_epi8_v = _mm256_set1_epi8(1); + for (int j = 0; j < block.col_size / 32 * 32; j += 32) { + __m256i src_v = _mm256_loadu_si256(reinterpret_cast<__m256i const*>( + out + (i - block.row_start) * this->blockColSize() + j)); + sum_v = _mm256_add_epi32( + sum_v, + _mm256_madd_epi16( + _mm256_maddubs_epi16(src_v, one_epi8_v), one_epi16_v)); + } + for (int j = block.col_size / 32 * 32; j < block.col_size; ++j) { + row_sum += out[(i - block.row_start) * this->blockColSize() + j]; + } + alignas(64) int32_t temp[8]; + _mm256_store_si256(reinterpret_cast<__m256i*>(temp), sum_v); + for (int k = 0; k < 8; ++k) { + row_sum += temp[k]; + } + + row_offset_buf[i - block.row_start] = row_sum; + } // for each i } template @@ -267,4 +342,4 @@ template class PackAWithIm2Col; template class PackAWithIm2Col; template class PackAWithIm2Col; -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc index 878503f..485afb1 100644 --- a/src/PackBMatrix.cc +++ b/src/PackBMatrix.cc @@ -10,7 +10,7 @@ #include #include "fbgemm/Fbgemm.h" -namespace fbgemm2 { +namespace fbgemm { template PackBMatrix::PackBMatrix( @@ -163,13 +163,17 @@ bool PackBMatrix::equals(const PackBMatrix& that) const { return false; } - return memcmp( - BaseType::buf_, - that.buf_, - BaseType::blockRows() * BaseType::brow_ * BaseType::blockCols() * - BaseType::bcol_ * sizeof(T)) == 0; + for (int i = 0; i < this->numRows(); ++i) { + for (int j = 0; j < this->numCols(); ++j) { + if (this->buf_[addr(i, j)] != that.buf_[that.addr(i, j)]) { + return false; + } + } + } + + return true; } template class PackBMatrix; template class PackBMatrix; -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc index 37b4e88..fd4c766 100644 --- a/src/PackMatrix.cc +++ b/src/PackMatrix.cc @@ -11,7 +11,7 @@ #include "fbgemm/ConvUtils.h" #include "fbgemm/Fbgemm.h" -namespace fbgemm2 { +namespace fbgemm { template PackMatrix::PackMatrix( @@ -91,4 +91,4 @@ template class PackMatrix< template class PackMatrix, uint8_t, int16_t>; template class PackMatrix, int8_t, int16_t>; -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/PackWithQuantRowOffset.cc b/src/PackWithQuantRowOffset.cc index 5f60faa..15cd737 100644 --- a/src/PackWithQuantRowOffset.cc +++ b/src/PackWithQuantRowOffset.cc @@ -13,7 +13,7 @@ #include #include "fbgemm/Fbgemm.h" -namespace fbgemm2 { +namespace fbgemm { template PackAWithQuantRowOffset::PackAWithQuantRowOffset( @@ -255,4 +255,4 @@ int PackAWithQuantRowOffset::rowOffsetBufferSize() { template class PackAWithQuantRowOffset; -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/PackWithRowOffset.cc b/src/PackWithRowOffset.cc index fa1f2b0..dec3f70 100644 --- a/src/PackWithRowOffset.cc +++ b/src/PackWithRowOffset.cc @@ -12,7 +12,7 @@ #include #include "fbgemm/Fbgemm.h" -namespace fbgemm2 { +namespace fbgemm { template PackAWithRowOffset::PackAWithRowOffset( @@ -211,4 +211,4 @@ int PackAWithRowOffset::rowOffsetBufferSize() { template class PackAWithRowOffset; template class PackAWithRowOffset; -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc index 4b919c1..dc41c27 100644 --- a/src/RefImplementations.cc +++ b/src/RefImplementations.cc @@ -13,7 +13,7 @@ using namespace std; -namespace fbgemm2 { +namespace fbgemm { void requantize_u8acc32_ref( int M, @@ -195,7 +195,7 @@ void spmdm_ref( int M, const uint8_t* A, int lda, - fbgemm2::CompressedSparseColumn& B, + fbgemm::CompressedSparseColumn& B, bool accumulation, int32_t* C, int ldc) { @@ -746,4 +746,4 @@ void depthwise_3x3x3_pad_1_ref( } }; -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/RefImplementations.h b/src/RefImplementations.h index 69d060a..9e81ce1 100644 --- a/src/RefImplementations.h +++ b/src/RefImplementations.h @@ -12,7 +12,7 @@ #include "fbgemm/ConvUtils.h" #include "fbgemm/FbgemmI8Spmdm.h" -namespace fbgemm2 { +namespace fbgemm { /** * @brief Reference implementation of requantization step. @@ -283,4 +283,4 @@ void depthwise_3x3x3_pad_1_ref( const std::int32_t* col_offsets, const std::int32_t* bias); -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/Utils.cc b/src/Utils.cc index 10ab469..45aafd3 100644 --- a/src/Utils.cc +++ b/src/Utils.cc @@ -15,7 +15,7 @@ #include #include -namespace fbgemm2 { +namespace fbgemm { /** * @brief Compare the reference and test result matrix to check the correctness. @@ -354,4 +354,4 @@ void transpose_simd( } } -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/Utils_avx512.cc b/src/Utils_avx512.cc index b6bf413..62a99ba 100644 --- a/src/Utils_avx512.cc +++ b/src/Utils_avx512.cc @@ -9,7 +9,7 @@ #include -namespace fbgemm2 { +namespace fbgemm { inline void transpose_kernel_16x16_avx512( const float* src, @@ -240,4 +240,4 @@ void transpose_16x16( transpose_8x8(M - ib, N, &src[ib * ld_src], ld_src, &dst[ib], ld_dst); } -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/src/codegen_fp16fp32.cc b/src/codegen_fp16fp32.cc index 8e36c85..2b2b022 100644 --- a/src/codegen_fp16fp32.cc +++ b/src/codegen_fp16fp32.cc @@ -79,7 +79,7 @@ int main() { hdrfile << "#include \n"; hdrfile << "#include \n"; hdrfile << "#include \"fbgemm/Types.h\"\n"; - hdrfile << "using fp16 = fbgemm2::float16;\n"; + hdrfile << "using fp16 = fbgemm::float16;\n"; hdrfile << "using fp32 = float;\n"; hdrfile << "struct GemmParams {uint64_t k; float *A; const fp16 *B;\n" "float *beta; uint64_t accum; float *C; uint64_t ldc;\n" diff --git a/test/FP16Test.cc b/test/FP16Test.cc index c346049..b5d8763 100644 --- a/test/FP16Test.cc +++ b/test/FP16Test.cc @@ -18,7 +18,7 @@ #endif using namespace std; -using namespace fbgemm2; +using namespace fbgemm; namespace { // The template parameter is transpose of A and B @@ -75,7 +75,7 @@ TEST_P(FBGemmFP16Test, Test) { aligned_vector A(m * k, 0.f); aligned_vector B(k * n, 0.f); - aligned_vector C(m * n, 0.f); + aligned_vector C(m * n, NAN); // initialize with small numbers randFill(A, 0, 4); diff --git a/test/I8DepthwiseTest.cc b/test/I8DepthwiseTest.cc index cfde880..d961612 100644 --- a/test/I8DepthwiseTest.cc +++ b/test/I8DepthwiseTest.cc @@ -19,7 +19,7 @@ using namespace std; -namespace fbgemm2 +namespace fbgemm { // From Xray OCR @@ -445,4 +445,4 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) { } // for each shape } // Test3x3PerChannelQuantization -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/test/I8DepthwiseTest.h b/test/I8DepthwiseTest.h index 38dea6f..77db34f 100644 --- a/test/I8DepthwiseTest.h +++ b/test/I8DepthwiseTest.h @@ -8,7 +8,7 @@ #include -namespace fbgemm2 +namespace fbgemm { // From ResNeXt-3D-101 @@ -35,4 +35,4 @@ static std::vector> shapes_3d = { { 1, 8, 4, 4, 4, 1, }, }; -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/test/I8SpmdmTest.cc b/test/I8SpmdmTest.cc index cd8a94c..16dd038 100644 --- a/test/I8SpmdmTest.cc +++ b/test/I8SpmdmTest.cc @@ -23,7 +23,7 @@ #include "bench/BenchUtils.h" using namespace std; -using namespace fbgemm2; +using namespace fbgemm; std::vector densities{0.0001f, 0.001f, 0.01f, 0.1f, 1.0f}; diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc index 3ac8d28..391b993 100644 --- a/test/Im2ColFusedRequantizeTest.cc +++ b/test/Im2ColFusedRequantizeTest.cc @@ -17,54 +17,54 @@ using namespace std; -namespace fbgemm2 { +namespace fbgemm { // From Faster-RCNN with ShuffleNet static vector> shapes = { // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w, pad_h, pad_w - conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {0, 0}), - conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1}), - conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {0, 0}), - conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1}), - // conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(3, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}), - // conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1}), - conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1}), + conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {0, 0, 0, 0}), + conv_param_t<>(1, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {0, 0, 0, 0}), + conv_param_t<>(2, 32, 32, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 0, 0}), + // conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {75, 100}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {109, 75}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 544, 544, {24, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 544, 544, {33, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 544, 544, {34, 50}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 544, 544, {36, 63}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 544, 544, {38, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 544, 544, {38, 40}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 544, 544, {47, 38}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(51, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(100, 1088, 1088, {7, 7}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1}), + // conv_param_t<>(1, 248, 248, {93, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(1, 248, 248, {128, 250}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(1, 248, 248, {133, 200}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(1, 248, 248, {150, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(1, 248, 248, {150, 151}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(1, 248, 248, {150, 158}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(1, 248, 248, {188, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(1, 248, 248, {225, 150}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {47, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {64, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {66, 125}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {67, 100}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {75, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {75, 76}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(1, 272, 272, {94, 75}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(51, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(3, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + // conv_param_t<>(100, 544, 544, {14, 14}, 1, {3, 3}, {2, 2}, {1, 1, 1, 1}), + conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 0, 0}), }; TEST(FBGemmIm2colTest, Acc32Test) { @@ -74,13 +74,40 @@ TEST(FBGemmIm2colTest, Acc32Test) { aligned_vector Bint8( conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0); aligned_vector Cint32_ref( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0.0f); - aligned_vector Cint32_fb( conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); + aligned_vector Cint8_ref(Cint32_ref.size(), 0); + aligned_vector Cint32_fb(Cint32_ref.size(), 0); + aligned_vector Cint8_fb(Cint32_ref.size(), 0); randFill(Aint8, 0, 80); int32_t Aint8_zero_point = 43; randFill(Bint8, -16, 16); + int32_t Bint8_zero_point = -30; + + float C_multiplier = 0.1234; + int32_t C_zero_pt = 5; + + int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1]; + int NDim = conv_p.OC; + int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC; + + // computing row offset + vector row_offsets(MDim); + vector Aint8_im2col(MDim * KDim); + im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data()); + row_offsets_u8acc32_ref( + MDim, KDim, KDim, Aint8_im2col.data(), row_offsets.data()); + + // computing column offset + vector col_offsets; + col_offsets.resize(NDim); + col_offsets_with_zero_pt_s8acc32_ref( + KDim, + NDim, + NDim, + Bint8.data(), + Bint8_zero_point, + col_offsets.data()); conv_ref( conv_p, @@ -89,8 +116,19 @@ TEST(FBGemmIm2colTest, Acc32Test) { Bint8.data(), Cint32_ref.data()); - int NDim = conv_p.OC; - int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC; + requantize_u8acc32_ref( + MDim, + NDim, + NDim, + Cint32_ref.data(), + Cint8_ref.data(), + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + row_offsets.data(), + col_offsets.data(), + nullptr); vector row_offset_buf; row_offset_buf.resize( @@ -102,17 +140,24 @@ TEST(FBGemmIm2colTest, Acc32Test) { PackBMatrix packedB( matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); - // no-op output process objects - DoNothing doNothing32BitObj; - memCopy<> memcopyObj(doNothing32BitObj); + DoNothing<> doNothingObj{}; + ReQuantizeOutput outputProcObj( + doNothingObj, + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + packA.getRowOffsetBuffer(), + col_offsets.data(), + nullptr); fbgemmPacked( packA, packedB, - Cint32_fb.data(), + Cint8_fb.data(), Cint32_fb.data(), NDim, - memcopyObj, + outputProcObj, 0, 1); @@ -121,11 +166,11 @@ TEST(FBGemmIm2colTest, Acc32Test) { for (int h = 0; h < conv_p.OUT_DIM[0]; ++h) { for (int w = 0; w < conv_p.OUT_DIM[1]; ++w) { for (int k = 0; k < conv_p.OC; ++k) { - int32_t expected = Cint32_ref + int32_t expected = Cint8_ref [((n * conv_p.OUT_DIM[0] + h) * conv_p.OUT_DIM[1] + w) * conv_p.OC + k]; - int32_t actual = Cint32_fb + int32_t actual = Cint8_fb [((n * conv_p.OUT_DIM[0] + h) * conv_p.OUT_DIM[1] + w) * conv_p.OC + k]; @@ -148,13 +193,40 @@ TEST(FBGemmIm2colTest, Acc16Test) { aligned_vector Bint8( conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0); aligned_vector Cint32_ref( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0.0f); - aligned_vector Cint32_fb( conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); + aligned_vector Cint8_ref(Cint32_ref.size(), 0); + aligned_vector Cint32_fb(Cint32_ref.size(), 0); + aligned_vector Cint8_fb(Cint32_ref.size(), 0); randFill(Aint8, 0, 5); int32_t Aint8_zero_point = 4; randFill(Bint8, -4, 4); + int32_t Bint8_zero_point = -2; + + float C_multiplier = 0.1234; + int32_t C_zero_pt = 5; + + int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1]; + int NDim = conv_p.OC; + int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC; + + // computing row offset + vector row_offsets(MDim); + vector Aint8_im2col(MDim * KDim); + im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data()); + row_offsets_u8acc32_ref( + MDim, KDim, KDim, Aint8_im2col.data(), row_offsets.data()); + + // computing column offset + vector col_offsets; + col_offsets.resize(NDim); + col_offsets_with_zero_pt_s8acc32_ref( + KDim, + NDim, + NDim, + Bint8.data(), + Bint8_zero_point, + col_offsets.data()); conv_ref( conv_p, @@ -163,8 +235,19 @@ TEST(FBGemmIm2colTest, Acc16Test) { Bint8.data(), Cint32_ref.data()); - int NDim = conv_p.OC; - int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC; + requantize_u8acc32_ref( + MDim, + NDim, + NDim, + Cint32_ref.data(), + Cint8_ref.data(), + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + row_offsets.data(), + col_offsets.data(), + nullptr); vector row_offset_buf; row_offset_buf.resize( @@ -176,17 +259,24 @@ TEST(FBGemmIm2colTest, Acc16Test) { PackBMatrix packedB( matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); - // no-op output process objects - DoNothing doNothing32BitObj; - memCopy<> memcopyObj(doNothing32BitObj); + DoNothing<> doNothingObj{}; + ReQuantizeOutput outputProcObj( + doNothingObj, + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + packA.getRowOffsetBuffer(), + col_offsets.data(), + nullptr); fbgemmPacked( packA, packedB, - Cint32_fb.data(), + Cint8_fb.data(), Cint32_fb.data(), NDim, - memcopyObj, + outputProcObj, 0, 1); @@ -195,11 +285,11 @@ TEST(FBGemmIm2colTest, Acc16Test) { for (int h = 0; h < conv_p.OUT_DIM[0]; ++h) { for (int w = 0; w < conv_p.OUT_DIM[1]; ++w) { for (int k = 0; k < conv_p.OC; ++k) { - int32_t expected = Cint32_ref + int32_t expected = Cint8_ref [((n * conv_p.OUT_DIM[0] + h) * conv_p.OUT_DIM[1] + w) * conv_p.OC + k]; - int32_t actual = Cint32_fb + int32_t actual = Cint8_fb [((n * conv_p.OUT_DIM[0] + h) * conv_p.OUT_DIM[1] + w) * conv_p.OC + k]; @@ -218,44 +308,75 @@ static vector> shapes_3d = { // MB, IC, OC, IT, IH, IW, G, KT, KH, KW, stride_t, stride_h, stride_w, // pad_t, pad_h, pad_w // conv_param_t< - // 3>(1, 3, 64, {32, 112, 112}, 1, {3, 7, 7}, {1, 2, 2}, {1, 3, 3}), + // 3>(1, 3, 64, {32, 112, 112}, 1, {3, 7, 7}, {1, 2, 2}, {1, 3, 3, 1, 3, + // 3}), // conv_param_t< - // 3>(1, 64, 64, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}), + // 3>(1, 64, 64, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, + // 0}), // conv_param_t< - // 3>(1, 64, 256, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}), + // 3>(1, 64, 256, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, + // 0}), // conv_param_t< - // 3>(1, 256, 64, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}), + // 3>(1, 256, 64, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, + // 0}), // conv_param_t< - // 3>(1, 256, 128, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}), + // 3>(1, 256, 128, {32, 56, 56}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, + // 0, 0}), // conv_param_t< - // 3>(1, 256, 512, {32, 56, 56}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0}), + // 3>(1, 256, 512, {32, 56, 56}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0, + // 0, 0}), // conv_param_t< - // 3>(1, 128, 512, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}), + // 3>(1, 128, 512, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, + // 0, 0}), // conv_param_t< - // 3>(1, 512, 128, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}), + // 3>(1, 512, 128, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, + // 0, 0}), // conv_param_t< - // 3>(1, 512, 256, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}), + // 3>(1, 512, 256, {16, 28, 28}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, + // 0, 0}), // conv_param_t< - // 3>(1, 512, 1024, {16, 28, 28}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0}), + // 3>(1, 512, 1024, {16, 28, 28}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0, + // 0, 0}), // conv_param_t< - // 3>(1, 256, 1024, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}), + // 3>(1, 256, 1024, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, + // 0, 0}), // conv_param_t< - // 3>(1, 1024, 256, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}), + // 3>(1, 1024, 256, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, + // 0, 0}), // conv_param_t< - // 3>(1, 1024, 512, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}), + // 3>(1, 1024, 512, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, + // 0, 0}), // conv_param_t< - // 3>(1, 1024, 2048, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0}), + // 3>(1, 1024, 2048, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0, + // 0, 0}), // conv_param_t< - // 3>(1, 2048, 512, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}), + // 3>(1, 2048, 512, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, + // 0, 0}), // conv_param_t< - // 3>(1, 512, 2048, {4, 7, 7}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}), - + // 3>(1, 512, 2048, {4, 7, 7}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, + // 0}), + conv_param_t<3>( + 1, + 3, + 4, + {32, 112, 112}, + 1, + {3, 7, 7}, + {1, 2, 2}, + {1, 3, 3, 1, 3, 3}), + conv_param_t<3>( + 1, + 3, + 4, + {32, 112, 112}, + 1, + {3, 7, 7}, + {1, 2, 2}, + {1, 3, 3, 1, 1, 0}), conv_param_t< - 3>(1, 3, 4, {32, 112, 112}, 1, {3, 7, 7}, {1, 2, 2}, {1, 3, 3}), + 3>(1, 8, 16, {4, 7, 7}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0, 0, 0, 0}), conv_param_t< - 3>(1, 8, 16, {4, 7, 7}, 1, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}), - conv_param_t< - 3>(1, 8, 16, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0}), + 3>(1, 8, 16, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0, 0, 0}), }; TEST(FBGemmIm2colTest, 3DAcc32Test) { @@ -267,17 +388,43 @@ TEST(FBGemmIm2colTest, 3DAcc32Test) { aligned_vector Bint8( conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC * conv_p.OC, 0); aligned_vector Cint32_ref( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OUT_DIM[2] * - conv_p.OC, - 0.0f); - aligned_vector Cint32_fb( conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OUT_DIM[2] * conv_p.OC, 0); + aligned_vector Cint8_ref(Cint32_ref.size(), 0); + aligned_vector Cint32_fb(Cint32_ref.size(), 0); + aligned_vector Cint8_fb(Cint32_ref.size(), 0); randFill(Aint8, 0, 80); int32_t Aint8_zero_point = 43; randFill(Bint8, -16, 16); + int32_t Bint8_zero_point = -30; + + float C_multiplier = 0.1234; + int32_t C_zero_pt = 5; + + int MDim = + conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OUT_DIM[2]; + int NDim = conv_p.OC; + int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC; + + // computing row offset + vector row_offsets(MDim); + vector Aint8_im2col(MDim * KDim); + im2col3d_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data()); + row_offsets_u8acc32_ref( + MDim, KDim, KDim, Aint8_im2col.data(), row_offsets.data()); + + // computing column offset + vector col_offsets; + col_offsets.resize(NDim); + col_offsets_with_zero_pt_s8acc32_ref( + KDim, + NDim, + NDim, + Bint8.data(), + Bint8_zero_point, + col_offsets.data()); conv3d_ref( conv_p, @@ -286,8 +433,19 @@ TEST(FBGemmIm2colTest, 3DAcc32Test) { Bint8.data(), Cint32_ref.data()); - int NDim = conv_p.OC; - int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC; + requantize_u8acc32_ref( + MDim, + NDim, + NDim, + Cint32_ref.data(), + Cint8_ref.data(), + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + row_offsets.data(), + col_offsets.data(), + nullptr); vector row_offset_buf; row_offset_buf.resize( @@ -297,19 +455,33 @@ TEST(FBGemmIm2colTest, 3DAcc32Test) { conv_p, Aint8.data(), nullptr, Aint8_zero_point, row_offset_buf.data()); PackBMatrix packedB( - matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); - - // no-op output process objects - DoNothing doNothing32BitObj; - memCopy<> memcopyObj(doNothing32BitObj); + matrix_op_t::NoTranspose, + KDim, + NDim, + Bint8.data(), + NDim, + nullptr, + 1, + Bint8_zero_point); + + DoNothing<> doNothingObj{}; + ReQuantizeOutput outputProcObj( + doNothingObj, + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + packA.getRowOffsetBuffer(), + col_offsets.data(), + nullptr); fbgemmPacked( packA, packedB, - Cint32_fb.data(), + Cint8_fb.data(), Cint32_fb.data(), NDim, - memcopyObj, + outputProcObj, 0, 1); @@ -319,13 +491,13 @@ TEST(FBGemmIm2colTest, 3DAcc32Test) { for (int h = 0; h < conv_p.OUT_DIM[1]; ++h) { for (int w = 0; w < conv_p.OUT_DIM[2]; ++w) { for (int k = 0; k < conv_p.OC; ++k) { - int32_t expected = Cint32_ref + int32_t expected = Cint8_ref [(((n * conv_p.OUT_DIM[0] + t) * conv_p.OUT_DIM[1] + h) * conv_p.OUT_DIM[2] + w) * conv_p.OC + k]; - int32_t actual = Cint32_fb + int32_t actual = Cint8_fb [(((n * conv_p.OUT_DIM[0] + t) * conv_p.OUT_DIM[1] + h) * conv_p.OUT_DIM[2] + w) * @@ -355,14 +527,40 @@ TEST(FBGemmIm2colTest, 3DAcc16Test) { conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OUT_DIM[2] * conv_p.OC, 0.0f); - aligned_vector Cint32_fb( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OUT_DIM[2] * - conv_p.OC, - 0); + aligned_vector Cint8_ref(Cint32_ref.size(), 0); + aligned_vector Cint32_fb(Cint32_ref.size(), 0); + aligned_vector Cint8_fb(Cint32_ref.size(), 0); randFill(Aint8, 0, 5); int32_t Aint8_zero_point = 4; randFill(Bint8, -4, 4); + int32_t Bint8_zero_point = -2; + + float C_multiplier = 0.1234; + int32_t C_zero_pt = 5; + + int MDim = + conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OUT_DIM[2]; + int NDim = conv_p.OC; + int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC; + + // computing row offset + vector row_offsets(MDim); + vector Aint8_im2col(MDim * KDim); + im2col3d_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data()); + row_offsets_u8acc32_ref( + MDim, KDim, KDim, Aint8_im2col.data(), row_offsets.data()); + + // computing column offset + vector col_offsets; + col_offsets.resize(NDim); + col_offsets_with_zero_pt_s8acc32_ref( + KDim, + NDim, + NDim, + Bint8.data(), + Bint8_zero_point, + col_offsets.data()); conv3d_ref( conv_p, @@ -371,8 +569,19 @@ TEST(FBGemmIm2colTest, 3DAcc16Test) { Bint8.data(), Cint32_ref.data()); - int NDim = conv_p.OC; - int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC; + requantize_u8acc32_ref( + MDim, + NDim, + NDim, + Cint32_ref.data(), + Cint8_ref.data(), + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + row_offsets.data(), + col_offsets.data(), + nullptr); vector row_offset_buf; row_offset_buf.resize( @@ -384,17 +593,24 @@ TEST(FBGemmIm2colTest, 3DAcc16Test) { PackBMatrix packedB( matrix_op_t::NoTranspose, KDim, NDim, Bint8.data(), NDim); - // no-op output process objects - DoNothing doNothing32BitObj; - memCopy<> memcopyObj(doNothing32BitObj); + DoNothing<> doNothingObj{}; + ReQuantizeOutput outputProcObj( + doNothingObj, + C_multiplier, + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point, + packA.getRowOffsetBuffer(), + col_offsets.data(), + nullptr); fbgemmPacked( packA, packedB, - Cint32_fb.data(), + Cint8_fb.data(), Cint32_fb.data(), NDim, - memcopyObj, + outputProcObj, 0, 1); @@ -404,13 +620,13 @@ TEST(FBGemmIm2colTest, 3DAcc16Test) { for (int h = 0; h < conv_p.OUT_DIM[1]; ++h) { for (int w = 0; w < conv_p.OUT_DIM[2]; ++w) { for (int k = 0; k < conv_p.OC; ++k) { - int32_t expected = Cint32_ref + int32_t expected = Cint8_ref [(((n * conv_p.OUT_DIM[0] + t) * conv_p.OUT_DIM[1] + h) * conv_p.OUT_DIM[2] + w) * conv_p.OC + k]; - int32_t actual = Cint32_fb + int32_t actual = Cint8_fb [(((n * conv_p.OUT_DIM[0] + t) * conv_p.OUT_DIM[1] + h) * conv_p.OUT_DIM[2] + w) * @@ -427,4 +643,4 @@ TEST(FBGemmIm2colTest, 3DAcc16Test) { } // for each shape } // Acc16Test -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc index 28e1114..82ae96f 100644 --- a/test/PackedRequantizeAcc16Test.cc +++ b/test/PackedRequantizeAcc16Test.cc @@ -23,7 +23,7 @@ #include "TestUtils.h" using namespace std; -using namespace fbgemm2; +using namespace fbgemm; std::vector transposeVals{matrix_op_t::NoTranspose, matrix_op_t::Transpose}; diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc index 4b1b5b8..e0c9850 100644 --- a/test/PackedRequantizeTest.cc +++ b/test/PackedRequantizeTest.cc @@ -24,7 +24,7 @@ #include "TestUtils.h" using namespace std; -using namespace fbgemm2; +using namespace fbgemm; std::vector transposeVals{matrix_op_t::NoTranspose, matrix_op_t::Transpose}; diff --git a/test/QuantizationHelpers.cc b/test/QuantizationHelpers.cc index 354519b..eab08de 100644 --- a/test/QuantizationHelpers.cc +++ b/test/QuantizationHelpers.cc @@ -12,7 +12,7 @@ using namespace std; -namespace fbgemm2 { +namespace fbgemm { /* * @brief Make sure we won't have overflows from vpmaddubsw instruction. */ @@ -54,4 +54,4 @@ template void avoidOverflow(int m, int n, int k, const uint8_t* Aint8, int8_t* B); template void avoidOverflow(int m, int n, int k, const uint8_t* Aint8, float* B); -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/test/QuantizationHelpers.h b/test/QuantizationHelpers.h index fdc6e02..42c3e08 100644 --- a/test/QuantizationHelpers.h +++ b/test/QuantizationHelpers.h @@ -7,7 +7,7 @@ #pragma once #include -namespace fbgemm2 { +namespace fbgemm { /* * @brief Make sure we won't have overflows from vpmaddubsw instruction. @@ -15,4 +15,4 @@ namespace fbgemm2 { template void avoidOverflow(int m, int n, int k, const uint8_t* Aint8, T* B); -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/test/TestUtils.cc b/test/TestUtils.cc index 702425e..5cc14ef 100644 --- a/test/TestUtils.cc +++ b/test/TestUtils.cc @@ -9,7 +9,7 @@ #include "fbgemm/Fbgemm.h" #include "bench/AlignedVec.h" -namespace fbgemm2 { +namespace fbgemm { template int compare_validate_buffers( @@ -97,4 +97,4 @@ template void transpose_matrix(float* ref, int n, int k); template void transpose_matrix(int32_t* ref, int n, int k); template void transpose_matrix(uint8_t* ref, int n, int k); template void transpose_matrix(int8_t* ref, int n, int k); -} // namespace fbgemm2 +} // namespace fbgemm diff --git a/test/TestUtils.h b/test/TestUtils.h index 559f816..6cc365f 100644 --- a/test/TestUtils.h +++ b/test/TestUtils.h @@ -8,7 +8,7 @@ #include #include -namespace fbgemm2 { +namespace fbgemm { /* * @brief Check and validate the buffers for reference and FBGEMM result. @@ -37,4 +37,4 @@ bool check_all_zero_entries(const T* test, int m, int n); */ template void transpose_matrix(T* ref, int n, int k); -} // namespace fbgemm2 +} // namespace fbgemm -- cgit v1.2.3