diff options
-rw-r--r-- | bench/BenchUtils.cc | 51 | ||||
-rw-r--r-- | bench/BenchUtils.h | 5 | ||||
-rw-r--r-- | bench/Depthwise3DBenchmark.cc | 22 | ||||
-rw-r--r-- | bench/DepthwiseBenchmark.cc | 22 | ||||
-rw-r--r-- | bench/FP16Benchmark.cc | 34 | ||||
-rw-r--r-- | bench/I8SpmdmBenchmark.cc | 11 | ||||
-rw-r--r-- | bench/Im2ColFusedRequantizeAcc16Benchmark.cc | 37 | ||||
-rw-r--r-- | bench/Im2ColFusedRequantizeAcc32Benchmark.cc | 36 | ||||
-rw-r--r-- | bench/PackedFloatInOutBenchmark.cc | 35 | ||||
-rw-r--r-- | bench/PackedRequantizeAcc16Benchmark.cc | 125 | ||||
-rw-r--r-- | bench/PackedRequantizeAcc32Benchmark.cc | 66 | ||||
-rw-r--r-- | include/fbgemm/Fbgemm.h | 46 | ||||
-rw-r--r-- | include/fbgemm/OutputProcessing-inl.h | 173 | ||||
-rw-r--r-- | src/ExecuteKernelU8S8.cc | 284 | ||||
-rw-r--r-- | src/Fbgemm.cc | 434 | ||||
-rw-r--r-- | src/QuantUtils.cc | 5 | ||||
-rw-r--r-- | src/RefImplementations.cc | 37 | ||||
-rw-r--r-- | src/RefImplementations.h | 18 | ||||
-rw-r--r-- | test/FP16Test.cc | 20 | ||||
-rw-r--r-- | test/I8DepthwiseTest.cc | 12 | ||||
-rw-r--r-- | test/I8SpmdmTest.cc | 11 | ||||
-rw-r--r-- | test/Im2ColFusedRequantizeTest.cc | 220 | ||||
-rw-r--r-- | test/PackedRequantizeAcc16Test.cc | 418 | ||||
-rw-r--r-- | test/PackedRequantizeTest.cc | 393 |
24 files changed, 1466 insertions, 1049 deletions
diff --git a/bench/BenchUtils.cc b/bench/BenchUtils.cc index db40ee0..f5ce9ef 100644 --- a/bench/BenchUtils.cc +++ b/bench/BenchUtils.cc @@ -5,30 +5,41 @@ * LICENSE file in the root directory of this source tree. */ #include "BenchUtils.h" + +#include <algorithm> #include <random> +#include <type_traits> + +#include <omp.h> namespace fbgemm { std::default_random_engine eng; template <typename T> -void randFill(aligned_vector<T>& vec, const int low, const int high) { - std::random_device r; - std::uniform_int_distribution<int> dis(low, high); - for (auto& v : vec) { - v = static_cast<T>(dis(eng)); - } +void randFill(aligned_vector<T>& vec, T low, T high, std::true_type) { + std::uniform_int_distribution<T> dis(low, high); + std::generate(vec.begin(), vec.end(), [&] { return dis(eng); }); +} + +template <typename T> +void randFill(aligned_vector<T>& vec, T low, T high, std::false_type) { + std::uniform_real_distribution<T> dis(low, high); + std::generate(vec.begin(), vec.end(), [&] { return dis(eng); }); +} + +template <typename T> +void randFill(aligned_vector<T>& vec, T low, T high) { + randFill(vec, low, high, std::is_integral<T>()); } template void -randFill<float>(aligned_vector<float>& vec, const int low, const int high); -template void -randFill<uint8_t>(aligned_vector<uint8_t>& vec, const int low, const int high); +randFill<float>(aligned_vector<float>& vec, float low, float high); template void -randFill<int8_t>(aligned_vector<int8_t>& vec, const int low, const int high); - +randFill<uint8_t>(aligned_vector<uint8_t>& vec, uint8_t low, uint8_t high); template void -randFill<int>(aligned_vector<int>& vec, const int low, const int high); +randFill<int8_t>(aligned_vector<int8_t>& vec, int8_t low, int8_t high); +template void randFill<int>(aligned_vector<int>& vec, int low, int high); void llc_flush(std::vector<char>& llc) { volatile char* data = llc.data(); @@ -37,4 +48,20 @@ void llc_flush(std::vector<char>& llc) { } } +int fbgemm_get_num_threads() { +#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP) + return 1; +#else + return omp_get_num_threads(); +#endif +} + +int fbgemm_get_thread_num() { +#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP) + return 0; +#else + return omp_get_thread_num(); +#endif +} + } // namespace fbgemm diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h index 8ca99df..da2ef2d 100644 --- a/bench/BenchUtils.h +++ b/bench/BenchUtils.h @@ -11,8 +11,11 @@ namespace fbgemm { template <typename T> -void randFill(aligned_vector<T>& vec, const int low, const int high); +void randFill(aligned_vector<T>& vec, T low, T high); void llc_flush(std::vector<char>& llc); +int fbgemm_get_num_threads(); +int fbgemm_get_thread_num(); + } // namespace fbgemm diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc index f53eeea..c65839b 100644 --- a/bench/Depthwise3DBenchmark.cc +++ b/bench/Depthwise3DBenchmark.cc @@ -62,10 +62,10 @@ int main() { aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K), C(C_ref.size()); - randFill(A, 0, 86); + randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; - randFill(B, -16, 16); + randFill<int8_t>(B, -16, 16); int32_t B_zero_point = 5; depthwise_3x3x3_pad_1_ref( @@ -129,13 +129,8 @@ int main() { t_begin = chrono::system_clock::now(); #pragma omp parallel { -#if _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); depthwise_3x3x3_pad_1( N, T, @@ -200,13 +195,8 @@ int main() { t_begin = chrono::system_clock::now(); #pragma omp parallel { -#if _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); depthwise_3x3x3_pad_1( N, T, diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc index 8e6d83d..b922f90 100644 --- a/bench/DepthwiseBenchmark.cc +++ b/bench/DepthwiseBenchmark.cc @@ -161,10 +161,10 @@ int main() { aligned_vector<int8_t> B(G * R * S); aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * G), C(C_ref.size()); - randFill(A, 0, 86); + randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; - randFill(B, -16, 16); + randFill<int8_t>(B, -16, 16); int32_t B_zero_point = 5; depthwise_3x3_pad_1_ref( @@ -221,13 +221,8 @@ int main() { t_begin = chrono::system_clock::now(); #pragma omp parallel { -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); depthwise_3x3_pad_1( N, H, @@ -279,13 +274,8 @@ int main() { t_begin = chrono::system_clock::now(); #pragma omp parallel { -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); depthwise_3x3_pad_1( N, H, diff --git a/bench/FP16Benchmark.cc b/bench/FP16Benchmark.cc index c03f18a..fd9de5b 100644 --- a/bench/FP16Benchmark.cc +++ b/bench/FP16Benchmark.cc @@ -73,20 +73,24 @@ void performance_test() { int n = s[1]; int k = s[2]; - aligned_vector<float> A(m * k, 0.f); - aligned_vector<float> B(k * n, 0.f); - aligned_vector<float> Cg(m * n, 1.f); - aligned_vector<float> Cp(m * n, NAN); + aligned_vector<float> C_ref(m * n, 1.f); + aligned_vector<float> C_fb(m * n, NAN); // initialize with small numbers - randFill(A, 0, 4); + aligned_vector<int> Aint(m * k); + randFill(Aint, 0, 4); + aligned_vector<float> A(Aint.begin(), Aint.end()); - randFill(B, 0, 4); + aligned_vector<int> Bint(k * n); + randFill(Bint, 0, 4); + aligned_vector<float> B(Bint.begin(), Bint.end()); PackedGemmMatrixFP16 Bp(btran, k, n, alpha, B.data()); if (beta != 0.0f) { - randFill(Cg, 0, 4); - Cp = Cg; + aligned_vector<int> Cint(C_ref.size()); + randFill(Cint, 0, 4); + C_ref.assign(Cint.begin(), Cint.end()); + C_fb = C_ref; } double nflops = 2.0 * (double)m * (double)n * (double)k * (double)NITER; @@ -111,17 +115,17 @@ void performance_test() { B.data(), (btran == matrix_op_t::NoTranspose) ? n : k, beta, - Cg.data(), + C_ref.data(), n); #endif cblas_gemm_compute( - matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data()); + matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data()); #ifdef USE_MKL // Compare results - for (auto i = 0; i < Cg.size(); i++) { - // printf("%f %f\n", Cg[i], Cp[i]); - assert(std::abs(Cg[i] - Cp[i]) < 1e-3); + for (auto i = 0; i < C_ref.size(); i++) { + // printf("%f %f\n", C_ref[i], C_fb[i]); + assert(std::abs(C_ref[i] - C_fb[i]) < 1e-3); } #endif } @@ -151,7 +155,7 @@ void performance_test() { B.data(), (btran == matrix_op_t::NoTranspose) ? n : k, beta, - Cg.data(), + C_ref.data(), n); t_end = chrono::system_clock::now(); if (it >= 0) { @@ -184,7 +188,7 @@ void performance_test() { t_begin = chrono::system_clock::now(); cblas_gemm_compute( - matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data()); + matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data()); t_end = chrono::system_clock::now(); if (it >= 0) { diff --git a/bench/I8SpmdmBenchmark.cc b/bench/I8SpmdmBenchmark.cc index 07b73dc..4223d0c 100644 --- a/bench/I8SpmdmBenchmark.cc +++ b/bench/I8SpmdmBenchmark.cc @@ -77,7 +77,7 @@ int main() { cout << M << ", " << N << ", " << K << ", "; aligned_vector<uint8_t> A(M * K); - randFill(A, 0, 255); + randFill<uint8_t>(A, 0, 255); fbgemm::CompressedSparseColumn B_csc(K, N); vector<int32_t> C(M * N); @@ -156,13 +156,8 @@ int main() { #pragma omp parallel #endif { -#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP) - int num_threads = 1; - int tid = 0; -#else - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); int i_per_thread = ((M + 31) / 32 + num_threads - 1) / num_threads * 32; int i_begin = std::min(tid * i_per_thread, M); diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc index 2115863..cb2edf5 100644 --- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc +++ b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc @@ -125,43 +125,29 @@ void performance_test() { chrono::time_point<chrono::high_resolution_clock> begin, end; for (auto conv_p : shapes) { - aligned_vector<float> Afp32( - conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0.0f); aligned_vector<uint8_t> Aint8( - conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0); - + conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC); aligned_vector<uint8_t> Aint8_out( conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] * - conv_p.K[1] * conv_p.IC, - 0); + conv_p.K[1] * conv_p.IC); - aligned_vector<float> Bfp32( - conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0.0f); aligned_vector<int8_t> Bint8( - conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0); + conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC); aligned_vector<int32_t> Cint32_ref( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); - - aligned_vector<int32_t> Cint32_fb( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); - - aligned_vector<int32_t> Cint32_fb2( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); + conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size()); + aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size()); // A matrix (input activations) - randFill(Afp32, 0, 5); + randFill<uint8_t>(Aint8, 0, 5); int32_t Aint8_zero_point = 4; - for (auto i = 0; i < Afp32.size(); ++i) { - Aint8[i] = static_cast<uint8_t>(Afp32[i]); - } + aligned_vector<float> Afp32(Aint8.begin(), Aint8.end()); // B matrix (weights) - randFill(Bfp32, -4, 4); + randFill<int8_t>(Bint8, -4, 4); // int32_t Bint8_zero_point = -3; - for (auto i = 0; i < Bfp32.size(); ++i) { - Bint8[i] = static_cast<int8_t>(Bfp32[i]); - } + aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end()); // reference implementation conv_ref( @@ -184,8 +170,7 @@ void performance_test() { double ttot = 0.0; string runType; - vector<int32_t> row_offset_buf; - row_offset_buf.resize( + vector<int32_t> row_offset_buf( PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, int16_t> packA( diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc index 7144e61..8e112d8 100644 --- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc +++ b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc @@ -125,45 +125,32 @@ void performance_test() { chrono::time_point<chrono::high_resolution_clock> begin, end; for (auto conv_p : shapes) { - aligned_vector<float> Afp32( - conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0.0f); aligned_vector<uint8_t> Aint8( - conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0); + conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC); aligned_vector<uint8_t> Aint8_out( conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] * - conv_p.K[1] * conv_p.IC, - 0); + conv_p.K[1] * conv_p.IC); - aligned_vector<float> Bfp32( - conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0.0f); aligned_vector<int8_t> Bint8( - conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0); + conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC); aligned_vector<int32_t> Cint32_ref( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); - - aligned_vector<int32_t> Cint32_fb( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); - - aligned_vector<int32_t> Cint32_fb2( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); + conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size()); + aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size()); // cout << conv_p.toString() << endl; // A matrix (input activations) - randFill(Afp32, 0, 5); + randFill<uint8_t>(Aint8, 0, 5); int32_t Aint8_zero_point = 4; - for (auto i = 0; i < Afp32.size(); ++i) { - Aint8[i] = static_cast<uint8_t>(Afp32[i]); - } + aligned_vector<float> Apf32(Aint8.begin(), Aint8.end()); // B matrix (weights) - randFill(Bfp32, -4, 4); + randFill<int8_t>(Bint8, -4, 4); // int32_t Bint8_zero_point = -3; - for (auto i = 0; i < Bfp32.size(); ++i) { - Bint8[i] = static_cast<int8_t>(Bfp32[i]); - } + aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end()); // reference implementation conv_ref( @@ -186,8 +173,7 @@ void performance_test() { double ttot = 0.0; string runType; - vector<int32_t> row_offset_buf; - row_offset_buf.resize( + vector<int32_t> row_offset_buf( PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, int32_t> packA( diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc index badbda0..79a750e 100644 --- a/bench/PackedFloatInOutBenchmark.cc +++ b/bench/PackedFloatInOutBenchmark.cc @@ -86,27 +86,27 @@ void performance_test() { int k = shape[2]; float alpha = 1.f, beta = 0.f; - aligned_vector<float> Afp32(m * k, 0.0f); - aligned_vector<uint8_t> Aint8(m * k, 0); + aligned_vector<float> Afp32(m * k); + aligned_vector<uint8_t> Aint8(Afp32.size()); - aligned_vector<float> Bfp32(k * n, 0.0f); - aligned_vector<int8_t> Bint8(k * n, 0); + aligned_vector<float> Bfp32(k * n); + aligned_vector<int8_t> Bint8(Bfp32.size()); - aligned_vector<float> Cfp32_mkl(m * n, 0.0f); - aligned_vector<float> Cfp32_fb(m * n, 0.0f); + aligned_vector<float> Cfp32_mkl(m * n); + aligned_vector<float> Cfp32_fb(Cfp32_mkl.size()); - aligned_vector<uint8_t> Cint8_fb(m * n, 0); - aligned_vector<int32_t> Cint32_buffer(m * n, 0); + aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size()); + aligned_vector<int32_t> Cint32_buffer(Cfp32_mkl.size()); // A matrix - randFill(Aint8, 0, 255); + randFill<uint8_t>(Aint8, 0, 255); float Aint8_scale = 0.11; int32_t Aint8_zero_point = 43; for (auto i = 0; i < Afp32.size(); ++i) { Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point); } - randFill(Bint8, -128, 127); + randFill<int8_t>(Bint8, -128, 127); avoidOverflow(m, n, k, Aint8.data(), Bint8.data()); float Bint8_scale = 0.49; @@ -116,10 +116,9 @@ void performance_test() { } // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(n); + vector<int32_t> col_offsets(n); col_offsets_with_zero_pt_s8acc32_ref( - k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data()); + k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n); double ttot = 0; std::string type; @@ -172,8 +171,7 @@ void performance_test() { // printMatrix(matrix_op_t::NoTranspose, col_offsets.data(), 1, n, n, "col // offsets before"); - vector<int32_t> row_offset_buf; - row_offset_buf.resize( + vector<int32_t> row_offset_buf( PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize()); PackAWithQuantRowOffset<uint8_t> packAN( @@ -201,12 +199,13 @@ void performance_test() { ReQuantizeForFloat<false> outputProcObj( doNothingObj, Aint8_scale, - Bint8_scale, + &Bint8_scale, Aint8_zero_point, - Bint8_zero_point, + &Bint8_zero_point, packAN.getRowOffsetBuffer(), col_offsets.data(), - nullptr); + nullptr, + n); ttot = 0; type = "FBGEMM_i8_acc32"; diff --git a/bench/PackedRequantizeAcc16Benchmark.cc b/bench/PackedRequantizeAcc16Benchmark.cc index fd48b24..f60332f 100644 --- a/bench/PackedRequantizeAcc16Benchmark.cc +++ b/bench/PackedRequantizeAcc16Benchmark.cc @@ -100,29 +100,26 @@ void performance_test() { int n = shape[1]; int k = shape[2]; - float alpha = 1.f, beta = 0.f; - aligned_vector<float> Afp32(m * k, 0.0f); - aligned_vector<uint8_t> Aint8(m * k, 0); + float alpha = 1.0f, beta = 0.0f; + aligned_vector<uint8_t> Aint8(m * k); + aligned_vector<int8_t> Bint8(k * n); - aligned_vector<float> Bfp32(k * n, 0.0f); - aligned_vector<int8_t> Bint8(k * n, 0); - - aligned_vector<float> Cfp32_mkl(m * n, 0.0f); + aligned_vector<float> Cfp32_mkl(m * n); // just used for result comparisons - aligned_vector<int32_t> Cint32_mkl(m * n, 0.0f); + aligned_vector<int32_t> Cint32_mkl(Cfp32_mkl.size()); // requantize results - aligned_vector<uint8_t> Cint8_mkl(m * n, 0.0f); - aligned_vector<int32_t> Cint32_fb(m * n, 0.0f); - aligned_vector<uint8_t> Cint8_fb(m * n, 0.0f); + aligned_vector<uint8_t> Cint8_mkl(Cfp32_mkl.size()); + aligned_vector<int32_t> Cint32_fb(Cfp32_mkl.size()); + aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size()); // A matrix - randFill(Afp32, 0, 50); + randFill<uint8_t>(Aint8, 0, 50); int32_t Aint8_zero_point = 43; - for (auto i = 0; i < Afp32.size(); ++i) { - Aint8[i] = static_cast<uint8_t>(Afp32[i]); - } + aligned_vector<float> Afp32(Aint8.begin(), Aint8.end()); - randFill(Bfp32, -8, 8); + randFill<int8_t>(Bint8, -8, 8); + aligned_vector<int8_t> Bint8_copy(Bint8); + aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end()); double nops = 2.0 * static_cast<double>(NITER) * m * n * k; double ttot = 0.0; @@ -163,9 +160,7 @@ void performance_test() { cout << setw(16) << runType << ", " << fixed << setw(5) << setprecision(1) << nops / ttot << endl; - for (auto i = 0; i < Cfp32_mkl.size(); ++i) { - Cint32_mkl[i] = static_cast<int32_t>(Cfp32_mkl[i]); - } + Cint32_mkl.assign(Cfp32_mkl.begin(), Cfp32_mkl.end()); #endif for (BenchmarkType bench_type : @@ -179,23 +174,19 @@ void performance_test() { bench_type == BenchmarkType::REQUANTIZATION) ? 0 : -30; - for (auto i = 0; i < Bfp32.size(); ++i) { - Bint8[i] = static_cast<int8_t>(Bfp32[i]); - } // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(n); + vector<int32_t> col_offsets(n); + Bint8 = Bint8_copy; col_offsets_with_zero_pt_s8acc32_ref( - k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data()); + k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n); - vector<int32_t> row_offsets; - row_offsets.resize(m); + vector<int32_t> row_offsets(m); row_offsets_u8acc32_ref(m, k, k, Aint8.data(), row_offsets.data()); float C_multiplier = - (bench_type == BenchmarkType::BARE_BONE) ? 1 : 0.1234; + (bench_type == BenchmarkType::BARE_BONE) ? 1.0f : 0.1234f; int32_t C_zero_pt = (bench_type == BenchmarkType::BARE_BONE) ? 0 : 5; // printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k, @@ -235,16 +226,14 @@ void performance_test() { n, Cint32_mkl.data(), Cint8_mkl.data(), - C_multiplier, + &C_multiplier, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + &Bint8_zero_point, row_offsets.data(), col_offsets.data(), - nullptr); // bias - - PackBMatrix<int8_t, int16_t> packedB( - matrix_op_t::NoTranspose, k, n, Bint8.data(), n); + nullptr, // bias + n); // ncols per quant group CompressedSparseColumn B_csc(k, n); @@ -254,30 +243,35 @@ void performance_test() { default_random_engine eng; binomial_distribution<> per_col_nnz_dist(k, density); - vector<int> row_indices(k); - - int total_nnz = 0; - for (int j = 0; j < n; ++j) { - B_csc.ColPtr()[j] = total_nnz; - - int nnz_of_j = per_col_nnz_dist(eng); - total_nnz += nnz_of_j; - - iota(row_indices.begin(), row_indices.end(), 0); - shuffle(row_indices.begin(), row_indices.end(), eng); - sort(row_indices.begin(), row_indices.begin() + nnz_of_j); - - for (int kidx = 0; kidx < nnz_of_j; ++kidx) { - B_csc.RowIdx().push_back(row_indices[kidx]); - // put the current B value - B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]); - // make current B value zero - Bint8[row_indices[kidx] * n + j] = 0; - // std::cout << "(" << row_indices[kidx] << ", " << j << ")" << - // endl; + if (bench_type == BenchmarkType::EVERYTHING) { + vector<int> row_indices(k); + + int total_nnz = 0; + for (int j = 0; j < n; ++j) { + B_csc.ColPtr()[j] = total_nnz; + + int nnz_of_j = per_col_nnz_dist(eng); + total_nnz += nnz_of_j; + + iota(row_indices.begin(), row_indices.end(), 0); + shuffle(row_indices.begin(), row_indices.end(), eng); + sort(row_indices.begin(), row_indices.begin() + nnz_of_j); + + for (int kidx = 0; kidx < nnz_of_j; ++kidx) { + B_csc.RowIdx().push_back(row_indices[kidx]); + // put the current B value + B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]); + // make current B value zero + Bint8[row_indices[kidx] * n + j] = 0; + // std::cout << "(" << row_indices[kidx] << ", " << j << ")" << + // endl; + } } + B_csc.ColPtr()[n] = total_nnz; } - B_csc.ColPtr()[n] = total_nnz; + + PackBMatrix<int8_t, int16_t> packedB( + matrix_op_t::NoTranspose, k, n, Bint8.data(), n); // printMatrix(matrix_op_t::NoTranspose, // Cint32_mkl.data(), m, n, n, "C mkl"); @@ -298,8 +292,7 @@ void performance_test() { #pragma omp parallel #endif { - vector<int32_t> row_offset_buf; - row_offset_buf.resize( + vector<int32_t> row_offset_buf( PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); PackAMatrix<uint8_t, int16_t> packA( @@ -333,15 +326,16 @@ void performance_test() { // Requantization back to int8 ReQuantizeOutput<false> reqObj( doNothingObj, - C_multiplier, + &C_multiplier, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + &Bint8_zero_point, bench_type == BenchmarkType::REQUANTIZATION ? nullptr : packAWithRowOffset.getRowOffsetBuffer(), col_offsets.data(), - nullptr); + nullptr, + n); // the top most (first) operation in the output processing // pipeline is spmdm @@ -354,13 +348,8 @@ void performance_test() { ReQuantizeOutput<false>> spmdmObj(reqObj, Aint8.data(), k, B_csc); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); // printf ( "tid: %d, num_threads: %d\n", tid, num_threads ); switch (bench_type) { case BenchmarkType::BARE_BONE: diff --git a/bench/PackedRequantizeAcc32Benchmark.cc b/bench/PackedRequantizeAcc32Benchmark.cc index b3a9b38..b255b8c 100644 --- a/bench/PackedRequantizeAcc32Benchmark.cc +++ b/bench/PackedRequantizeAcc32Benchmark.cc @@ -103,42 +103,35 @@ void performance_test() { int k = shape[2]; float alpha = 1.f, beta = 0.f; - aligned_vector<float> Afp32(m * k, 0.0f); - aligned_vector<uint8_t> Aint8(m * k, 0); + aligned_vector<uint8_t> Aint8(m * k); - aligned_vector<float> Bfp32(k * n, 0.0f); - aligned_vector<int8_t> Bint8(k * n, 0); + aligned_vector<int8_t> Bint8(k * n); - aligned_vector<float> Cfp32_mkl(m * n, 0.0f); - aligned_vector<int32_t> Cint32_mkl(m * n, 0.0f); - aligned_vector<int32_t> Cint32_fb(m * n, 0); - aligned_vector<uint8_t> Cint8_fb(m * n, 0); - aligned_vector<int32_t> Cint32_local(m * n, 0); - aligned_vector<int32_t> Cint32_buffer(m * n, 0); - aligned_vector<uint8_t> Cint8_local(m * n, 0); + aligned_vector<float> Cfp32_mkl(m * n); + aligned_vector<int32_t> Cint32_mkl(Cfp32_mkl.size()); + aligned_vector<int32_t> Cint32_fb(Cfp32_mkl.size()); + aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size()); + aligned_vector<int32_t> Cint32_local(Cfp32_mkl.size()); + aligned_vector<int32_t> Cint32_buffer(Cfp32_mkl.size()); + aligned_vector<uint8_t> Cint8_local(Cfp32_mkl.size()); // A matrix - randFill(Aint8, 0, 255); + randFill<uint8_t>(Aint8, 0, 255); // float Aint8_scale = 0.11; int32_t Aint8_zero_point = 43; - for (auto i = 0; i < Afp32.size(); ++i) { - Afp32[i] = (float)Aint8[i]; - } + aligned_vector<float> Afp32(Aint8.begin(), Aint8.end()); - randFill(Bint8, -128, 127); + randFill<int8_t>(Bint8, -128, 127); avoidOverflow(m, n, k, Aint8.data(), Bint8.data()); // float Bint8_scale = 0.49; int32_t Bint8_zero_point = -30; - for (auto i = 0; i < Bfp32.size(); ++i) { - Bfp32[i] = (float)Bint8[i]; - } + aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end()); // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(n); + vector<int32_t> col_offsets(n); col_offsets_with_zero_pt_s8acc32_ref( - k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data()); + k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n); double nops = 2.0 * static_cast<double>(NITER) * m * n * k; double ttot = 0.0; @@ -180,8 +173,7 @@ void performance_test() { } #endif - vector<int32_t> row_offsets; - row_offsets.resize(m); + vector<int32_t> row_offsets(m); float C_multiplier = 0.1234; int32_t C_zero_pt = 5; @@ -197,13 +189,14 @@ void performance_test() { n, Cint32_local.data(), Cint8_local.data(), - C_multiplier, + &C_multiplier, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + &Bint8_zero_point, row_offsets.data(), col_offsets.data(), - nullptr); // bias + nullptr, // bias + n); // ncols per quant group // printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n, "B // unpacked"); // printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k, @@ -248,8 +241,7 @@ void performance_test() { #pragma omp parallel #endif { - vector<int32_t> row_offset_buf; - row_offset_buf.resize( + vector<int32_t> row_offset_buf( PackAWithRowOffset<uint8_t>::rowOffsetBufferSize()); PackAWithRowOffset<uint8_t> packAN( @@ -265,21 +257,17 @@ void performance_test() { DoNothing<> doNothingObj{}; ReQuantizeOutput<false> outputProcObj( doNothingObj, - C_multiplier, + &C_multiplier, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + &Bint8_zero_point, packAN.getRowOffsetBuffer(), col_offsets.data(), - nullptr); + nullptr, + n); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); // printf ( "tid: %d, num_threads: %d\n", tid, num_threads ); fbgemmPacked( packAN, diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index d2c2a1e..17a07e5 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -847,13 +847,19 @@ class DoSpmdmOnInpBuffer { const int groups_; }; +enum class QuantizationGranularity { + TENSOR, + GROUP, + OUT_CHANNEL, +}; + /** * @brief Requantize values in inp buffer and write to out buffer. * pass the out buffer to next op for further processing. - * */ template < bool FUSE_RELU, + QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR, typename outT = std::uint8_t, typename inT = std::int32_t, typename nextOPType = DoNothing<outT, outT>> @@ -863,13 +869,15 @@ class ReQuantizeOutput { using inpType = inT; ReQuantizeOutput( nextOPType& nextop, - float C_multiplier, + const float* C_multiplier, std::int32_t C_zero_point, std::int32_t Aq_zero_point, - std::int32_t Bq_zero_point, + const std::int32_t* Bq_zero_point, const std::int32_t* row_offsets, const std::int32_t* col_offsets, - const std::int32_t* bias) + const std::int32_t* bias, + std::uint32_t nCol, + int groups = 1) : nextop_(nextop), C_multiplier_(C_multiplier), C_zero_point_(C_zero_point), @@ -877,7 +885,9 @@ class ReQuantizeOutput { Bq_zero_point_(Bq_zero_point), q_row_offsets_(row_offsets), q_col_offsets_(col_offsets), - bias_(bias) {} + bias_(bias), + ncols_(nCol), + groups_(groups) {} template <inst_set_t instSet> inline int f( @@ -897,13 +907,15 @@ class ReQuantizeOutput { int ld_in) const; nextOPType& nextop_; - float C_multiplier_; + const float* C_multiplier_; std::int32_t C_zero_point_; std::int32_t Aq_zero_point_; - std::int32_t Bq_zero_point_; + const std::int32_t* Bq_zero_point_; const std::int32_t* q_row_offsets_; const std::int32_t* q_col_offsets_; const std::int32_t* bias_; + std::uint32_t ncols_; + int groups_; }; /** @@ -912,6 +924,7 @@ class ReQuantizeOutput { */ template < bool FUSE_RELU, + QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR, typename outT = float, typename inT = std::int32_t, typename nextOPType = DoNothing<outT, outT>> @@ -922,12 +935,14 @@ class ReQuantizeForFloat { ReQuantizeForFloat( nextOPType& nextop, float Aq_scale, - float Bq_scale, + const float* Bq_scale, std::int32_t Aq_zero_point, - std::int32_t Bq_zero_point, + const std::int32_t* Bq_zero_point, const std::int32_t* row_offsets, const std::int32_t* col_offsets, - const float* bias) + const float* bias, + std::uint32_t nCol, + int groups = 1) : nextop_(nextop), Aq_scale_(Aq_scale), Bq_scale_(Bq_scale), @@ -935,7 +950,9 @@ class ReQuantizeForFloat { Bq_zero_point_(Bq_zero_point), q_row_offsets_(row_offsets), q_col_offsets_(col_offsets), - bias_(bias) {} + bias_(bias), + ncols_(nCol), + groups_(groups) {} template <inst_set_t instSet> inline int f( @@ -947,12 +964,15 @@ class ReQuantizeForFloat { private: nextOPType& nextop_; - float Aq_scale_, Bq_scale_; + float Aq_scale_; + const float* Bq_scale_; std::int32_t Aq_zero_point_; - std::int32_t Bq_zero_point_; + const std::int32_t* Bq_zero_point_; const std::int32_t* q_row_offsets_; const std::int32_t* q_col_offsets_; const float* bias_; + std::uint32_t ncols_; + int groups_; }; // type specialized implementation in an include file diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h index 59a6e0e..88a10bc 100644 --- a/include/fbgemm/OutputProcessing-inl.h +++ b/include/fbgemm/OutputProcessing-inl.h @@ -44,9 +44,14 @@ inline int DoSpmdmOnInpBuffer<outT, inT, nextOPType>::f( return nextop_.template f<instSet>(out, inp, block, ld_out, ld_in); } -template <bool FUSE_RELU, typename outT, typename inT, typename nextOPType> +template < + bool FUSE_RELU, + QuantizationGranularity Q_GRAN, + typename outT, + typename inT, + typename nextOPType> template <bool A_SYMMETRIC, bool B_SYMMETRIC, bool HAS_BIAS> -void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_( +void ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f_( outT* out, const inT* inp, const block_type_t& block, @@ -54,7 +59,13 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_( int ld_in) const { // Adoption of implementation at QNNPACK/src/requantization/fp32-sse2.c // using AVX2 instructions - __m256 multiplier_v = _mm256_set1_ps(C_multiplier_); + int quant_param_idx = 0; + if (Q_GRAN == QuantizationGranularity::GROUP) { + int ncol_per_group = ncols_ / groups_; + int g = block.col_start / ncol_per_group; + quant_param_idx = g; + } + __m256 multiplier_v = _mm256_set1_ps(C_multiplier_[quant_param_idx]); __m256i min_v = _mm256_set1_epi8(std::numeric_limits<uint8_t>::min()); __m256i max_v = _mm256_set1_epi8(std::numeric_limits<uint8_t>::max()); @@ -63,7 +74,9 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_( (A_SYMMETRIC == (Aq_zero_point_ == 0)) && "A_SYMMETRIC == true if and only if Aq_zero_point == 0"); assert( - (B_SYMMETRIC == (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr)) && + (B_SYMMETRIC == + ((Q_GRAN == QuantizationGranularity::TENSOR && Bq_zero_point_[0] == 0) || + q_row_offsets_ == nullptr)) && "B_SYMMETRIC == true if and only if Bq_zero_point == 0 " "or q_row_offsets_ == nullptr"); assert( @@ -79,10 +92,22 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_( constexpr int VLEN = 8; for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { - std::int32_t row_offset = (q_row_offsets_ && !B_SYMMETRIC) - ? q_row_offsets_[i - block.row_start] * Bq_zero_point_ - : 0; + // Scale row_offset with Bq_zero_point + int32_t row_offset = 0; + if (B_SYMMETRIC) { + row_offset = 0; + } else if ( + Q_GRAN == QuantizationGranularity::TENSOR || + Q_GRAN == QuantizationGranularity::GROUP) { + row_offset = + q_row_offsets_[i - block.row_start] * Bq_zero_point_[quant_param_idx]; + } else { + assert( + Q_GRAN == QuantizationGranularity::OUT_CHANNEL && + "unknown quantization granularity"); + } __m256i row_offset_v = _mm256_set1_epi32(row_offset); + int j = block.col_start; for (; j < block.col_start + (block.col_size / (VLEN * 4) * (VLEN * 4)); j += (VLEN * 4)) { @@ -122,9 +147,33 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_( } if (!B_SYMMETRIC) { + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + row_offset_v = _mm256_mullo_epi32( + _mm256_set1_epi32(q_row_offsets_[i - block.row_start]), + _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(Bq_zero_point_ + j))); + } x_v = _mm256_sub_epi32(x_v, row_offset_v); + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + row_offset_v = _mm256_mullo_epi32( + _mm256_set1_epi32(q_row_offsets_[i - block.row_start]), + _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(Bq_zero_point_ + j + VLEN))); + } y_v = _mm256_sub_epi32(y_v, row_offset_v); + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + row_offset_v = _mm256_mullo_epi32( + _mm256_set1_epi32(q_row_offsets_[i - block.row_start]), + _mm256_loadu_si256(reinterpret_cast<const __m256i*>( + Bq_zero_point_ + j + 2 * VLEN))); + } z_v = _mm256_sub_epi32(z_v, row_offset_v); + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + row_offset_v = _mm256_mullo_epi32( + _mm256_set1_epi32(q_row_offsets_[i - block.row_start]), + _mm256_loadu_si256(reinterpret_cast<const __m256i*>( + Bq_zero_point_ + j + 3 * VLEN))); + } w_v = _mm256_sub_epi32(w_v, row_offset_v); } if (HAS_BIAS) { @@ -157,10 +206,24 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_( * representation as an FP32 value, and will be rounded to nearest * FP32 value with ties to even with default MXCSR rounding mode. */ - __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v); - __m256 y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(y_v), multiplier_v); - __m256 z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(z_v), multiplier_v); - __m256 w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(w_v), multiplier_v); + __m256 x_scaled_v, y_scaled_v, z_scaled_v, w_scaled_v; + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + x_scaled_v = _mm256_mul_ps( + _mm256_cvtepi32_ps(x_v), _mm256_loadu_ps(C_multiplier_ + j)); + y_scaled_v = _mm256_mul_ps( + _mm256_cvtepi32_ps(y_v), _mm256_loadu_ps(C_multiplier_ + j + VLEN)); + z_scaled_v = _mm256_mul_ps( + _mm256_cvtepi32_ps(z_v), + _mm256_loadu_ps(C_multiplier_ + j + 2 * VLEN)); + w_scaled_v = _mm256_mul_ps( + _mm256_cvtepi32_ps(w_v), + _mm256_loadu_ps(C_multiplier_ + j + 3 * VLEN)); + } else { + x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v); + y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(y_v), multiplier_v); + z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(z_v), multiplier_v); + w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(w_v), multiplier_v); + } /* * Convert scaled FP32 result to int32_t using CVTPS2DQ instruction. @@ -238,6 +301,12 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_( } if (!B_SYMMETRIC) { + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + row_offset_v = _mm256_mullo_epi32( + _mm256_set1_epi32(q_row_offsets_[i - block.row_start]), + _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(Bq_zero_point_ + j))); + } x_v = _mm256_sub_epi32(x_v, row_offset_v); } if (HAS_BIAS) { @@ -246,7 +315,13 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_( _mm256_loadu_si256(reinterpret_cast<const __m256i*>(bias_ + j))); } - __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v); + __m256 x_scaled_v; + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + x_scaled_v = _mm256_mul_ps( + _mm256_cvtepi32_ps(x_v), _mm256_loadu_ps(C_multiplier_ + j)); + } else { + x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v); + } __m256i x_rounded_v = _mm256_cvtps_epi32(x_scaled_v); __m256i x_packed_v = _mm256_adds_epi16( @@ -281,19 +356,26 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_( _mm256_castsi256_si128(x_clamped_v)); } // j loop vectorized + // TODO: vectorize remainder using masking for (; j < block.col_start + block.col_size; ++j) { int32_t raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)]; if (!A_SYMMETRIC) { raw -= Aq_zero_point_ * q_col_offsets_[j]; } if (!B_SYMMETRIC) { + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + row_offset = q_row_offsets_[i - block.row_start] * Bq_zero_point_[j]; + } raw -= row_offset; } if (HAS_BIAS) { raw += bias_[j]; } - float ab = raw * C_multiplier_; + float ab = raw * + ((Q_GRAN == QuantizationGranularity::OUT_CHANNEL) + ? C_multiplier_[j] + : C_multiplier_[quant_param_idx]); long rounded = std::lrintf(ab) + C_zero_point_; out[i * ld_out + j] = std::max( @@ -303,9 +385,14 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_( } // i loop } -template <bool FUSE_RELU, typename outT, typename inT, typename nextOPType> +template < + bool FUSE_RELU, + QuantizationGranularity Q_GRAN, + typename outT, + typename inT, + typename nextOPType> template <inst_set_t instSet> -inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f( +inline int ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f( outT* out, const inT* inp, const block_type_t& block, @@ -314,19 +401,35 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f( static_assert( std::is_same<inT, int32_t>::value, "input data type must be of int32_t type"); + int ncol_per_group = ncols_ / groups_; + assert( + block.col_size <= ncol_per_group && + "ReQuantizeOutput should be called at most 1 group at a time."); + int g = block.col_start / ncol_per_group; if (instSet == inst_set_t::anyarch) { for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { inT raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)]; raw -= Aq_zero_point_ * q_col_offsets_[j]; + int Bq_zero_point_idx; + if (Q_GRAN == QuantizationGranularity::TENSOR) { + Bq_zero_point_idx = 0; + } else if (Q_GRAN == QuantizationGranularity::GROUP) { + Bq_zero_point_idx = g; + } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + Bq_zero_point_idx = j; + } else { + assert(false && "unknown quantization granularity"); + } if (q_row_offsets_) { - raw -= q_row_offsets_[i - block.row_start] * Bq_zero_point_; + raw -= q_row_offsets_[i - block.row_start] * + Bq_zero_point_[Bq_zero_point_idx]; } if (bias_) { raw += bias_[j]; } - float ab = raw * C_multiplier_; + float ab = raw * C_multiplier_[Bq_zero_point_idx]; long rounded = std::lrintf(ab) + C_zero_point_; out[i * ld_out + j] = std::max( @@ -336,8 +439,11 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f( } } else if (instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) { if (std::is_same<outT, uint8_t>::value) { + bool b_symmetric = (Q_GRAN == QuantizationGranularity::TENSOR && + Bq_zero_point_[0] == 0) || + q_row_offsets_ == nullptr; if (Aq_zero_point_ == 0) { - if (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr) { + if (b_symmetric) { if (bias_ == nullptr) { f_<true, true, false>(out, inp, block, ld_out, ld_in); } else { @@ -351,7 +457,7 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f( } } } else { - if (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr) { + if (b_symmetric) { if (bias_ == nullptr) { f_<false, true, false>(out, inp, block, ld_out, ld_in); } else { @@ -374,9 +480,14 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f( return nextop_.template f<instSet>(out, out, block, ld_out, ld_out); } -template <bool FUSE_RELU, typename outT, typename inT, typename nextOPType> +template < + bool FUSE_RELU, + QuantizationGranularity Q_GRAN, + typename outT, + typename inT, + typename nextOPType> template <inst_set_t instSet> -inline int ReQuantizeForFloat<FUSE_RELU, outT, inT, nextOPType>::f( +inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f( outT* out, inT* inp, const block_type_t& block, @@ -388,12 +499,28 @@ inline int ReQuantizeForFloat<FUSE_RELU, outT, inT, nextOPType>::f( static_assert( std::is_same<float, outT>::value, "output data type is of not expected type"); + int ncol_per_group = ncols_ / groups_; + assert( + block.col_size <= ncol_per_group && + "ReQuantizeOutput should be called at most 1 group at a time."); + int g = block.col_start / ncol_per_group; for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { inT raw = inp[(i - block.row_start) * ld_in + j - block.col_start]; raw -= Aq_zero_point_ * q_col_offsets_[j]; - raw -= q_row_offsets_[i - block.row_start] * Bq_zero_point_; - float res = raw * Aq_scale_ * Bq_scale_; + int Bq_zero_point_idx; + if (Q_GRAN == QuantizationGranularity::TENSOR) { + Bq_zero_point_idx = 0; + } else if (Q_GRAN == QuantizationGranularity::GROUP) { + Bq_zero_point_idx = g; + } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + Bq_zero_point_idx = j; + } else { + assert(false && "unknown quantization granularity"); + } + raw -= q_row_offsets_[i - block.row_start] * + Bq_zero_point_[Bq_zero_point_idx]; + float res = raw * Aq_scale_ * Bq_scale_[Bq_zero_point_idx]; if (bias_) { res += bias_[j]; } diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index 2e2035c..f1ec882 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -240,47 +240,60 @@ void ExecuteKernel< } // for each j block } -template class ExecuteKernel< - PackAWithRowOffset<uint8_t, int32_t>, - PackBMatrix<int8_t, int32_t>, - uint8_t, - ReQuantizeOutput<false /* FUSE_RELU*/>>; -template class ExecuteKernel< - PackAWithRowOffset<uint8_t, int32_t>, - PackBMatrix<int8_t, int32_t>, - uint8_t, - ReQuantizeOutput<true>>; - -template class ExecuteKernel< - PackAWithQuantRowOffset<uint8_t, int32_t>, - PackBMatrix<int8_t, int32_t>, - float, - ReQuantizeForFloat<false>>; - -template class ExecuteKernel< - PackAWithQuantRowOffset<uint8_t, int32_t>, - PackBMatrix<int8_t, int32_t>, - float, - ReQuantizeForFloat<true>>; - -template class ExecuteKernel< - PackAWithRowOffset<uint8_t, int32_t>, - PackBMatrix<int8_t, int32_t>, - float, - ReQuantizeForFloat<false>>; - -template class ExecuteKernel< - PackAWithRowOffset<uint8_t, int32_t>, - PackBMatrix<int8_t, int32_t>, - float, - ReQuantizeForFloat<true>>; - -template class ExecuteKernel< - PackAMatrix<uint8_t, int16_t>, - PackBMatrix<int8_t, int16_t>, - int32_t, - memCopy<>>; +//////////////////////////////////////////////////////////////////////////////// +// ReQuantizeOutput +#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithRowOffset<uint8_t, ACC_T>, \ + PackBMatrix<int8_t, ACC_T>, \ + uint8_t, \ + ReQuantizeOutput<RELU, Q_GRAN>>; + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU) \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_Q_GRANS(ACC_T, false); \ + INSTANTIATE_Q_GRANS(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE + +#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + PackBMatrix<int8_t, ACC_T>, \ + uint8_t, \ + ReQuantizeOutput<RELU, Q_GRAN>>; + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_SPATIAL_DIM(ACC_T, false); \ + INSTANTIATE_SPATIAL_DIM(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE template class ExecuteKernel< PackAMatrix<uint8_t, int16_t>, @@ -288,110 +301,127 @@ template class ExecuteKernel< uint8_t, ReQuantizeOutput<false>>; -template class ExecuteKernel< - PackAMatrix<uint8_t, int32_t>, - PackBMatrix<int8_t, int32_t>, - int32_t, - memCopy<>>; +//////////////////////////////////////////////////////////////////////////////// +// ReQuantizeForFloat +#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PACK_A<uint8_t, int32_t>, \ + PackBMatrix<int8_t, int32_t>, \ + float, \ + ReQuantizeForFloat<RELU, Q_GRAN>>; + +#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_RELU(PACK_A) \ + INSTANTIATE_Q_GRANS(PACK_A, false); \ + INSTANTIATE_Q_GRANS(PACK_A, true); + +INSTANTIATE_RELU(PackAWithRowOffset); +INSTANTIATE_RELU(PackAWithQuantRowOffset); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE + +#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + PackBMatrix<int8_t, ACC_T>, \ + float, \ + ReQuantizeForFloat<RELU, Q_GRAN>>; + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_SPATIAL_DIM(ACC_T, false); \ + INSTANTIATE_SPATIAL_DIM(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE template class ExecuteKernel< PackAWithRowOffset<uint8_t, int16_t>, PackBMatrix<int8_t, int16_t>, - uint8_t, - DoSpmdmOnInpBuffer< - ReQuantizeOutput<false>::outType, - int32_t, - ReQuantizeOutput<false>>>; + float, + ReQuantizeForFloat<false /* FUSE_RELU*/>>; -template class ExecuteKernel< - PackAWithRowOffset<uint8_t, int16_t>, - PackBMatrix<int8_t, int16_t>, - uint8_t, - DoSpmdmOnInpBuffer< - ReQuantizeOutput<true>::outType, - int32_t, - ReQuantizeOutput<true>>>; +//////////////////////////////////////////////////////////////////////////////// +// DoSpmdmOnInpBuffer +#define INSTANTIATE_BASE(RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithRowOffset<uint8_t, int16_t>, \ + PackBMatrix<int8_t, int16_t>, \ + uint8_t, \ + DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>; -template class ExecuteKernel< - PackAWithRowOffset<uint8_t, int16_t>, - PackBMatrix<int8_t, int16_t>, - float, - DoSpmdmOnInpBuffer< - ReQuantizeForFloat<false>::outType, - int32_t, - ReQuantizeForFloat<false>>>; +#define INSTANTIATE_Q_GRANS(RELU) \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); -template class ExecuteKernel< - PackAWithRowOffset<uint8_t, int16_t>, - PackBMatrix<int8_t, int16_t>, - uint8_t, - ReQuantizeOutput<false>>; +INSTANTIATE_Q_GRANS(false); +INSTANTIATE_Q_GRANS(true); -template class ExecuteKernel< - PackAWithRowOffset<uint8_t, int16_t>, - PackBMatrix<int8_t, int16_t>, - uint8_t, - ReQuantizeOutput<true>>; +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE template class ExecuteKernel< PackAWithRowOffset<uint8_t, int16_t>, PackBMatrix<int8_t, int16_t>, - int32_t, - memCopy<>>; + float, + DoSpmdmOnInpBuffer<float, int32_t, ReQuantizeForFloat<false>>>; -template class ExecuteKernel< - PackAWithIm2Col<uint8_t, int16_t>, - PackBMatrix<int8_t, int16_t>, - int32_t, - memCopy<>>; +//////////////////////////////////////////////////////////////////////////////// +// memCopy +#define INSTANTIATE_BASE(PACK_A, ACC_T) \ + template class ExecuteKernel< \ + PACK_A<uint8_t, ACC_T>, \ + PackBMatrix<int8_t, ACC_T>, \ + int32_t, \ + memCopy<>>; -template class ExecuteKernel< - PackAWithIm2Col<uint8_t, int16_t, 3>, - PackBMatrix<int8_t, int16_t>, - int32_t, - memCopy<>>; +#define INSTANTIATE_ACC_T(PACK_A) \ + INSTANTIATE_BASE(PACK_A, int32_t) \ + INSTANTIATE_BASE(PACK_A, int16_t) -template class ExecuteKernel< - PackAWithIm2Col<uint8_t, int16_t>, - PackBMatrix<int8_t, int16_t>, - uint8_t, - ReQuantizeOutput<false>>; +INSTANTIATE_ACC_T(PackAMatrix); +INSTANTIATE_ACC_T(PackAWithRowOffset); -template class ExecuteKernel< - PackAWithIm2Col<uint8_t, int16_t, 3>, - PackBMatrix<int8_t, int16_t>, - uint8_t, - ReQuantizeOutput<false>>; +#undef INSTANTIATE_ACC_T +#undef INSTANTIATE_BASE -template class ExecuteKernel< - PackAWithRowOffset<uint8_t, int32_t>, - PackBMatrix<int8_t, int32_t>, - int32_t, - memCopy<>>; - -template class ExecuteKernel< - PackAWithIm2Col<uint8_t, int32_t>, - PackBMatrix<int8_t, int32_t>, - int32_t, - memCopy<>>; +#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM) \ + template class ExecuteKernel< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + PackBMatrix<int8_t, ACC_T>, \ + int32_t, \ + memCopy<>>; -template class ExecuteKernel< - PackAWithIm2Col<uint8_t, int32_t, 3>, - PackBMatrix<int8_t, int32_t>, - int32_t, - memCopy<>>; +#define INSTANTIATE_SPATIAL_DIM(ACC_T) \ + INSTANTIATE_BASE(ACC_T, 2); \ + INSTANTIATE_BASE(ACC_T, 3); -template class ExecuteKernel< - PackAWithIm2Col<uint8_t, int32_t>, - PackBMatrix<int8_t, int32_t>, - uint8_t, - ReQuantizeOutput<false>>; +INSTANTIATE_SPATIAL_DIM(int32_t); +INSTANTIATE_SPATIAL_DIM(int16_t); -template class ExecuteKernel< - PackAWithIm2Col<uint8_t, int32_t, 3>, - PackBMatrix<int8_t, int32_t>, - uint8_t, - ReQuantizeOutput<false>>; +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_BASE template class ExecuteKernel< PackAWithQuantRowOffset<uint8_t, int32_t>, @@ -400,12 +430,6 @@ template class ExecuteKernel< memCopy<>>; template class ExecuteKernel< - PackAWithRowOffset<uint8_t, int16_t>, - PackBMatrix<int8_t, int16_t>, - float, - ReQuantizeForFloat<false>>; - -template class ExecuteKernel< PackAMatrix<uint8_t, int16_t>, PackBMatrix<int8_t, int16_t>, int32_t, diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index 0039daf..a8bf02f 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -198,149 +198,70 @@ bool fbgemmSupportedCPU() { return (cpuinfo_initialize() && cpuinfo_has_x86_avx2()); } -template void fbgemmPacked( - PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA, - PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput<false>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA, - PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput<true>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& - packA, - PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, - float* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeForFloat<false>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& - packA, - PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, - float* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeForFloat<true>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAMatrix<uint8_t, int32_t>, uint8_t, int32_t>& packA, - PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA, - PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, - float* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeForFloat<false>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA, - PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, - float* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeForFloat<true>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA, - PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithIm2Col<uint8_t, int32_t>, uint8_t, int32_t>& packA, - PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithIm2Col<uint8_t, int32_t, 3>, uint8_t, int32_t>& packA, - PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithIm2Col<uint8_t, int32_t>, uint8_t, int32_t>& packA, - PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput<false>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithIm2Col<uint8_t, int32_t, 3>, uint8_t, int32_t>& packA, - PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput<false>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& - packA, - PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); - -// 16 bit accumulation functions -template void fbgemmPacked( - PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA, - PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); +//////////////////////////////////////////////////////////////////////////////// +// ReQuantizeOutput +#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \ + template void fbgemmPacked( \ + PackMatrix<PackAWithRowOffset<uint8_t, ACC_T>, uint8_t, ACC_T>& packA, \ + PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ + uint8_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU) \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_Q_GRANS(ACC_T, false); \ + INSTANTIATE_Q_GRANS(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE + +#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + template void fbgemmPacked( \ + PackMatrix< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + uint8_t, \ + ACC_T>& packA, \ + PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ + uint8_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_SPATIAL_DIM(ACC_T, false); \ + INSTANTIATE_SPATIAL_DIM(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE template void fbgemmPacked( PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA, @@ -352,28 +273,109 @@ template void fbgemmPacked( int thread_id, int num_threads); -template void fbgemmPacked( - PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA, - PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<false>>& - outProcess, - int thread_id, - int num_threads); +//////////////////////////////////////////////////////////////////////////////// +// ReQuantizeForFloat +#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \ + template void fbgemmPacked( \ + PackMatrix<PACK_A<uint8_t, int32_t>, uint8_t, int32_t>& packA, \ + PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, \ + float* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const ReQuantizeForFloat<RELU, Q_GRAN>& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_RELU(PACK_A) \ + INSTANTIATE_Q_GRANS(PACK_A, false); \ + INSTANTIATE_Q_GRANS(PACK_A, true); + +INSTANTIATE_RELU(PackAWithRowOffset); +INSTANTIATE_RELU(PackAWithQuantRowOffset); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE + +#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + template void fbgemmPacked( \ + PackMatrix< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + uint8_t, \ + ACC_T>& packA, \ + PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ + float* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const ReQuantizeForFloat<RELU, Q_GRAN>& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_SPATIAL_DIM(ACC_T, false); \ + INSTANTIATE_SPATIAL_DIM(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE template void fbgemmPacked( PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA, PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, - uint8_t* C, + float* C, int32_t* C_buffer, uint32_t ldc, - const DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<true>>& - outProcess, + const ReQuantizeForFloat<false>& outProcess, int thread_id, int num_threads); +//////////////////////////////////////////////////////////////////////////////// +// DoSpmdmOnInpBuffer +#define INSTANTIATE_BASE(RELU, Q_GRAN) \ + template void fbgemmPacked( \ + PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& \ + packA, \ + PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, \ + uint8_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const DoSpmdmOnInpBuffer< \ + uint8_t, \ + int32_t, \ + ReQuantizeOutput<RELU, Q_GRAN>>& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_Q_GRANS(RELU) \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); + +INSTANTIATE_Q_GRANS(false); +INSTANTIATE_Q_GRANS(true); + +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE + template void fbgemmPacked( PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA, PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, @@ -385,49 +387,57 @@ template void fbgemmPacked( int thread_id, int num_threads); -template void fbgemmPacked( - PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA, - PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput<false>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA, - PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput<true>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA, - PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); +//////////////////////////////////////////////////////////////////////////////// +// memCopy +#define INSTANTIATE_BASE(PACK_A, ACC_T) \ + template void fbgemmPacked( \ + PackMatrix<PACK_A<uint8_t, ACC_T>, uint8_t, ACC_T>& packA, \ + PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ + int32_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const memCopy<>& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_ACC_T(PACK_A) \ + INSTANTIATE_BASE(PACK_A, int32_t) \ + INSTANTIATE_BASE(PACK_A, int16_t) + +INSTANTIATE_ACC_T(PackAMatrix); +INSTANTIATE_ACC_T(PackAWithRowOffset); + +#undef INSTANTIATE_ACC_T +#undef INSTANTIATE_BASE + +#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM) \ + template void fbgemmPacked( \ + PackMatrix< \ + PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \ + uint8_t, \ + ACC_T>& packA, \ + PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \ + int32_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const memCopy<>& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_SPATIAL_DIM(ACC_T) \ + INSTANTIATE_BASE(ACC_T, 2); \ + INSTANTIATE_BASE(ACC_T, 3); + +INSTANTIATE_SPATIAL_DIM(int32_t); +INSTANTIATE_SPATIAL_DIM(int16_t); + +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_BASE template void fbgemmPacked( - PackMatrix<PackAWithIm2Col<uint8_t, int16_t>, uint8_t, int16_t>& packA, - PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithIm2Col<uint8_t, int16_t, 3>, uint8_t, int16_t>& packA, - PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, + PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& + packA, + PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, int32_t* C, int32_t* C_buffer, uint32_t ldc, @@ -436,26 +446,6 @@ template void fbgemmPacked( int num_threads); template void fbgemmPacked( - PackMatrix<PackAWithIm2Col<uint8_t, int16_t>, uint8_t, int16_t>& packA, - PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput<false>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix<PackAWithIm2Col<uint8_t, int16_t, 3>, uint8_t, int16_t>& packA, - PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput<false>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA, PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, int32_t* C, @@ -465,14 +455,4 @@ template void fbgemmPacked( int thread_id, int num_threads); -template void fbgemmPacked( - PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA, - PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, - float* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeForFloat<false>& outProcess, - int thread_id, - int num_threads); - } // namespace fbgemm diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc index 322287b..50f619a 100644 --- a/src/QuantUtils.cc +++ b/src/QuantUtils.cc @@ -436,13 +436,14 @@ void RequantizeAvx2( DoNothing<> doNothingObj{}; ReQuantizeOutput<false /* FUSE_RELU */> requantizeObj( doNothingObj, - params.real_multiplier, + ¶ms.real_multiplier, params.target_qparams.zero_point, 0, 0, nullptr, nullptr, - nullptr); + nullptr, + len); requantizeObj.f<inst_set_t::avx2>(dst, src, {0, 1, 0, len}, 0, 0); } #endif diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc index 097e3b5..369aea3 100644 --- a/src/RefImplementations.cc +++ b/src/RefImplementations.cc @@ -57,24 +57,25 @@ void requantize_u8acc32_ref( int ld, const int32_t* inp, uint8_t* out, - float C_multiplier, + const float* C_multiplier, int32_t C_zero_point, int32_t A_zero_point, - int32_t B_zero_point, + const int32_t* B_zero_point, const int32_t* row_offsets, const int32_t* col_offsets, const int32_t* bias, + int ncols_per_quant_group, bool fuse_relu) { for (int i = 0; i < M; ++i) { for (int j = 0; j < N; ++j) { int32_t raw = inp[i * ld + j]; raw -= A_zero_point * col_offsets[j]; - raw -= B_zero_point * row_offsets[i]; + raw -= B_zero_point[j / ncols_per_quant_group] * row_offsets[i]; if (bias) { raw += bias[j]; } - float result = raw * C_multiplier; + float result = raw * C_multiplier[j / ncols_per_quant_group]; long rounded = lrintf(result) + C_zero_point; out[i * ld + j] = std::max( fuse_relu ? static_cast<long>(C_zero_point) : 0l, @@ -180,14 +181,15 @@ void col_offsets_with_zero_pt_s8acc32_ref( int N, int ld, const int8_t* Bint8, - int32_t B_zero_point, - int32_t* col_offsets) { + const int32_t* B_zero_point, + int32_t* col_offsets, + int ncols_per_quant_group) { for (int j = 0; j < N; ++j) { int32_t sum = 0; for (int k = 0; k < K; ++k) { sum += Bint8[k * ld + j]; } - col_offsets[j] = sum - B_zero_point * K; + col_offsets[j] = sum - B_zero_point[j / ncols_per_quant_group] * K; } } @@ -578,13 +580,14 @@ void depthwise_3x3_pad_1_ref( 1, C_int32.data() + i * K + k, C + i * K + k, - C_multiplier, + &C_multiplier, C_zero_point, A_zero_point, - B_zero_point, + &B_zero_point, &row_offsets[i * K + k], col_offsets + k, - bias ? bias + k : nullptr); + bias ? bias + k : nullptr, + 1); } } }; @@ -644,13 +647,14 @@ void depthwise_3x3_per_channel_quantization_pad_1_ref( 1, C_int32.data() + i * K + k, C + i * K + k, - C_multiplier[k], + &C_multiplier[k], C_zero_point, A_zero_point, - B_zero_point[k], + &B_zero_point[k], &row_offsets[i * K + k], col_offsets + k, - bias ? bias + k : nullptr); + bias ? bias + k : nullptr, + 1); } } }; @@ -781,13 +785,14 @@ void depthwise_3x3x3_pad_1_ref( 1, C_int32.data() + i * K + k, C + i * K + k, - C_multiplier, + &C_multiplier, C_zero_point, A_zero_point, - B_zero_point, + &B_zero_point, &row_offsets[i * K + k], col_offsets + k, - bias ? bias + k : nullptr); + bias ? bias + k : nullptr, + 1); } } }; diff --git a/src/RefImplementations.h b/src/RefImplementations.h index cec4bff..6530eff 100644 --- a/src/RefImplementations.h +++ b/src/RefImplementations.h @@ -39,6 +39,11 @@ void requantize_u8acc32_ref( * @brief Reference implementation of requantization step. * float multiplier * @params bias can be nullptr + * @params ncols_per_quant_group the number of columns share the same + * quantization parameter. + * ncols_per_quant_group == N : per-tensor quantization + * ncols_per_quant_group == N / groups : per-group quantization + * ncols_per_quant_group == 1 : per-channel quantization */ void requantize_u8acc32_ref( int M, @@ -46,13 +51,14 @@ void requantize_u8acc32_ref( int ld, const std::int32_t* inp, std::uint8_t* out, - float C_multiplier, + const float* C_multiplier, std::int32_t C_zero_point, std::int32_t A_zero_point, - std::int32_t B_zero_point, + const std::int32_t* B_zero_point, const std::int32_t* row_offsets, const std::int32_t* col_offsets, const std::int32_t* bias, + int ncols_per_quant_group, bool fuse_relu = false); /** @@ -114,14 +120,18 @@ void row_offsets_u8acc32_ref( /** * @brief Reference implementation to compute adjusted col_offsets (sum of * columns of B and adjusted with B_zero_point) + * + * @params ncols_per_quant_group see ncols_per_quant_group in + * requantize_u8acc32_ref */ void col_offsets_with_zero_pt_s8acc32_ref( int K, int N, int ld, const std::int8_t* Bint8, - std::int32_t B_zero_point, - std::int32_t* col_offsets); + const std::int32_t* B_zero_point, + std::int32_t* col_offsets, + int ncols_per_quant_group); /** * @brief Reference implementation of SPMDM (sparse matrix times dense matrix). diff --git a/test/FP16Test.cc b/test/FP16Test.cc index b5e4f13..0edcc4b 100644 --- a/test/FP16Test.cc +++ b/test/FP16Test.cc @@ -73,19 +73,17 @@ TEST_P(FBGemmFP16Test, Test) { } cerr << endl; - aligned_vector<float> A(m * k, 0.f); - aligned_vector<float> B(k * n, 0.f); + // initialize with small numbers + aligned_vector<int> Aint(m * k); + aligned_vector<int> Bint(k * n); + randFill(Aint, 0, 4); + randFill(Bint, 0, 4); + aligned_vector<float> A(Aint.begin(), Aint.end()); + aligned_vector<float> B(Bint.begin(), Bint.end()); + aligned_vector<float> C(m * n, NAN); - // initialize with small numbers - randFill(A, 0, 4); - randFill(B, 0, 4); - randFill(C, 0, 4); - - aligned_vector<float> A_ref, B_ref, C_ref; - A_ref = A; - B_ref = B; - C_ref = C; + aligned_vector<float> A_ref(A), B_ref(B), C_ref(C); if (atrans == matrix_op_t::Transpose) { transpose_matrix(A_ref.data(), k, m); diff --git a/test/I8DepthwiseTest.cc b/test/I8DepthwiseTest.cc index 9a19f0f..f482783 100644 --- a/test/I8DepthwiseTest.cc +++ b/test/I8DepthwiseTest.cc @@ -85,10 +85,10 @@ TEST(FBGemmDepthWiseTest, Test3x3) { aligned_vector<int8_t> B(K * R * S); aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * K), C(C_ref.size()); - randFill(A, 0, 86); + randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; - randFill(B, -16, 16); + randFill<int8_t>(B, -16, 16); int32_t B_zero_point = 5; depthwise_3x3_pad_1_ref( @@ -211,10 +211,10 @@ TEST(FBGemmDepthWiseTest, Test3x3x3) { aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K), C(C_ref.size()); - randFill(A, 0, 86); + randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; - randFill(B, -16, 16); + randFill<int8_t>(B, -16, 16); int32_t B_zero_point = 5; depthwise_3x3x3_pad_1_ref( @@ -360,7 +360,7 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) { int32_t C_num_rows = N * H_OUT * W_OUT; aligned_vector<int32_t> C_ref(C_num_rows * K), C(C_ref.size()); - randFill(A, 0, 86); + randFill<uint8_t>(A, 0, 86); int32_t A_zero_point = 43; // Each row of G has a different range to really test per-channel @@ -368,7 +368,7 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) { vector<int32_t> B_zero_point(K); for (auto k = 0; k < K; ++k) { aligned_vector<int8_t> Bk(R * S); - randFill(Bk, -16 + k, 16 + k); + randFill<int8_t>(Bk, -16 + k, 16 + k); copy(Bk.begin(), Bk.end(), B.begin() + k * R * S); B_zero_point[k] = 5 + k; diff --git a/test/I8SpmdmTest.cc b/test/I8SpmdmTest.cc index 5bb7703..2090b63 100644 --- a/test/I8SpmdmTest.cc +++ b/test/I8SpmdmTest.cc @@ -66,7 +66,7 @@ TEST_P(fbgemmSPMDMTest, TestsSpMDM) { } aligned_vector<uint8_t> A(M * K); - randFill(A, 0, 255); + randFill<uint8_t>(A, 0, 255); CompressedSparseColumn B_csc(K_adjusted, N_adjusted); vector<int32_t> C(M * N); @@ -127,13 +127,8 @@ TEST_P(fbgemmSPMDMTest, TestsSpMDM) { #pragma omp parallel #endif { -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); int i_per_thread = (M + num_threads - 1) / num_threads; int i_begin = std::min(tid * i_per_thread, M); int i_end = std::min(i_begin + i_per_thread, M); diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc index c7de1a8..d8f3f7a 100644 --- a/test/Im2ColFusedRequantizeTest.cc +++ b/test/Im2ColFusedRequantizeTest.cc @@ -20,8 +20,22 @@ #include "src/RefImplementations.h" using namespace std; +using namespace fbgemm; -namespace fbgemm { +vector<QuantizationGranularity> qGranularityVals{ + QuantizationGranularity::TENSOR, + QuantizationGranularity::GROUP, + QuantizationGranularity::OUT_CHANNEL}; + +namespace { +class fbgemmIm2colTest + : public testing::TestWithParam<QuantizationGranularity> {}; +}; // namespace + +INSTANTIATE_TEST_CASE_P( + InstantiationName, + fbgemmIm2colTest, + ::testing::ValuesIn(qGranularityVals)); // From Faster-RCNN with ShuffleNet static vector<conv_param_t<>> shapes = { @@ -71,7 +85,7 @@ static vector<conv_param_t<>> shapes = { conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 0, 0}), }; -template <typename ACC_T> +template <typename ACC_T, QuantizationGranularity Q_GRAN> static void Im2colTest() { for (auto conv_p : shapes) { for (int groups : {1, 4}) { @@ -80,29 +94,38 @@ static void Im2colTest() { } conv_p.G = groups; aligned_vector<uint8_t> Aint8( - conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0); + conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC); aligned_vector<int8_t> Bint8( - conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0); + conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC); aligned_vector<int32_t> Cint32_ref( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); - aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0); - aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); - aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0); - - int32_t Aint8_zero_point, Bint8_zero_point; + conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC); + aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size()); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size()); + aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size()); + + int ncols_per_quant_group = conv_p.OC; + if (Q_GRAN == QuantizationGranularity::GROUP) { + ncols_per_quant_group = conv_p.OC / conv_p.G; + } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + ncols_per_quant_group = 1; + } + int32_t Aint8_zero_point; + aligned_vector<int32_t> Bint8_zero_point( + conv_p.OC / ncols_per_quant_group); if (is_same<ACC_T, int32_t>::value) { - randFill(Aint8, 0, 80); + randFill<uint8_t>(Aint8, 0, 80); Aint8_zero_point = 43; - randFill(Bint8, -16, 16); - Bint8_zero_point = -30; + randFill<int8_t>(Bint8, -16, 16); + randFill(Bint8_zero_point, -50, -10); } else { - randFill(Aint8, 0, 5); + randFill<uint8_t>(Aint8, 0, 5); Aint8_zero_point = 4; - randFill(Bint8, -4, 4); - Bint8_zero_point = -2; + randFill<int8_t>(Bint8, -4, 4); + randFill(Bint8_zero_point, -3, -1); } - float C_multiplier = 0.1234; + aligned_vector<float> C_multiplier(Bint8_zero_point.size()); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); int32_t C_zero_pt = 5; int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1]; @@ -116,16 +139,16 @@ static void Im2colTest() { im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data()); // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(groups * NDim); + vector<int32_t> col_offsets(groups * NDim); for (int g = 0; g < groups; ++g) { col_offsets_with_zero_pt_s8acc32_ref( KDimPerGroup, NDim, NDim, Bint8.data() + g * KDimPerGroup * NDim, - Bint8_zero_point, - col_offsets.data() + g * NDim); + Bint8_zero_point.data() + g * NDim / ncols_per_quant_group, + col_offsets.data() + g * NDim, + ncols_per_quant_group); } conv_ref( @@ -149,13 +172,14 @@ static void Im2colTest() { conv_p.G * NDim, Cint32_ref.data() + g * NDim, Cint8_ref.data() + g * NDim, - C_multiplier, + C_multiplier.data() + g * NDim / ncols_per_quant_group, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + Bint8_zero_point.data() + g * NDim / ncols_per_quant_group, row_offsets.data(), col_offsets.data() + g * NDim, - nullptr); + nullptr, + ncols_per_quant_group); } PackBMatrix<int8_t, ACC_T> packedB( @@ -171,8 +195,7 @@ static void Im2colTest() { #pragma omp parallel #endif { - vector<int32_t> row_offset_buf; - row_offset_buf.resize( + vector<int32_t> row_offset_buf( PackAWithIm2Col<uint8_t, ACC_T>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, ACC_T> packA( @@ -183,23 +206,20 @@ static void Im2colTest() { row_offset_buf.data()); DoNothing<> doNothingObj{}; - ReQuantizeOutput<false> outputProcObj( + ReQuantizeOutput<false, Q_GRAN> outputProcObj( doNothingObj, - C_multiplier, + C_multiplier.data(), C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + Bint8_zero_point.data(), packA.getRowOffsetBuffer(), col_offsets.data(), - nullptr); + nullptr, + conv_p.G * NDim, + conv_p.G); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); fbgemmPacked( packA, @@ -236,12 +256,26 @@ static void Im2colTest() { } // for each shape } -TEST(FBGemmIm2colTest, Acc32Test) { - Im2colTest<int32_t>(); +TEST_P(fbgemmIm2colTest, Acc32Test) { + QuantizationGranularity q_granularity = GetParam(); + if (q_granularity == QuantizationGranularity::TENSOR) { + Im2colTest<int32_t, QuantizationGranularity::TENSOR>(); + } else if (q_granularity == QuantizationGranularity::GROUP) { + Im2colTest<int32_t, QuantizationGranularity::GROUP>(); + } else { + Im2colTest<int32_t, QuantizationGranularity::OUT_CHANNEL>(); + } } -TEST(FBGemmIm2colTest, Acc16Test) { - Im2colTest<int16_t>(); +TEST_P(fbgemmIm2colTest, Acc16Test) { + QuantizationGranularity q_granularity = GetParam(); + if (q_granularity == QuantizationGranularity::TENSOR) { + Im2colTest<int16_t, QuantizationGranularity::TENSOR>(); + } else if (q_granularity == QuantizationGranularity::GROUP) { + Im2colTest<int16_t, QuantizationGranularity::GROUP>(); + } else { + Im2colTest<int16_t, QuantizationGranularity::OUT_CHANNEL>(); + } } static vector<conv_param_t<3>> shapes_3d = { @@ -319,7 +353,7 @@ static vector<conv_param_t<3>> shapes_3d = { 3>(1, 8, 16, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0, 0, 0}), }; -template <typename ACC_T> +template <typename ACC_T, QuantizationGranularity Q_GRAN> static void Im2col3DTest() { for (auto conv_p : shapes_3d) { for (int groups : {1, 4}) { @@ -329,32 +363,39 @@ static void Im2col3DTest() { conv_p.G = groups; aligned_vector<uint8_t> Aint8( conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IN_DIM[2] * - conv_p.IC, - 0); + conv_p.IC); aligned_vector<int8_t> Bint8( - conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC * conv_p.OC, 0); + conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC * conv_p.OC); aligned_vector<int32_t> Cint32_ref( conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * - conv_p.OUT_DIM[2] * conv_p.OC, - 0); - aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0); - aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); - aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0); - - int32_t Aint8_zero_point, Bint8_zero_point; + conv_p.OUT_DIM[2] * conv_p.OC); + aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size()); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size()); + aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size()); + + int ncols_per_quant_group = conv_p.OC; + if (Q_GRAN == QuantizationGranularity::GROUP) { + ncols_per_quant_group = conv_p.OC / conv_p.G; + } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + ncols_per_quant_group = 1; + } + int32_t Aint8_zero_point; + aligned_vector<int32_t> Bint8_zero_point( + conv_p.OC / ncols_per_quant_group); if (is_same<ACC_T, int32_t>::value) { - randFill(Aint8, 0, 80); + randFill<uint8_t>(Aint8, 0, 80); Aint8_zero_point = 43; - randFill(Bint8, -16, 16); - Bint8_zero_point = -30; + randFill<int8_t>(Bint8, -16, 16); + randFill(Bint8_zero_point, -50, -10); } else { - randFill(Aint8, 0, 5); + randFill<uint8_t>(Aint8, 0, 5); Aint8_zero_point = 4; - randFill(Bint8, -4, 4); - Bint8_zero_point = -2; + randFill<int8_t>(Bint8, -4, 4); + randFill(Bint8_zero_point, -3, -1); } - float C_multiplier = 0.1234; + aligned_vector<float> C_multiplier(Bint8_zero_point.size()); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); int32_t C_zero_pt = 5; int MDim = @@ -369,16 +410,16 @@ static void Im2col3DTest() { im2col3d_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data()); // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(groups * NDim); + vector<int32_t> col_offsets(groups * NDim); for (int g = 0; g < groups; ++g) { col_offsets_with_zero_pt_s8acc32_ref( KDimPerGroup, NDim, NDim, Bint8.data() + g * KDimPerGroup * NDim, - Bint8_zero_point, - col_offsets.data() + g * NDim); + Bint8_zero_point.data() + g * NDim / ncols_per_quant_group, + col_offsets.data() + g * NDim, + ncols_per_quant_group); } conv3d_ref( @@ -402,13 +443,14 @@ static void Im2col3DTest() { conv_p.G * NDim, Cint32_ref.data() + g * NDim, Cint8_ref.data() + g * NDim, - C_multiplier, + C_multiplier.data() + g * NDim / ncols_per_quant_group, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + Bint8_zero_point.data() + g * NDim / ncols_per_quant_group, row_offsets.data(), col_offsets.data() + g * NDim, - nullptr); + nullptr, + ncols_per_quant_group); } PackBMatrix<int8_t, ACC_T> packedB( @@ -424,8 +466,7 @@ static void Im2col3DTest() { #pragma omp parallel #endif { - vector<int32_t> row_offset_buf; - row_offset_buf.resize( + vector<int32_t> row_offset_buf( PackAWithIm2Col<uint8_t, ACC_T, 3>::rowOffsetBufferSize()); PackAWithIm2Col<uint8_t, ACC_T, 3> packA( @@ -436,23 +477,20 @@ static void Im2col3DTest() { row_offset_buf.data()); DoNothing<> doNothingObj{}; - ReQuantizeOutput<false> outputProcObj( + ReQuantizeOutput<false, Q_GRAN> outputProcObj( doNothingObj, - C_multiplier, + C_multiplier.data(), C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + Bint8_zero_point.data(), packA.getRowOffsetBuffer(), col_offsets.data(), - nullptr); + nullptr, + conv_p.G * NDim, + conv_p.G); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); fbgemmPacked( packA, @@ -495,12 +533,24 @@ static void Im2col3DTest() { } // for each shape } -TEST(FBGemmIm2colTest, 3DAcc32Test) { - Im2col3DTest<int32_t>(); +TEST_P(fbgemmIm2colTest, 3DAcc32Test) { + QuantizationGranularity q_granularity = GetParam(); + if (q_granularity == QuantizationGranularity::TENSOR) { + Im2col3DTest<int32_t, QuantizationGranularity::TENSOR>(); + } else if (q_granularity == QuantizationGranularity::GROUP) { + Im2col3DTest<int32_t, QuantizationGranularity::GROUP>(); + } else { + Im2col3DTest<int32_t, QuantizationGranularity::OUT_CHANNEL>(); + } } -TEST(FBGemmIm2colTest, 3DAcc16Test) { - Im2col3DTest<int16_t>(); +TEST_P(fbgemmIm2colTest, 3DAcc16Test) { + QuantizationGranularity q_granularity = GetParam(); + if (q_granularity == QuantizationGranularity::TENSOR) { + Im2col3DTest<int16_t, QuantizationGranularity::TENSOR>(); + } else if (q_granularity == QuantizationGranularity::GROUP) { + Im2col3DTest<int16_t, QuantizationGranularity::GROUP>(); + } else { + Im2col3DTest<int16_t, QuantizationGranularity::OUT_CHANNEL>(); + } } - -} // namespace fbgemm diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc index cb614cd..55f6e7f 100644 --- a/test/PackedRequantizeAcc16Test.cc +++ b/test/PackedRequantizeAcc16Test.cc @@ -25,17 +25,34 @@ using namespace std; using namespace fbgemm; -std::vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose, +vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose, matrix_op_t::Transpose}; +vector<QuantizationGranularity> qGranularityVals{ + QuantizationGranularity::TENSOR, + QuantizationGranularity::GROUP, + QuantizationGranularity::OUT_CHANNEL}; + namespace { -class fbgemmu8s8acc16test : public testing::TestWithParam< - std::tuple<matrix_op_t, matrix_op_t, bool>> {}; +class fbgemmu8s8acc16WithQuantGranularityTest + : public testing::TestWithParam< + tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {}; +class fbgemmu8s8acc16Test + : public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t, bool>> {}; }; // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, - fbgemmu8s8acc16test, + fbgemmu8s8acc16WithQuantGranularityTest, + ::testing::Combine( + ::testing::Values(matrix_op_t::NoTranspose), + ::testing::ValuesIn(transposeVals), + ::testing::Bool(), + ::testing::ValuesIn(qGranularityVals))); + +INSTANTIATE_TEST_CASE_P( + InstantiationName, + fbgemmu8s8acc16Test, ::testing::Combine( ::testing::Values(matrix_op_t::NoTranspose), ::testing::ValuesIn(transposeVals), @@ -77,11 +94,12 @@ static vector<vector<int>> GetShapes_() { * @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit * accumulation. Output processing: requantization -> nothing */ -TEST_P(fbgemmu8s8acc16test, Test) { +TEST_P(fbgemmu8s8acc16WithQuantGranularityTest, Test) { vector<vector<int>> shapes(GetShapes_()); matrix_op_t atrans, btrans; bool test_ld; - tie(atrans, btrans, test_ld) = GetParam(); + QuantizationGranularity q_granularity; + tie(atrans, btrans, test_ld, q_granularity) = GetParam(); for (auto shape : shapes) { for (int groups : {1, 3, 4}) { @@ -93,22 +111,21 @@ TEST_P(fbgemmu8s8acc16test, Test) { } int k_per_group = k / groups; - aligned_vector<uint8_t> Aint8(m * k, 0); + aligned_vector<uint8_t> Aint8(m * k); - aligned_vector<int8_t> Bint8(k * n, 0); - aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0); + aligned_vector<int8_t> Bint8_ref(k * n); - aligned_vector<int32_t> Cint32_ref(m * n * groups, 0); - aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0); - aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); - aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0); - aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_ref(m * n * groups); + aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size()); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size()); + aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size()); + aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size()); - randFill(Aint8, 0, 255); + randFill<uint8_t>(Aint8, 0, 255); int32_t Aint8_zero_point = 43; - randFill(Bint8_ref, -128, 127); - Bint8 = Bint8_ref; + randFill<int8_t>(Bint8_ref, -128, 127); + aligned_vector<int8_t> Bint8(Bint8_ref); if (btrans == matrix_op_t::Transpose) { aligned_vector<int8_t> Bint8_temp(Bint8.size()); @@ -124,7 +141,6 @@ TEST_P(fbgemmu8s8acc16test, Test) { Bint8 = Bint8_temp; } - int32_t Bint8_zero_point = -30; // To test lda != k , we just reduce k by half and use the original k // as lda. int n_adjusted = n; @@ -137,23 +153,33 @@ TEST_P(fbgemmu8s8acc16test, Test) { } } + int ncols_per_quant_group = groups * n_adjusted; + if (q_granularity == QuantizationGranularity::GROUP) { + ncols_per_quant_group = n_adjusted; + } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) { + ncols_per_quant_group = 1; + } + aligned_vector<int32_t> Bint8_zero_point( + groups * n_adjusted / ncols_per_quant_group); + randFill(Bint8_zero_point, -60, 0); + // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(groups * n_adjusted); + vector<int32_t> col_offsets(groups * n_adjusted); for (int g = 0; g < groups; ++g) { col_offsets_with_zero_pt_s8acc32_ref( k_per_group, n_adjusted, n, Bint8_ref.data() + g * k_per_group * n, - Bint8_zero_point, - col_offsets.data() + g * n_adjusted); + Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group, + col_offsets.data() + g * n_adjusted, + ncols_per_quant_group); } - vector<int32_t> row_offsets; - row_offsets.resize(m); + vector<int32_t> row_offsets(m); - float C_multiplier = 0.1234; + aligned_vector<float> C_multiplier(Bint8_zero_point.size()); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); int32_t C_zero_pt = 5; int brow = 256; @@ -183,13 +209,14 @@ TEST_P(fbgemmu8s8acc16test, Test) { groups * n, Cint32_ref.data() + g * n_adjusted, Cint8_ref.data() + g * n_adjusted, - C_multiplier, + C_multiplier.data() + g * n_adjusted / ncols_per_quant_group, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group, row_offsets.data(), col_offsets.data() + g * n_adjusted, - nullptr); + nullptr, + ncols_per_quant_group); } PackBMatrix<int8_t, int16_t> packedBN( @@ -205,8 +232,7 @@ TEST_P(fbgemmu8s8acc16test, Test) { #pragma omp parallel #endif { - vector<int32_t> row_offset_buf; - row_offset_buf.resize( + vector<int32_t> row_offset_buf( PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); PackAWithRowOffset<uint8_t, int16_t> packAN( @@ -219,34 +245,79 @@ TEST_P(fbgemmu8s8acc16test, Test) { groups, row_offset_buf.data()); + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); + DoNothing<> doNothingObj{}; - ReQuantizeOutput<false> outputProcObj( - doNothingObj, - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - nullptr); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + if (q_granularity == QuantizationGranularity::TENSOR) { + ReQuantizeOutput<false> outputProcObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, + groups); - fbgemmPacked( - packAN, - packedBN, - Cint8_fb.data(), - Cint32_buffer.data(), - groups * n, - outputProcObj, - tid, - num_threads); + fbgemmPacked( + packAN, + packedBN, + Cint8_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + tid, + num_threads); + } else if (q_granularity == QuantizationGranularity::GROUP) { + ReQuantizeOutput<false, QuantizationGranularity::GROUP> outputProcObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, + groups); + + fbgemmPacked( + packAN, + packedBN, + Cint8_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + tid, + num_threads); + } else { + ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> + outputProcObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, + groups); + + fbgemmPacked( + packAN, + packedBN, + Cint8_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + tid, + num_threads); + } } // omp parallel compare_validate_buffers( @@ -264,11 +335,12 @@ TEST_P(fbgemmu8s8acc16test, Test) { * @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit * accumulation. Output processing: spmdm -> requantization -> nothing */ -TEST_P(fbgemmu8s8acc16test, SpMDMTest) { +TEST_P(fbgemmu8s8acc16WithQuantGranularityTest, SpMDMTest) { vector<vector<int>> shapes(GetShapes_()); matrix_op_t atrans, btrans; bool test_ld; - tie(atrans, btrans, test_ld) = GetParam(); + QuantizationGranularity q_granularity; + tie(atrans, btrans, test_ld, q_granularity) = GetParam(); for (auto shape : shapes) { for (int groups : {1, 3, 4}) { @@ -283,21 +355,19 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { } int k_per_group = k / groups; - aligned_vector<uint8_t> Aint8(m * k, 0); + aligned_vector<uint8_t> Aint8(m * k); + aligned_vector<int8_t> Bint8(k * n); - aligned_vector<int8_t> Bint8(k * n, 0); - aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0); + aligned_vector<int32_t> Cint32_ref(m * n * groups); + aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size()); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size()); + aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size()); + aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size()); - aligned_vector<int32_t> Cint32_ref(m * n * groups, 0); - aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0); - aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); - aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0); - aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0); - - randFill(Aint8, 0, 255); + randFill<uint8_t>(Aint8, 0, 255); int32_t Aint8_zero_point = 43; - randFill(Bint8, -128, 127); + randFill<int8_t>(Bint8, -128, 127); // To test lda != k , we just reduce k by half and use the original k // as lda. @@ -311,18 +381,27 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { } } - int32_t Bint8_zero_point = -30; + int ncols_per_quant_group = groups * n_adjusted; + if (q_granularity == QuantizationGranularity::GROUP) { + ncols_per_quant_group = n_adjusted; + } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) { + ncols_per_quant_group = 1; + } + aligned_vector<int32_t> Bint8_zero_point( + groups * n_adjusted / ncols_per_quant_group); + randFill(Bint8_zero_point, -50, -10); + // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(groups * n_adjusted); + vector<int32_t> col_offsets(groups * n_adjusted); for (int g = 0; g < groups; ++g) { col_offsets_with_zero_pt_s8acc32_ref( k_per_group, n_adjusted, n, - Bint8_ref.data() + g * k_per_group * n, - Bint8_zero_point, - col_offsets.data() + g * n_adjusted); + Bint8.data() + g * k_per_group * n, + Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group, + col_offsets.data() + g * n_adjusted, + ncols_per_quant_group); } CompressedSparseColumn B_csc(k_per_group, groups * n_adjusted); @@ -366,7 +445,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { } B_csc.ColPtr()[groups * n_adjusted] = total_nnz; - Bint8_ref = Bint8; + aligned_vector<int8_t> Bint8_ref(Bint8); if (btrans == matrix_op_t::Transpose) { aligned_vector<int8_t> Bint8_temp(Bint8.size()); @@ -382,10 +461,10 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { Bint8 = Bint8_temp; } - vector<int32_t> row_offsets; - row_offsets.resize(m); + vector<int32_t> row_offsets(m); - float C_multiplier = 0.1234; + aligned_vector<float> C_multiplier(Bint8_zero_point.size()); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); int32_t C_zero_pt = 5; int brow = 256; @@ -428,13 +507,14 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { groups * n, Cint32_ref.data() + g * n_adjusted, Cint8_ref.data() + g * n_adjusted, - C_multiplier, + C_multiplier.data() + g * n_adjusted / ncols_per_quant_group, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group, row_offsets.data(), col_offsets.data() + g * n_adjusted, - nullptr); + nullptr, + ncols_per_quant_group); } PackBMatrix<int8_t, int16_t> packedB( @@ -450,8 +530,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { #pragma omp parallel #endif { - vector<int32_t> row_offset_buf; - row_offset_buf.resize( + vector<int32_t> row_offset_buf( PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); PackAWithRowOffset<uint8_t, int16_t> packAN( @@ -464,51 +543,106 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { groups, row_offset_buf.data()); + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); + // spmdm -> requantization -> nothing // construct an output processing pipeline in reverse order // i.e. last output operation first // Last operation should always be DoNothing with // correct input and output type. DoNothing<> doNothingObj{}; - // The second last operation is requantization back - // to int8 - ReQuantizeOutput<false> reqObj( - doNothingObj, - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - nullptr); - // the top most (first) operation in the output processing - // pipeline is spmdm - // outType = final output type after fullly processing through - // pipeline inType = initial input type at the first call to the whole - // pipeline - DoSpmdmOnInpBuffer< - ReQuantizeOutput<false>::outType, - int32_t, - ReQuantizeOutput<false>> - spmdmObj(reqObj, Aint8.data(), k, B_csc, groups); - -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif - fbgemmPacked( - packAN, - packedB, - Cint8_fb.data(), - Cint32_fb.data(), - groups * n, - spmdmObj, - tid, - num_threads); + if (q_granularity == QuantizationGranularity::TENSOR) { + // The second last operation is requantization back + // to int8 + ReQuantizeOutput<false> reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, + groups); + // the top most (first) operation in the output processing + // pipeline is spmdm + // outType = final output type after fullly processing through + // pipeline inType = initial input type at the first call to the + // whole pipeline + DoSpmdmOnInpBuffer< + ReQuantizeOutput<false>::outType, + int32_t, + ReQuantizeOutput<false>> + spmdmObj(reqObj, Aint8.data(), k, B_csc, groups); + + fbgemmPacked( + packAN, + packedB, + Cint8_fb.data(), + Cint32_fb.data(), + groups * n, + spmdmObj, + tid, + num_threads); + } else if (q_granularity == QuantizationGranularity::GROUP) { + ReQuantizeOutput<false, QuantizationGranularity::GROUP> reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, + groups); + DoSpmdmOnInpBuffer< + ReQuantizeOutput<false>::outType, + int32_t, + ReQuantizeOutput<false, QuantizationGranularity::GROUP>> + spmdmObj(reqObj, Aint8.data(), k, B_csc, groups); + + fbgemmPacked( + packAN, + packedB, + Cint8_fb.data(), + Cint32_fb.data(), + groups * n, + spmdmObj, + tid, + num_threads); + } else { + ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> + reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, + groups); + DoSpmdmOnInpBuffer< + ReQuantizeOutput<false>::outType, + int32_t, + ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>> + spmdmObj(reqObj, Aint8.data(), k, B_csc, groups); + + fbgemmPacked( + packAN, + packedB, + Cint8_fb.data(), + Cint32_fb.data(), + groups * n, + spmdmObj, + tid, + num_threads); + } } compare_validate_buffers( @@ -527,7 +661,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { * @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit * accumulation. Output processing: nothing */ -TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) { +TEST_P(fbgemmu8s8acc16Test, NoRequantizeTest) { vector<vector<int>> shapes(GetShapes_()); matrix_op_t atrans, btrans; bool test_ld; @@ -543,20 +677,19 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) { } int k_per_group = k / groups; - aligned_vector<uint8_t> Aint8(m * k, 0); + aligned_vector<uint8_t> Aint8(m * k); - aligned_vector<int8_t> Bint8(k * n, 0); - aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0); + aligned_vector<int8_t> Bint8_ref(k * n); - aligned_vector<int32_t> Cint32_ref(m * n * groups, 0); - aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); - aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_ref(m * n * groups); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size()); + aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size()); - randFill(Aint8, 0, 255); + randFill<uint8_t>(Aint8, 0, 255); int32_t Aint8_zero_point = 43; - randFill(Bint8_ref, -128, 127); - Bint8 = Bint8_ref; + randFill<int8_t>(Bint8_ref, -128, 127); + aligned_vector<int8_t> Bint8(Bint8_ref); if (btrans == matrix_op_t::Transpose) { aligned_vector<int8_t> Bint8_temp(Bint8.size()); @@ -586,20 +719,19 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) { } // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(groups * n_adjusted); + vector<int32_t> col_offsets(groups * n_adjusted); for (int g = 0; g < groups; ++g) { col_offsets_with_zero_pt_s8acc32_ref( k_per_group, n_adjusted, n, Bint8_ref.data() + g * k_per_group * n, - Bint8_zero_point, - col_offsets.data() + g * n_adjusted); + &Bint8_zero_point, + col_offsets.data() + g * n_adjusted, + n_adjusted); } - vector<int32_t> row_offsets; - row_offsets.resize(m); + vector<int32_t> row_offsets(m); int brow = 256; for (int g = 0; g < groups; ++g) { @@ -636,8 +768,7 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) { #pragma omp parallel #endif { - vector<int32_t> row_offset_buf; - row_offset_buf.resize( + vector<int32_t> row_offset_buf( PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize()); PackAWithRowOffset<uint8_t, int16_t> packAN( @@ -654,13 +785,8 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) { DoNothing<int32_t, int32_t> doNothingObj{}; memCopy<> outputProcObj(doNothingObj); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); fbgemmPacked( packAN, diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc index a5744c0..9873e3f 100644 --- a/test/PackedRequantizeTest.cc +++ b/test/PackedRequantizeTest.cc @@ -25,17 +25,35 @@ using namespace std; using namespace fbgemm; -std::vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose, - matrix_op_t::Transpose}; +vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose, + matrix_op_t::Transpose}; + +vector<QuantizationGranularity> qGranularityVals{ + QuantizationGranularity::TENSOR, + QuantizationGranularity::GROUP, + QuantizationGranularity::OUT_CHANNEL}; namespace { -class fbgemmu8s8acc32test : public testing::TestWithParam< - std::tuple<matrix_op_t, matrix_op_t, bool>> {}; +class fbgemmu8s8acc32WithQuantGranularityTest + : public testing::TestWithParam< + tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {}; +class fbgemmu8s8acc32Test + : public testing::TestWithParam< + tuple<matrix_op_t, matrix_op_t, bool>> {}; }; // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, - fbgemmu8s8acc32test, + fbgemmu8s8acc32WithQuantGranularityTest, + ::testing::Combine( + ::testing::Values(matrix_op_t::NoTranspose), + ::testing::ValuesIn(transposeVals), + ::testing::Bool(), + ::testing::ValuesIn(qGranularityVals))); + +INSTANTIATE_TEST_CASE_P( + InstantiationName, + fbgemmu8s8acc32Test, ::testing::Combine( ::testing::Values(matrix_op_t::NoTranspose), ::testing::ValuesIn(transposeVals), @@ -77,11 +95,12 @@ static vector<vector<int>> GetShapes_() { * @brief Unit test for uint8 matrix A, int8 matrix B, and 32-bit * accumulation. Output processing: requantization -> nothing */ -TEST_P(fbgemmu8s8acc32test, Test) { +TEST_P(fbgemmu8s8acc32WithQuantGranularityTest, Test) { vector<vector<int>> shapes(GetShapes_()); matrix_op_t atrans, btrans; bool test_ld; - tie(atrans, btrans, test_ld) = GetParam(); + QuantizationGranularity q_granularity; + tie(atrans, btrans, test_ld, q_granularity) = GetParam(); for (auto shape : shapes) { for (int groups : {1, 3, 4}) { @@ -95,22 +114,21 @@ TEST_P(fbgemmu8s8acc32test, Test) { int k_per_group = k / groups; // mxk matrix - aligned_vector<uint8_t> Aint8(m * k, 0); + aligned_vector<uint8_t> Aint8(m * k); // kxn matrix - aligned_vector<int8_t> Bint8(k * n, 0); - aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0); + aligned_vector<int8_t> Bint8_ref(k * n); - aligned_vector<int32_t> Cint32_ref(m * n * groups, 0); - aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0); - aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0); - aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0); - aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0); + aligned_vector<int32_t> Cint32_ref(m * n * groups); + aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size()); + aligned_vector<int32_t> Cint32_fb(Cint32_ref.size()); + aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size()); + aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size()); - randFill(Aint8, 0, 255); + randFill<uint8_t>(Aint8, 0, 255); int32_t Aint8_zero_point = 43; - randFill(Bint8_ref, -128, 127); + randFill<int8_t>(Bint8_ref, -128, 127); for (int g = 0; g < groups; ++g) { avoidOverflow( m, @@ -122,7 +140,7 @@ TEST_P(fbgemmu8s8acc32test, Test) { n); } - Bint8 = Bint8_ref; + aligned_vector<int8_t> Bint8(Bint8_ref); // initialize bias aligned_vector<int32_t> bias_int32(groups * n); @@ -146,7 +164,6 @@ TEST_P(fbgemmu8s8acc32test, Test) { Bint8 = Bint8_temp; } - int32_t Bint8_zero_point = -30; // To test lda != k , we just reduce k by half and use the original k // as lda. int n_adjusted = n; @@ -159,23 +176,33 @@ TEST_P(fbgemmu8s8acc32test, Test) { } } + int ncols_per_quant_group = groups * n_adjusted; + if (q_granularity == QuantizationGranularity::GROUP) { + ncols_per_quant_group = n_adjusted; + } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) { + ncols_per_quant_group = 1; + } + aligned_vector<int32_t> Bint8_zero_point( + groups * n_adjusted / ncols_per_quant_group); + randFill(Bint8_zero_point, -50, -10); + // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(groups * n_adjusted); + vector<int32_t> col_offsets(groups * n_adjusted); for (int g = 0; g < groups; ++g) { col_offsets_with_zero_pt_s8acc32_ref( k_per_group, n_adjusted, n, Bint8_ref.data() + g * k_per_group * n, - Bint8_zero_point, - col_offsets.data() + g * n_adjusted); + Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group, + col_offsets.data() + g * n_adjusted, + ncols_per_quant_group); } - vector<int32_t> row_offsets; - row_offsets.resize(m); + vector<int32_t> row_offsets(m); - float C_multiplier = 0.001234; + aligned_vector<float> C_multiplier(Bint8_zero_point.size()); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); int32_t C_zero_pt = 5; for (int g = 0; g < groups; ++g) { @@ -203,13 +230,14 @@ TEST_P(fbgemmu8s8acc32test, Test) { groups * n, Cint32_ref.data() + g * n_adjusted, Cint8_ref.data() + g * n_adjusted, - C_multiplier, + C_multiplier.data() + g * n_adjusted / ncols_per_quant_group, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group, row_offsets.data(), col_offsets.data() + g * n_adjusted, - bias ? (bias + g * n_adjusted) : nullptr); + bias ? (bias + g * n_adjusted) : nullptr, + ncols_per_quant_group); } PackBMatrix<int8_t> packedBN( @@ -225,8 +253,7 @@ TEST_P(fbgemmu8s8acc32test, Test) { #pragma omp parallel #endif { - vector<int32_t> row_offset_buf; - row_offset_buf.resize( + vector<int32_t> row_offset_buf( PackAWithRowOffset<uint8_t>::rowOffsetBufferSize()); PackAWithRowOffset<uint8_t> packAN( @@ -239,34 +266,80 @@ TEST_P(fbgemmu8s8acc32test, Test) { groups, row_offset_buf.data()); - DoNothing<> doNothingObj{}; - ReQuantizeOutput<false> outputProcObj( - doNothingObj, - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - bias); + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + DoNothing<> doNothingObj{}; - fbgemmPacked( - packAN, - packedBN, - Cint8_fb.data(), - Cint32_buffer.data(), - groups * n, - outputProcObj, - tid, - num_threads); + if (q_granularity == QuantizationGranularity::TENSOR) { + ReQuantizeOutput<false> outputProcObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + bias, + groups * n_adjusted, + groups); + + fbgemmPacked( + packAN, + packedBN, + Cint8_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + tid, + num_threads); + } else if (q_granularity == QuantizationGranularity::GROUP) { + ReQuantizeOutput<false, QuantizationGranularity::GROUP> + outputProcObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + bias, + groups * n_adjusted, + groups); + + fbgemmPacked( + packAN, + packedBN, + Cint8_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + tid, + num_threads); + } else { + ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> + outputProcObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + bias, + groups * n_adjusted, + groups); + + fbgemmPacked( + packAN, + packedBN, + Cint8_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + tid, + num_threads); + } } // printMatrix(matrix_op_t::NoTranspose, Cint32_local.data(), // m, n_adjusted, n, "C local"); @@ -287,11 +360,12 @@ TEST_P(fbgemmu8s8acc32test, Test) { * accumulation. Directly output fp32 matrix C. Output processing: * requantization -> nothing */ -TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) { +TEST_P(fbgemmu8s8acc32WithQuantGranularityTest, TestFloatInputOutput) { vector<vector<int>> shapes(GetShapes_()); matrix_op_t atrans, btrans; bool test_ld; - tie(atrans, btrans, test_ld) = GetParam(); + QuantizationGranularity q_granularity; + tie(atrans, btrans, test_ld, q_granularity) = GetParam(); for (auto shape : shapes) { for (int groups : {1, 3, 4}) { @@ -303,26 +377,26 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) { } int k_per_group = k / groups; - aligned_vector<float> Afp32(m * k, 0.0f); - aligned_vector<uint8_t> Aint8(Afp32.size(), 0); + aligned_vector<float> Afp32(m * k); + aligned_vector<uint8_t> Aint8(Afp32.size()); - aligned_vector<float> Bfp32(k * n, 0.0f); - aligned_vector<int8_t> Bint8(Bfp32.size(), 0); + aligned_vector<float> Bfp32(k * n); + aligned_vector<int8_t> Bint8(Bfp32.size()); - aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f); - aligned_vector<float> Cfp32_fb(Cfp32_ref.size(), 0.0f); + aligned_vector<float> Cfp32_ref(m * n * groups); + aligned_vector<float> Cfp32_fb(Cfp32_ref.size()); - aligned_vector<uint8_t> Cint8_fb(Cfp32_ref.size(), 0); - aligned_vector<int32_t> Cint32_buffer(Cfp32_ref.size(), 0); + aligned_vector<uint8_t> Cint8_fb(Cfp32_ref.size()); + aligned_vector<int32_t> Cint32_buffer(Cfp32_ref.size()); - randFill(Aint8, 0, 255); + randFill<uint8_t>(Aint8, 0, 255); int32_t Aint8_zero_point = 43; float Aint8_scale = 0.11; for (auto i = 0; i < Afp32.size(); ++i) { Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point); } - randFill(Bint8, -128, 127); + randFill<int8_t>(Bint8, -128, 127); for (int g = 0; g < groups; ++g) { avoidOverflow( m, @@ -333,11 +407,6 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) { Bint8.data() + g * k_per_group * n, n); } - int32_t Bint8_zero_point = -30; - float Bint8_scale = 0.49; - for (auto i = 0; i < Bfp32.size(); ++i) { - Bfp32[i] = Bint8_scale * (Bint8[i] - Bint8_zero_point); - } // To test lda != k , we just reduce k by half and use the original k // as lda. @@ -351,17 +420,37 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) { } } + int ncols_per_quant_group = groups * n_adjusted; + if (q_granularity == QuantizationGranularity::GROUP) { + ncols_per_quant_group = n_adjusted; + } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) { + ncols_per_quant_group = 1; + } + aligned_vector<int32_t> Bint8_zero_point( + groups * n_adjusted / ncols_per_quant_group); + randFill(Bint8_zero_point, -50, -10); + aligned_vector<float> Bint8_scale(Bint8_zero_point.size()); + randFill(Bint8_scale, 0.49f / 2, 0.49f * 3 / 2); + for (int i = 0; i < k; ++i) { + int g = i / k_per_group; + for (int j = 0; j < n_adjusted; ++j) { + int quant_group = (g * n_adjusted + j) / ncols_per_quant_group; + Bfp32[i * n + j] = Bint8_scale[quant_group] * + (Bint8[i * n + j] - Bint8_zero_point[quant_group]); + } + } + // computing column offset - vector<int32_t> col_offsets; - col_offsets.resize(groups * n_adjusted); + vector<int32_t> col_offsets(groups * n_adjusted); for (int g = 0; g < groups; ++g) { col_offsets_with_zero_pt_s8acc32_ref( k_per_group, n_adjusted, n, Bint8.data() + g * k_per_group * n, - Bint8_zero_point, - col_offsets.data() + g * n_adjusted); + Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group, + col_offsets.data() + g * n_adjusted, + ncols_per_quant_group); } if (btrans == matrix_op_t::Transpose) { @@ -404,8 +493,7 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) { #pragma omp parallel #endif { - vector<int32_t> row_offset_buf; - row_offset_buf.resize( + vector<int32_t> row_offset_buf( PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize()); PackAWithQuantRowOffset<uint8_t> packAN( @@ -420,47 +508,96 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) { groups, row_offset_buf.data()); + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); + DoNothing<float, float> doNothingObj{}; - ReQuantizeForFloat<false> outputProcObj( - doNothingObj, - Aint8_scale, - Bint8_scale, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - nullptr); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + if (q_granularity == QuantizationGranularity::TENSOR) { + ReQuantizeForFloat<false> outputProcObj( + doNothingObj, + Aint8_scale, + Bint8_scale.data(), + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, + groups); - fbgemmPacked( - packAN, - packedBN, - Cfp32_fb.data(), - reinterpret_cast<int32_t*>(Cfp32_fb.data()), - groups * n, - outputProcObj, - tid, - num_threads); - } + fbgemmPacked( + packAN, + packedBN, + Cfp32_fb.data(), + reinterpret_cast<int32_t*>(Cfp32_fb.data()), + groups * n, + outputProcObj, + tid, + num_threads); + } else if (q_granularity == QuantizationGranularity::GROUP) { + ReQuantizeForFloat<false, QuantizationGranularity::GROUP> + outputProcObj( + doNothingObj, + Aint8_scale, + Bint8_scale.data(), + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, + groups); - float maximum = *max_element(Cfp32_ref.begin(), Cfp32_ref.end()); - float minimum = *min_element(Cfp32_ref.begin(), Cfp32_ref.end()); - float atol = (maximum - minimum) / 255 / 1.9; + fbgemmPacked( + packAN, + packedBN, + Cfp32_fb.data(), + reinterpret_cast<int32_t*>(Cfp32_fb.data()), + groups * n, + outputProcObj, + tid, + num_threads); + } else { + ReQuantizeForFloat<false, QuantizationGranularity::OUT_CHANNEL> + outputProcObj( + doNothingObj, + Aint8_scale, + Bint8_scale.data(), + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, + groups); + fbgemmPacked( + packAN, + packedBN, + Cfp32_fb.data(), + reinterpret_cast<int32_t*>(Cfp32_fb.data()), + groups * n, + outputProcObj, + tid, + num_threads); + } + } + + float maximum = 0; + for (int i = 0; i < m; ++i) { + for (int j = 0; j < groups * n_adjusted; ++j) { + float c = Cfp32_ref[i * groups * n + j]; + maximum = std::max(maximum, std::abs(c)); + } + } compare_validate_buffers( Cfp32_ref.data(), Cfp32_fb.data(), m, groups * n_adjusted, groups * n, - atol); + maximum * 1e-5f); } // for each groups } // for each shape } @@ -470,7 +607,7 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) { * accumulation. Output processing: requantization -> nothing. Symmetric: the * zero point is 0. */ -TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) { +TEST_P(fbgemmu8s8acc32Test, TestSymmetricQuantizedInputOutput) { vector<vector<int>> shapes(GetShapes_()); matrix_op_t atrans, btrans; bool test_ld; @@ -486,22 +623,17 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) { } int k_per_group = k / groups; - aligned_vector<float> Afp32(m * k, 0.0f); - aligned_vector<uint8_t> Aint8(Afp32.size(), 0); + aligned_vector<uint8_t> Aint8(m * k); + aligned_vector<int8_t> Bint8(k * n); - aligned_vector<float> Bfp32(k * n, 0.0f); - aligned_vector<int8_t> Bint8(Bfp32.size(), 0); + aligned_vector<float> Cfp32_ref(m * n * groups); + aligned_vector<int32_t> Cint32_fb(Cfp32_ref.size()); - aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f); - aligned_vector<int32_t> Cint32_fb(Cfp32_ref.size(), 0); - - randFill(Afp32, 0, 255); - for (auto i = 0; i < Afp32.size(); i++) { - Aint8[i] = (uint8_t)Afp32[i]; - } + randFill<uint8_t>(Aint8, 0, 255); + aligned_vector<float> Afp32(Aint8.begin(), Aint8.end()); // initialize B matrix - randFill(Bfp32, -128, 127); + randFill<int8_t>(Bint8, -128, 127); for (int g = 0; g < groups; ++g) { avoidOverflow( m, @@ -509,13 +641,11 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) { k_per_group, Aint8.data() + g * k_per_group, k, - Bfp32.data() + g * k_per_group * n, + Bint8.data() + g * k_per_group * n, n); } - for (auto i = 0; i < Bfp32.size(); ++i) { - Bint8[i] = (int8_t)Bfp32[i]; - } + aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end()); // To test lda != k , we just reduce k by half and use the original k // as lda. @@ -577,13 +707,8 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) { DoNothing<int32_t, int32_t> doNothingObj{}; memCopy<> outputProcObj(doNothingObj); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); fbgemmPacked( packAN, |