github.com/marian-nmt/FBGEMM.git: commit diff
 bench/BenchUtils.cc                          |  51
 bench/BenchUtils.h                           |   5
 bench/Depthwise3DBenchmark.cc                |  22
 bench/DepthwiseBenchmark.cc                  |  22
 bench/FP16Benchmark.cc                       |  34
 bench/I8SpmdmBenchmark.cc                    |  11
 bench/Im2ColFusedRequantizeAcc16Benchmark.cc |  37
 bench/Im2ColFusedRequantizeAcc32Benchmark.cc |  36
 bench/PackedFloatInOutBenchmark.cc           |  35
 bench/PackedRequantizeAcc16Benchmark.cc      | 125
 bench/PackedRequantizeAcc32Benchmark.cc      |  66
 include/fbgemm/Fbgemm.h                      |  46
 include/fbgemm/OutputProcessing-inl.h        | 173
 src/ExecuteKernelU8S8.cc                     | 284
 src/Fbgemm.cc                                | 434
 src/QuantUtils.cc                            |   5
 src/RefImplementations.cc                    |  37
 src/RefImplementations.h                     |  18
 test/FP16Test.cc                             |  20
 test/I8DepthwiseTest.cc                      |  12
 test/I8SpmdmTest.cc                          |  11
 test/Im2ColFusedRequantizeTest.cc            | 220
 test/PackedRequantizeAcc16Test.cc            | 418
 test/PackedRequantizeTest.cc                 | 393
 24 files changed, 1466 insertions(+), 1049 deletions(-)
diff --git a/bench/BenchUtils.cc b/bench/BenchUtils.cc
index db40ee0..f5ce9ef 100644
--- a/bench/BenchUtils.cc
+++ b/bench/BenchUtils.cc
@@ -5,30 +5,41 @@
* LICENSE file in the root directory of this source tree.
*/
#include "BenchUtils.h"
+
+#include <algorithm>
#include <random>
+#include <type_traits>
+
+#include <omp.h>
namespace fbgemm {
std::default_random_engine eng;
template <typename T>
-void randFill(aligned_vector<T>& vec, const int low, const int high) {
- std::random_device r;
- std::uniform_int_distribution<int> dis(low, high);
- for (auto& v : vec) {
- v = static_cast<T>(dis(eng));
- }
+void randFill(aligned_vector<T>& vec, T low, T high, std::true_type) {
+ std::uniform_int_distribution<T> dis(low, high);
+ std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
+}
+
+template <typename T>
+void randFill(aligned_vector<T>& vec, T low, T high, std::false_type) {
+ std::uniform_real_distribution<T> dis(low, high);
+ std::generate(vec.begin(), vec.end(), [&] { return dis(eng); });
+}
+
+template <typename T>
+void randFill(aligned_vector<T>& vec, T low, T high) {
+ randFill(vec, low, high, std::is_integral<T>());
}
template void
-randFill<float>(aligned_vector<float>& vec, const int low, const int high);
-template void
-randFill<uint8_t>(aligned_vector<uint8_t>& vec, const int low, const int high);
+randFill<float>(aligned_vector<float>& vec, float low, float high);
template void
-randFill<int8_t>(aligned_vector<int8_t>& vec, const int low, const int high);
-
+randFill<uint8_t>(aligned_vector<uint8_t>& vec, uint8_t low, uint8_t high);
template void
-randFill<int>(aligned_vector<int>& vec, const int low, const int high);
+randFill<int8_t>(aligned_vector<int8_t>& vec, int8_t low, int8_t high);
+template void randFill<int>(aligned_vector<int>& vec, int low, int high);
void llc_flush(std::vector<char>& llc) {
volatile char* data = llc.data();
@@ -37,4 +48,20 @@ void llc_flush(std::vector<char>& llc) {
}
}
+int fbgemm_get_num_threads() {
+#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
+ return 1;
+#else
+ return omp_get_num_threads();
+#endif
+}
+
+int fbgemm_get_thread_num() {
+#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
+ return 0;
+#else
+ return omp_get_thread_num();
+#endif
+}
+
} // namespace fbgemm
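
Note: the randFill change above replaces the unconditional uniform_int_distribution with tag dispatch on std::is_integral, so floating-point vectors get a real distribution. Call sites must now pin T explicitly (randFill<uint8_t>(A, 0, 86)) whenever integer-literal bounds would otherwise deduce a conflicting type; also, the standard only formally admits the wider integer types in uniform_int_distribution, which the uint8_t/int8_t instantiations above lean on. A standalone sketch of the dispatch pattern, restricted to int and float so it is strictly conforming (hypothetical names, not part of FBGEMM):

#include <algorithm>
#include <iostream>
#include <random>
#include <type_traits>
#include <vector>

std::default_random_engine eng;

// Integral overload, selected at compile time via the std::true_type tag.
template <typename T>
void fill(std::vector<T>& v, T low, T high, std::true_type) {
  std::uniform_int_distribution<T> dis(low, high);
  std::generate(v.begin(), v.end(), [&] { return dis(eng); });
}

// Floating-point overload, selected via std::false_type.
template <typename T>
void fill(std::vector<T>& v, T low, T high, std::false_type) {
  std::uniform_real_distribution<T> dis(low, high);
  std::generate(v.begin(), v.end(), [&] { return dis(eng); });
}

template <typename T>
void fill(std::vector<T>& v, T low, T high) {
  fill(v, low, high, std::is_integral<T>());
}

int main() {
  std::vector<int> vi(8);
  fill(vi, 0, 86);    // integer distribution
  std::vector<float> vf(8);
  fill(vf, 0.f, 4.f); // real distribution; fill(vf, 0, 4) would not deduce
  for (int x : vi) std::cout << x << ' ';
  std::cout << '\n';
}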
diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h
index 8ca99df..da2ef2d 100644
--- a/bench/BenchUtils.h
+++ b/bench/BenchUtils.h
@@ -11,8 +11,11 @@
namespace fbgemm {
template <typename T>
-void randFill(aligned_vector<T>& vec, const int low, const int high);
+void randFill(aligned_vector<T>& vec, T low, T high);
void llc_flush(std::vector<char>& llc);
+int fbgemm_get_num_threads();
+int fbgemm_get_thread_num();
+
} // namespace fbgemm
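
The two helpers declared above collapse the repeated #ifdef _OPENMP blocks in the benchmarks below into single calls. A minimal sketch of the pattern (simplified: the real helpers also force a single thread when FBGEMM_MEASURE_TIME_BREAKDOWN is defined, and hypothetical names are used to avoid clashing with them). Unlike BenchUtils.cc above, which includes <omp.h> unconditionally, this variant guards the include too, which is the safer choice for toolchains that ship no omp.h:

#if defined(_OPENMP)
#include <omp.h>
#endif

int my_get_num_threads() {
#if defined(_OPENMP)
  return omp_get_num_threads(); // inside a parallel region: team size
#else
  return 1; // serial build: behave as a team of one
#endif
}

int my_get_thread_num() {
#if defined(_OPENMP)
  return omp_get_thread_num();
#else
  return 0;
#endif
}

Call sites inside #pragma omp parallel regions then shrink to two assignments, as every benchmark diff below shows.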
diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc
index f53eeea..c65839b 100644
--- a/bench/Depthwise3DBenchmark.cc
+++ b/bench/Depthwise3DBenchmark.cc
@@ -62,10 +62,10 @@ int main() {
aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K),
C(C_ref.size());
- randFill(A, 0, 86);
+ randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
- randFill(B, -16, 16);
+ randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;
depthwise_3x3x3_pad_1_ref(
@@ -129,13 +129,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#if _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
depthwise_3x3x3_pad_1(
N,
T,
@@ -200,13 +195,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#if _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
depthwise_3x3x3_pad_1(
N,
T,
diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc
index 8e6d83d..b922f90 100644
--- a/bench/DepthwiseBenchmark.cc
+++ b/bench/DepthwiseBenchmark.cc
@@ -161,10 +161,10 @@ int main() {
aligned_vector<int8_t> B(G * R * S);
aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * G), C(C_ref.size());
- randFill(A, 0, 86);
+ randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
- randFill(B, -16, 16);
+ randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;
depthwise_3x3_pad_1_ref(
@@ -221,13 +221,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
depthwise_3x3_pad_1(
N,
H,
@@ -279,13 +274,8 @@ int main() {
t_begin = chrono::system_clock::now();
#pragma omp parallel
{
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
depthwise_3x3_pad_1(
N,
H,
diff --git a/bench/FP16Benchmark.cc b/bench/FP16Benchmark.cc
index c03f18a..fd9de5b 100644
--- a/bench/FP16Benchmark.cc
+++ b/bench/FP16Benchmark.cc
@@ -73,20 +73,24 @@ void performance_test() {
int n = s[1];
int k = s[2];
- aligned_vector<float> A(m * k, 0.f);
- aligned_vector<float> B(k * n, 0.f);
- aligned_vector<float> Cg(m * n, 1.f);
- aligned_vector<float> Cp(m * n, NAN);
+ aligned_vector<float> C_ref(m * n, 1.f);
+ aligned_vector<float> C_fb(m * n, NAN);
// initialize with small numbers
- randFill(A, 0, 4);
+ aligned_vector<int> Aint(m * k);
+ randFill(Aint, 0, 4);
+ aligned_vector<float> A(Aint.begin(), Aint.end());
- randFill(B, 0, 4);
+ aligned_vector<int> Bint(k * n);
+ randFill(Bint, 0, 4);
+ aligned_vector<float> B(Bint.begin(), Bint.end());
PackedGemmMatrixFP16 Bp(btran, k, n, alpha, B.data());
if (beta != 0.0f) {
- randFill(Cg, 0, 4);
- Cp = Cg;
+ aligned_vector<int> Cint(C_ref.size());
+ randFill(Cint, 0, 4);
+ C_ref.assign(Cint.begin(), Cint.end());
+ C_fb = C_ref;
}
double nflops = 2.0 * (double)m * (double)n * (double)k * (double)NITER;
@@ -111,17 +115,17 @@ void performance_test() {
B.data(),
(btran == matrix_op_t::NoTranspose) ? n : k,
beta,
- Cg.data(),
+ C_ref.data(),
n);
#endif
cblas_gemm_compute(
- matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data());
+ matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data());
#ifdef USE_MKL
// Compare results
- for (auto i = 0; i < Cg.size(); i++) {
- // printf("%f %f\n", Cg[i], Cp[i]);
- assert(std::abs(Cg[i] - Cp[i]) < 1e-3);
+ for (auto i = 0; i < C_ref.size(); i++) {
+ // printf("%f %f\n", C_ref[i], C_fb[i]);
+ assert(std::abs(C_ref[i] - C_fb[i]) < 1e-3);
}
#endif
}
@@ -151,7 +155,7 @@ void performance_test() {
B.data(),
(btran == matrix_op_t::NoTranspose) ? n : k,
beta,
- Cg.data(),
+ C_ref.data(),
n);
t_end = chrono::system_clock::now();
if (it >= 0) {
@@ -184,7 +188,7 @@ void performance_test() {
t_begin = chrono::system_clock::now();
cblas_gemm_compute(
- matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data());
+ matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data());
t_end = chrono::system_clock::now();
if (it >= 0) {
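
The initialization change above draws small integers and widens them to float, so the MKL and FBGEMM paths operate on exactly representable inputs and the 1e-3 assert tolerance is meaningful. A minimal sketch of that init pattern with standard containers (sizes illustrative):

#include <random>
#include <vector>

int main() {
  std::default_random_engine eng;
  std::uniform_int_distribution<int> dis(0, 4);

  std::vector<int> Aint(16);
  for (int& v : Aint) v = dis(eng);

  // Widening int -> float is exact at these magnitudes, so two GEMM
  // implementations fed A can be compared against a tight tolerance.
  std::vector<float> A(Aint.begin(), Aint.end());
  (void)A;
}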
diff --git a/bench/I8SpmdmBenchmark.cc b/bench/I8SpmdmBenchmark.cc
index 07b73dc..4223d0c 100644
--- a/bench/I8SpmdmBenchmark.cc
+++ b/bench/I8SpmdmBenchmark.cc
@@ -77,7 +77,7 @@ int main() {
cout << M << ", " << N << ", " << K << ", ";
aligned_vector<uint8_t> A(M * K);
- randFill(A, 0, 255);
+ randFill<uint8_t>(A, 0, 255);
fbgemm::CompressedSparseColumn B_csc(K, N);
vector<int32_t> C(M * N);
@@ -156,13 +156,8 @@ int main() {
#pragma omp parallel
#endif
{
-#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP)
- int num_threads = 1;
- int tid = 0;
-#else
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
int i_per_thread =
((M + 31) / 32 + num_threads - 1) / num_threads * 32;
int i_begin = std::min(tid * i_per_thread, M);
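
The partitioning arithmetic retained above hands each thread a run of whole 32-row tiles, clamped to M. A worked standalone example of the same formula (values illustrative):

#include <algorithm>
#include <cstdio>

int main() {
  int M = 100, num_threads = 3;
  // Round M up to 32-row tiles, split the tiles evenly, convert back to rows.
  int i_per_thread = ((M + 31) / 32 + num_threads - 1) / num_threads * 32;
  for (int tid = 0; tid < num_threads; ++tid) {
    int i_begin = std::min(tid * i_per_thread, M);
    int i_end = std::min(i_begin + i_per_thread, M);
    std::printf("tid %d: rows [%d, %d)\n", tid, i_begin, i_end);
  }
  // Prints: tid 0: rows [0, 64), tid 1: rows [64, 100), tid 2: rows [100, 100)
}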
diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
index 2115863..cb2edf5 100644
--- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
@@ -125,43 +125,29 @@ void performance_test() {
chrono::time_point<chrono::high_resolution_clock> begin, end;
for (auto conv_p : shapes) {
- aligned_vector<float> Afp32(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0.0f);
aligned_vector<uint8_t> Aint8(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0);
-
+ conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
aligned_vector<uint8_t> Aint8_out(
conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] *
- conv_p.K[1] * conv_p.IC,
- 0);
+ conv_p.K[1] * conv_p.IC);
- aligned_vector<float> Bfp32(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0.0f);
aligned_vector<int8_t> Bint8(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
+ conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
aligned_vector<int32_t> Cint32_ref(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
- aligned_vector<int32_t> Cint32_fb(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
- aligned_vector<int32_t> Cint32_fb2(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
+ conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());
// A matrix (input activations)
- randFill(Afp32, 0, 5);
+ randFill<uint8_t>(Aint8, 0, 5);
int32_t Aint8_zero_point = 4;
- for (auto i = 0; i < Afp32.size(); ++i) {
- Aint8[i] = static_cast<uint8_t>(Afp32[i]);
- }
+ aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
// B matrix (weights)
- randFill(Bfp32, -4, 4);
+ randFill<int8_t>(Bint8, -4, 4);
// int32_t Bint8_zero_point = -3;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bint8[i] = static_cast<int8_t>(Bfp32[i]);
- }
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
// reference implementation
conv_ref(
@@ -184,8 +170,7 @@ void performance_test() {
double ttot = 0.0;
string runType;
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithIm2Col<uint8_t, int16_t>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, int16_t> packA(
diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
index 7144e61..8e112d8 100644
--- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
+++ b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
@@ -125,45 +125,32 @@ void performance_test() {
chrono::time_point<chrono::high_resolution_clock> begin, end;
for (auto conv_p : shapes) {
- aligned_vector<float> Afp32(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0.0f);
aligned_vector<uint8_t> Aint8(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0);
+ conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
aligned_vector<uint8_t> Aint8_out(
conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] *
- conv_p.K[1] * conv_p.IC,
- 0);
+ conv_p.K[1] * conv_p.IC);
- aligned_vector<float> Bfp32(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0.0f);
aligned_vector<int8_t> Bint8(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
+ conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
aligned_vector<int32_t> Cint32_ref(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
- aligned_vector<int32_t> Cint32_fb(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
-
- aligned_vector<int32_t> Cint32_fb2(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
+ conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb2(Cint32_ref.size());
// cout << conv_p.toString() << endl;
// A matrix (input activations)
- randFill(Afp32, 0, 5);
+ randFill<uint8_t>(Aint8, 0, 5);
int32_t Aint8_zero_point = 4;
- for (auto i = 0; i < Afp32.size(); ++i) {
- Aint8[i] = static_cast<uint8_t>(Afp32[i]);
- }
+ aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
// B matrix (weights)
- randFill(Bfp32, -4, 4);
+ randFill<int8_t>(Bint8, -4, 4);
// int32_t Bint8_zero_point = -3;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bint8[i] = static_cast<int8_t>(Bfp32[i]);
- }
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
// reference implementation
conv_ref(
@@ -186,8 +173,7 @@ void performance_test() {
double ttot = 0.0;
string runType;
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithIm2Col<uint8_t, int32_t>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, int32_t> packA(
diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc
index badbda0..79a750e 100644
--- a/bench/PackedFloatInOutBenchmark.cc
+++ b/bench/PackedFloatInOutBenchmark.cc
@@ -86,27 +86,27 @@ void performance_test() {
int k = shape[2];
float alpha = 1.f, beta = 0.f;
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ aligned_vector<float> Afp32(m * k);
+ aligned_vector<uint8_t> Aint8(Afp32.size());
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(k * n, 0);
+ aligned_vector<float> Bfp32(k * n);
+ aligned_vector<int8_t> Bint8(Bfp32.size());
- aligned_vector<float> Cfp32_mkl(m * n, 0.0f);
- aligned_vector<float> Cfp32_fb(m * n, 0.0f);
+ aligned_vector<float> Cfp32_mkl(m * n);
+ aligned_vector<float> Cfp32_fb(Cfp32_mkl.size());
- aligned_vector<uint8_t> Cint8_fb(m * n, 0);
- aligned_vector<int32_t> Cint32_buffer(m * n, 0);
+ aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_buffer(Cfp32_mkl.size());
// A matrix
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
float Aint8_scale = 0.11;
int32_t Aint8_zero_point = 43;
for (auto i = 0; i < Afp32.size(); ++i) {
Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point);
}
- randFill(Bint8, -128, 127);
+ randFill<int8_t>(Bint8, -128, 127);
avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
float Bint8_scale = 0.49;
@@ -116,10 +116,9 @@ void performance_test() {
}
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n);
+ vector<int32_t> col_offsets(n);
col_offsets_with_zero_pt_s8acc32_ref(
- k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data());
+ k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n);
double ttot = 0;
std::string type;
@@ -172,8 +171,7 @@ void performance_test() {
// printMatrix(matrix_op_t::NoTranspose, col_offsets.data(), 1, n, n, "col
// offsets before");
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
PackAWithQuantRowOffset<uint8_t> packAN(
@@ -201,12 +199,13 @@ void performance_test() {
ReQuantizeForFloat<false> outputProcObj(
doNothingObj,
Aint8_scale,
- Bint8_scale,
+ &Bint8_scale,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
packAN.getRowOffsetBuffer(),
col_offsets.data(),
- nullptr);
+ nullptr,
+ n);
ttot = 0;
type = "FBGEMM_i8_acc32";
diff --git a/bench/PackedRequantizeAcc16Benchmark.cc b/bench/PackedRequantizeAcc16Benchmark.cc
index fd48b24..f60332f 100644
--- a/bench/PackedRequantizeAcc16Benchmark.cc
+++ b/bench/PackedRequantizeAcc16Benchmark.cc
@@ -100,29 +100,26 @@ void performance_test() {
int n = shape[1];
int k = shape[2];
- float alpha = 1.f, beta = 0.f;
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ float alpha = 1.0f, beta = 0.0f;
+ aligned_vector<uint8_t> Aint8(m * k);
+ aligned_vector<int8_t> Bint8(k * n);
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(k * n, 0);
-
- aligned_vector<float> Cfp32_mkl(m * n, 0.0f);
+ aligned_vector<float> Cfp32_mkl(m * n);
// just used for result comparisons
- aligned_vector<int32_t> Cint32_mkl(m * n, 0.0f);
+ aligned_vector<int32_t> Cint32_mkl(Cfp32_mkl.size());
// requantize results
- aligned_vector<uint8_t> Cint8_mkl(m * n, 0.0f);
- aligned_vector<int32_t> Cint32_fb(m * n, 0.0f);
- aligned_vector<uint8_t> Cint8_fb(m * n, 0.0f);
+ aligned_vector<uint8_t> Cint8_mkl(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_fb(Cfp32_mkl.size());
+ aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size());
// A matrix
- randFill(Afp32, 0, 50);
+ randFill<uint8_t>(Aint8, 0, 50);
int32_t Aint8_zero_point = 43;
- for (auto i = 0; i < Afp32.size(); ++i) {
- Aint8[i] = static_cast<uint8_t>(Afp32[i]);
- }
+ aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
- randFill(Bfp32, -8, 8);
+ randFill<int8_t>(Bint8, -8, 8);
+ aligned_vector<int8_t> Bint8_copy(Bint8);
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
double nops = 2.0 * static_cast<double>(NITER) * m * n * k;
double ttot = 0.0;
@@ -163,9 +160,7 @@ void performance_test() {
cout << setw(16) << runType << ", " << fixed << setw(5) << setprecision(1)
<< nops / ttot << endl;
- for (auto i = 0; i < Cfp32_mkl.size(); ++i) {
- Cint32_mkl[i] = static_cast<int32_t>(Cfp32_mkl[i]);
- }
+ Cint32_mkl.assign(Cfp32_mkl.begin(), Cfp32_mkl.end());
#endif
for (BenchmarkType bench_type :
@@ -179,23 +174,19 @@ void performance_test() {
bench_type == BenchmarkType::REQUANTIZATION)
? 0
: -30;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bint8[i] = static_cast<int8_t>(Bfp32[i]);
- }
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n);
+ vector<int32_t> col_offsets(n);
+ Bint8 = Bint8_copy;
col_offsets_with_zero_pt_s8acc32_ref(
- k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data());
+ k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n);
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
+ vector<int32_t> row_offsets(m);
row_offsets_u8acc32_ref(m, k, k, Aint8.data(), row_offsets.data());
float C_multiplier =
- (bench_type == BenchmarkType::BARE_BONE) ? 1 : 0.1234;
+ (bench_type == BenchmarkType::BARE_BONE) ? 1.0f : 0.1234f;
int32_t C_zero_pt = (bench_type == BenchmarkType::BARE_BONE) ? 0 : 5;
// printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
@@ -235,16 +226,14 @@ void performance_test() {
n,
Cint32_mkl.data(),
Cint8_mkl.data(),
- C_multiplier,
+ &C_multiplier,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
row_offsets.data(),
col_offsets.data(),
- nullptr); // bias
-
- PackBMatrix<int8_t, int16_t> packedB(
- matrix_op_t::NoTranspose, k, n, Bint8.data(), n);
+ nullptr, // bias
+ n); // ncols per quant group
CompressedSparseColumn B_csc(k, n);
@@ -254,30 +243,35 @@ void performance_test() {
default_random_engine eng;
binomial_distribution<> per_col_nnz_dist(k, density);
- vector<int> row_indices(k);
-
- int total_nnz = 0;
- for (int j = 0; j < n; ++j) {
- B_csc.ColPtr()[j] = total_nnz;
-
- int nnz_of_j = per_col_nnz_dist(eng);
- total_nnz += nnz_of_j;
-
- iota(row_indices.begin(), row_indices.end(), 0);
- shuffle(row_indices.begin(), row_indices.end(), eng);
- sort(row_indices.begin(), row_indices.begin() + nnz_of_j);
-
- for (int kidx = 0; kidx < nnz_of_j; ++kidx) {
- B_csc.RowIdx().push_back(row_indices[kidx]);
- // put the current B value
- B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]);
- // make current B value zero
- Bint8[row_indices[kidx] * n + j] = 0;
- // std::cout << "(" << row_indices[kidx] << ", " << j << ")" <<
- // endl;
+ if (bench_type == BenchmarkType::EVERYTHING) {
+ vector<int> row_indices(k);
+
+ int total_nnz = 0;
+ for (int j = 0; j < n; ++j) {
+ B_csc.ColPtr()[j] = total_nnz;
+
+ int nnz_of_j = per_col_nnz_dist(eng);
+ total_nnz += nnz_of_j;
+
+ iota(row_indices.begin(), row_indices.end(), 0);
+ shuffle(row_indices.begin(), row_indices.end(), eng);
+ sort(row_indices.begin(), row_indices.begin() + nnz_of_j);
+
+ for (int kidx = 0; kidx < nnz_of_j; ++kidx) {
+ B_csc.RowIdx().push_back(row_indices[kidx]);
+ // put the current B value
+ B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]);
+ // make current B value zero
+ Bint8[row_indices[kidx] * n + j] = 0;
+ // std::cout << "(" << row_indices[kidx] << ", " << j << ")" <<
+ // endl;
+ }
}
+ B_csc.ColPtr()[n] = total_nnz;
}
- B_csc.ColPtr()[n] = total_nnz;
+
+ PackBMatrix<int8_t, int16_t> packedB(
+ matrix_op_t::NoTranspose, k, n, Bint8.data(), n);
// printMatrix(matrix_op_t::NoTranspose,
// Cint32_mkl.data(), m, n, n, "C mkl");
@@ -298,8 +292,7 @@ void performance_test() {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
PackAMatrix<uint8_t, int16_t> packA(
@@ -333,15 +326,16 @@ void performance_test() {
// Requantization back to int8
ReQuantizeOutput<false> reqObj(
doNothingObj,
- C_multiplier,
+ &C_multiplier,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
bench_type == BenchmarkType::REQUANTIZATION
? nullptr
: packAWithRowOffset.getRowOffsetBuffer(),
col_offsets.data(),
- nullptr);
+ nullptr,
+ n);
// the top most (first) operation in the output processing
// pipeline is spmdm
@@ -354,13 +348,8 @@ void performance_test() {
ReQuantizeOutput<false>>
spmdmObj(reqObj, Aint8.data(), k, B_csc);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
// printf ( "tid: %d, num_threads: %d\n", tid, num_threads );
switch (bench_type) {
case BenchmarkType::BARE_BONE:
diff --git a/bench/PackedRequantizeAcc32Benchmark.cc b/bench/PackedRequantizeAcc32Benchmark.cc
index b3a9b38..b255b8c 100644
--- a/bench/PackedRequantizeAcc32Benchmark.cc
+++ b/bench/PackedRequantizeAcc32Benchmark.cc
@@ -103,42 +103,35 @@ void performance_test() {
int k = shape[2];
float alpha = 1.f, beta = 0.f;
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ aligned_vector<uint8_t> Aint8(m * k);
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(k * n, 0);
+ aligned_vector<int8_t> Bint8(k * n);
- aligned_vector<float> Cfp32_mkl(m * n, 0.0f);
- aligned_vector<int32_t> Cint32_mkl(m * n, 0.0f);
- aligned_vector<int32_t> Cint32_fb(m * n, 0);
- aligned_vector<uint8_t> Cint8_fb(m * n, 0);
- aligned_vector<int32_t> Cint32_local(m * n, 0);
- aligned_vector<int32_t> Cint32_buffer(m * n, 0);
- aligned_vector<uint8_t> Cint8_local(m * n, 0);
+ aligned_vector<float> Cfp32_mkl(m * n);
+ aligned_vector<int32_t> Cint32_mkl(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_fb(Cfp32_mkl.size());
+ aligned_vector<uint8_t> Cint8_fb(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_local(Cfp32_mkl.size());
+ aligned_vector<int32_t> Cint32_buffer(Cfp32_mkl.size());
+ aligned_vector<uint8_t> Cint8_local(Cfp32_mkl.size());
// A matrix
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
// float Aint8_scale = 0.11;
int32_t Aint8_zero_point = 43;
- for (auto i = 0; i < Afp32.size(); ++i) {
- Afp32[i] = (float)Aint8[i];
- }
+ aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
- randFill(Bint8, -128, 127);
+ randFill<int8_t>(Bint8, -128, 127);
avoidOverflow(m, n, k, Aint8.data(), Bint8.data());
// float Bint8_scale = 0.49;
int32_t Bint8_zero_point = -30;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bfp32[i] = (float)Bint8[i];
- }
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(n);
+ vector<int32_t> col_offsets(n);
col_offsets_with_zero_pt_s8acc32_ref(
- k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data());
+ k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n);
double nops = 2.0 * static_cast<double>(NITER) * m * n * k;
double ttot = 0.0;
@@ -180,8 +173,7 @@ void performance_test() {
}
#endif
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
+ vector<int32_t> row_offsets(m);
float C_multiplier = 0.1234;
int32_t C_zero_pt = 5;
@@ -197,13 +189,14 @@ void performance_test() {
n,
Cint32_local.data(),
Cint8_local.data(),
- C_multiplier,
+ &C_multiplier,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
row_offsets.data(),
col_offsets.data(),
- nullptr); // bias
+ nullptr, // bias
+ n); // ncols per quant group
// printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n, "B
// unpacked");
// printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k,
@@ -248,8 +241,7 @@ void performance_test() {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
PackAWithRowOffset<uint8_t> packAN(
@@ -265,21 +257,17 @@ void performance_test() {
DoNothing<> doNothingObj{};
ReQuantizeOutput<false> outputProcObj(
doNothingObj,
- C_multiplier,
+ &C_multiplier,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ &Bint8_zero_point,
packAN.getRowOffsetBuffer(),
col_offsets.data(),
- nullptr);
+ nullptr,
+ n);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
// printf ( "tid: %d, num_threads: %d\n", tid, num_threads );
fbgemmPacked(
packAN,
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h
index d2c2a1e..17a07e5 100644
--- a/include/fbgemm/Fbgemm.h
+++ b/include/fbgemm/Fbgemm.h
@@ -847,13 +847,19 @@ class DoSpmdmOnInpBuffer {
const int groups_;
};
+enum class QuantizationGranularity {
+ TENSOR,
+ GROUP,
+ OUT_CHANNEL,
+};
+
/**
* @brief Requantize values in inp buffer and write to out buffer.
* pass the out buffer to next op for further processing.
- *
*/
template <
bool FUSE_RELU,
+ QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR,
typename outT = std::uint8_t,
typename inT = std::int32_t,
typename nextOPType = DoNothing<outT, outT>>
@@ -863,13 +869,15 @@ class ReQuantizeOutput {
using inpType = inT;
ReQuantizeOutput(
nextOPType& nextop,
- float C_multiplier,
+ const float* C_multiplier,
std::int32_t C_zero_point,
std::int32_t Aq_zero_point,
- std::int32_t Bq_zero_point,
+ const std::int32_t* Bq_zero_point,
const std::int32_t* row_offsets,
const std::int32_t* col_offsets,
- const std::int32_t* bias)
+ const std::int32_t* bias,
+ std::uint32_t nCol,
+ int groups = 1)
: nextop_(nextop),
C_multiplier_(C_multiplier),
C_zero_point_(C_zero_point),
@@ -877,7 +885,9 @@ class ReQuantizeOutput {
Bq_zero_point_(Bq_zero_point),
q_row_offsets_(row_offsets),
q_col_offsets_(col_offsets),
- bias_(bias) {}
+ bias_(bias),
+ ncols_(nCol),
+ groups_(groups) {}
template <inst_set_t instSet>
inline int f(
@@ -897,13 +907,15 @@ class ReQuantizeOutput {
int ld_in) const;
nextOPType& nextop_;
- float C_multiplier_;
+ const float* C_multiplier_;
std::int32_t C_zero_point_;
std::int32_t Aq_zero_point_;
- std::int32_t Bq_zero_point_;
+ const std::int32_t* Bq_zero_point_;
const std::int32_t* q_row_offsets_;
const std::int32_t* q_col_offsets_;
const std::int32_t* bias_;
+ std::uint32_t ncols_;
+ int groups_;
};
/**
@@ -912,6 +924,7 @@ class ReQuantizeOutput {
*/
template <
bool FUSE_RELU,
+ QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR,
typename outT = float,
typename inT = std::int32_t,
typename nextOPType = DoNothing<outT, outT>>
@@ -922,12 +935,14 @@ class ReQuantizeForFloat {
ReQuantizeForFloat(
nextOPType& nextop,
float Aq_scale,
- float Bq_scale,
+ const float* Bq_scale,
std::int32_t Aq_zero_point,
- std::int32_t Bq_zero_point,
+ const std::int32_t* Bq_zero_point,
const std::int32_t* row_offsets,
const std::int32_t* col_offsets,
- const float* bias)
+ const float* bias,
+ std::uint32_t nCol,
+ int groups = 1)
: nextop_(nextop),
Aq_scale_(Aq_scale),
Bq_scale_(Bq_scale),
@@ -935,7 +950,9 @@ class ReQuantizeForFloat {
Bq_zero_point_(Bq_zero_point),
q_row_offsets_(row_offsets),
q_col_offsets_(col_offsets),
- bias_(bias) {}
+ bias_(bias),
+ ncols_(nCol),
+ groups_(groups) {}
template <inst_set_t instSet>
inline int f(
@@ -947,12 +964,15 @@ class ReQuantizeForFloat {
private:
nextOPType& nextop_;
- float Aq_scale_, Bq_scale_;
+ float Aq_scale_;
+ const float* Bq_scale_;
std::int32_t Aq_zero_point_;
- std::int32_t Bq_zero_point_;
+ const std::int32_t* Bq_zero_point_;
const std::int32_t* q_row_offsets_;
const std::int32_t* q_col_offsets_;
const float* bias_;
+ std::uint32_t ncols_;
+ int groups_;
};
// type specialized implementation in an include file
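
The enum and extra template parameter above are the heart of this change: C_multiplier and Bq_zero_point become pointers so one output-processing object can carry one value (TENSOR), one per group (GROUP), or one per column (OUT_CHANNEL), with nCol and groups locating the right entry. A hedged construction sketch against this revision's header; the scales, zero points, and n are illustrative, and the offset buffers are assumed to come from packing as in the benchmarks above:

#include "fbgemm/Fbgemm.h"
#include <cstdint>
#include <vector>

using namespace fbgemm;

void build_requant_ops(
    int n,
    const std::int32_t* row_offsets, // e.g. packA.getRowOffsetBuffer()
    const std::vector<std::int32_t>& col_offsets) {
  DoNothing<> doNothingObj{};

  // Per-tensor granularity (the default): arrays of length 1.
  float C_multiplier = 0.1234f;
  std::int32_t Bq_zero_point = -30;
  ReQuantizeOutput<false /*FUSE_RELU*/> reqTensor(
      doNothingObj,
      &C_multiplier,
      5,  // C_zero_point
      43, // Aq_zero_point
      &Bq_zero_point,
      row_offsets,
      col_offsets.data(),
      nullptr, // bias
      n);      // nCol

  // Per-output-channel: one multiplier and one B zero point per column.
  std::vector<float> C_multipliers(n, 0.1234f);
  std::vector<std::int32_t> Bq_zero_points(n, -30);
  ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL> reqPerChannel(
      doNothingObj,
      C_multipliers.data(),
      5,
      43,
      Bq_zero_points.data(),
      row_offsets,
      col_offsets.data(),
      nullptr,
      n);

  (void)reqTensor;
  (void)reqPerChannel;
}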
diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h
index 59a6e0e..88a10bc 100644
--- a/include/fbgemm/OutputProcessing-inl.h
+++ b/include/fbgemm/OutputProcessing-inl.h
@@ -44,9 +44,14 @@ inline int DoSpmdmOnInpBuffer<outT, inT, nextOPType>::f(
return nextop_.template f<instSet>(out, inp, block, ld_out, ld_in);
}
-template <bool FUSE_RELU, typename outT, typename inT, typename nextOPType>
+template <
+ bool FUSE_RELU,
+ QuantizationGranularity Q_GRAN,
+ typename outT,
+ typename inT,
+ typename nextOPType>
template <bool A_SYMMETRIC, bool B_SYMMETRIC, bool HAS_BIAS>
-void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
+void ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f_(
outT* out,
const inT* inp,
const block_type_t& block,
@@ -54,7 +59,13 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
int ld_in) const {
// Adoption of implementation at QNNPACK/src/requantization/fp32-sse2.c
// using AVX2 instructions
- __m256 multiplier_v = _mm256_set1_ps(C_multiplier_);
+ int quant_param_idx = 0;
+ if (Q_GRAN == QuantizationGranularity::GROUP) {
+ int ncol_per_group = ncols_ / groups_;
+ int g = block.col_start / ncol_per_group;
+ quant_param_idx = g;
+ }
+ __m256 multiplier_v = _mm256_set1_ps(C_multiplier_[quant_param_idx]);
__m256i min_v = _mm256_set1_epi8(std::numeric_limits<uint8_t>::min());
__m256i max_v = _mm256_set1_epi8(std::numeric_limits<uint8_t>::max());
@@ -63,7 +74,9 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
(A_SYMMETRIC == (Aq_zero_point_ == 0)) &&
"A_SYMMETRIC == true if and only if Aq_zero_point == 0");
assert(
- (B_SYMMETRIC == (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr)) &&
+ (B_SYMMETRIC ==
+ ((Q_GRAN == QuantizationGranularity::TENSOR && Bq_zero_point_[0] == 0) ||
+ q_row_offsets_ == nullptr)) &&
"B_SYMMETRIC == true if and only if Bq_zero_point == 0 "
"or q_row_offsets_ == nullptr");
assert(
@@ -79,10 +92,22 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
constexpr int VLEN = 8;
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
- std::int32_t row_offset = (q_row_offsets_ && !B_SYMMETRIC)
- ? q_row_offsets_[i - block.row_start] * Bq_zero_point_
- : 0;
+ // Scale row_offset with Bq_zero_point
+ int32_t row_offset = 0;
+ if (B_SYMMETRIC) {
+ row_offset = 0;
+ } else if (
+ Q_GRAN == QuantizationGranularity::TENSOR ||
+ Q_GRAN == QuantizationGranularity::GROUP) {
+ row_offset =
+ q_row_offsets_[i - block.row_start] * Bq_zero_point_[quant_param_idx];
+ } else {
+ assert(
+ Q_GRAN == QuantizationGranularity::OUT_CHANNEL &&
+ "unknown quantization granularity");
+ }
__m256i row_offset_v = _mm256_set1_epi32(row_offset);
+
int j = block.col_start;
for (; j < block.col_start + (block.col_size / (VLEN * 4) * (VLEN * 4));
j += (VLEN * 4)) {
@@ -122,9 +147,33 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
}
if (!B_SYMMETRIC) {
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset_v = _mm256_mullo_epi32(
+ _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+ _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(Bq_zero_point_ + j)));
+ }
x_v = _mm256_sub_epi32(x_v, row_offset_v);
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset_v = _mm256_mullo_epi32(
+ _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+ _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(Bq_zero_point_ + j + VLEN)));
+ }
y_v = _mm256_sub_epi32(y_v, row_offset_v);
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset_v = _mm256_mullo_epi32(
+ _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+ _mm256_loadu_si256(reinterpret_cast<const __m256i*>(
+ Bq_zero_point_ + j + 2 * VLEN)));
+ }
z_v = _mm256_sub_epi32(z_v, row_offset_v);
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset_v = _mm256_mullo_epi32(
+ _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+ _mm256_loadu_si256(reinterpret_cast<const __m256i*>(
+ Bq_zero_point_ + j + 3 * VLEN)));
+ }
w_v = _mm256_sub_epi32(w_v, row_offset_v);
}
if (HAS_BIAS) {
@@ -157,10 +206,24 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
* representation as an FP32 value, and will be rounded to nearest
* FP32 value with ties to even with default MXCSR rounding mode.
*/
- __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
- __m256 y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(y_v), multiplier_v);
- __m256 z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(z_v), multiplier_v);
- __m256 w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(w_v), multiplier_v);
+ __m256 x_scaled_v, y_scaled_v, z_scaled_v, w_scaled_v;
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ x_scaled_v = _mm256_mul_ps(
+ _mm256_cvtepi32_ps(x_v), _mm256_loadu_ps(C_multiplier_ + j));
+ y_scaled_v = _mm256_mul_ps(
+ _mm256_cvtepi32_ps(y_v), _mm256_loadu_ps(C_multiplier_ + j + VLEN));
+ z_scaled_v = _mm256_mul_ps(
+ _mm256_cvtepi32_ps(z_v),
+ _mm256_loadu_ps(C_multiplier_ + j + 2 * VLEN));
+ w_scaled_v = _mm256_mul_ps(
+ _mm256_cvtepi32_ps(w_v),
+ _mm256_loadu_ps(C_multiplier_ + j + 3 * VLEN));
+ } else {
+ x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+ y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(y_v), multiplier_v);
+ z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(z_v), multiplier_v);
+ w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(w_v), multiplier_v);
+ }
/*
* Convert scaled FP32 result to int32_t using CVTPS2DQ instruction.
@@ -238,6 +301,12 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
}
if (!B_SYMMETRIC) {
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset_v = _mm256_mullo_epi32(
+ _mm256_set1_epi32(q_row_offsets_[i - block.row_start]),
+ _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(Bq_zero_point_ + j)));
+ }
x_v = _mm256_sub_epi32(x_v, row_offset_v);
}
if (HAS_BIAS) {
@@ -246,7 +315,13 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(bias_ + j)));
}
- __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+ __m256 x_scaled_v;
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ x_scaled_v = _mm256_mul_ps(
+ _mm256_cvtepi32_ps(x_v), _mm256_loadu_ps(C_multiplier_ + j));
+ } else {
+ x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v);
+ }
__m256i x_rounded_v = _mm256_cvtps_epi32(x_scaled_v);
__m256i x_packed_v = _mm256_adds_epi16(
@@ -281,19 +356,26 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
_mm256_castsi256_si128(x_clamped_v));
} // j loop vectorized
+ // TODO: vectorize remainder using masking
for (; j < block.col_start + block.col_size; ++j) {
int32_t raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)];
if (!A_SYMMETRIC) {
raw -= Aq_zero_point_ * q_col_offsets_[j];
}
if (!B_SYMMETRIC) {
+ if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ row_offset = q_row_offsets_[i - block.row_start] * Bq_zero_point_[j];
+ }
raw -= row_offset;
}
if (HAS_BIAS) {
raw += bias_[j];
}
- float ab = raw * C_multiplier_;
+ float ab = raw *
+ ((Q_GRAN == QuantizationGranularity::OUT_CHANNEL)
+ ? C_multiplier_[j]
+ : C_multiplier_[quant_param_idx]);
long rounded = std::lrintf(ab) + C_zero_point_;
out[i * ld_out + j] = std::max(
@@ -303,9 +385,14 @@ void ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f_(
} // i loop
}
-template <bool FUSE_RELU, typename outT, typename inT, typename nextOPType>
+template <
+ bool FUSE_RELU,
+ QuantizationGranularity Q_GRAN,
+ typename outT,
+ typename inT,
+ typename nextOPType>
template <inst_set_t instSet>
-inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
+inline int ReQuantizeOutput<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
outT* out,
const inT* inp,
const block_type_t& block,
@@ -314,19 +401,35 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
static_assert(
std::is_same<inT, int32_t>::value,
"input data type must be of int32_t type");
+ int ncol_per_group = ncols_ / groups_;
+ assert(
+ block.col_size <= ncol_per_group &&
+ "ReQuantizeOutput should be called at most 1 group at a time.");
+ int g = block.col_start / ncol_per_group;
if (instSet == inst_set_t::anyarch) {
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
inT raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)];
raw -= Aq_zero_point_ * q_col_offsets_[j];
+ int Bq_zero_point_idx;
+ if (Q_GRAN == QuantizationGranularity::TENSOR) {
+ Bq_zero_point_idx = 0;
+ } else if (Q_GRAN == QuantizationGranularity::GROUP) {
+ Bq_zero_point_idx = g;
+ } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ Bq_zero_point_idx = j;
+ } else {
+ assert(false && "unknown quantization granularity");
+ }
if (q_row_offsets_) {
- raw -= q_row_offsets_[i - block.row_start] * Bq_zero_point_;
+ raw -= q_row_offsets_[i - block.row_start] *
+ Bq_zero_point_[Bq_zero_point_idx];
}
if (bias_) {
raw += bias_[j];
}
- float ab = raw * C_multiplier_;
+ float ab = raw * C_multiplier_[Bq_zero_point_idx];
long rounded = std::lrintf(ab) + C_zero_point_;
out[i * ld_out + j] = std::max(
@@ -336,8 +439,11 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
}
} else if (instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) {
if (std::is_same<outT, uint8_t>::value) {
+ bool b_symmetric = (Q_GRAN == QuantizationGranularity::TENSOR &&
+ Bq_zero_point_[0] == 0) ||
+ q_row_offsets_ == nullptr;
if (Aq_zero_point_ == 0) {
- if (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr) {
+ if (b_symmetric) {
if (bias_ == nullptr) {
f_<true, true, false>(out, inp, block, ld_out, ld_in);
} else {
@@ -351,7 +457,7 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
}
}
} else {
- if (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr) {
+ if (b_symmetric) {
if (bias_ == nullptr) {
f_<false, true, false>(out, inp, block, ld_out, ld_in);
} else {
@@ -374,9 +480,14 @@ inline int ReQuantizeOutput<FUSE_RELU, outT, inT, nextOPType>::f(
return nextop_.template f<instSet>(out, out, block, ld_out, ld_out);
}
-template <bool FUSE_RELU, typename outT, typename inT, typename nextOPType>
+template <
+ bool FUSE_RELU,
+ QuantizationGranularity Q_GRAN,
+ typename outT,
+ typename inT,
+ typename nextOPType>
template <inst_set_t instSet>
-inline int ReQuantizeForFloat<FUSE_RELU, outT, inT, nextOPType>::f(
+inline int ReQuantizeForFloat<FUSE_RELU, Q_GRAN, outT, inT, nextOPType>::f(
outT* out,
inT* inp,
const block_type_t& block,
@@ -388,12 +499,28 @@ inline int ReQuantizeForFloat<FUSE_RELU, outT, inT, nextOPType>::f(
static_assert(
std::is_same<float, outT>::value,
"output data type is of not expected type");
+ int ncol_per_group = ncols_ / groups_;
+ assert(
+ block.col_size <= ncol_per_group &&
+ "ReQuantizeOutput should be called at most 1 group at a time.");
+ int g = block.col_start / ncol_per_group;
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
for (int j = block.col_start; j < block.col_start + block.col_size; ++j) {
inT raw = inp[(i - block.row_start) * ld_in + j - block.col_start];
raw -= Aq_zero_point_ * q_col_offsets_[j];
- raw -= q_row_offsets_[i - block.row_start] * Bq_zero_point_;
- float res = raw * Aq_scale_ * Bq_scale_;
+ int Bq_zero_point_idx;
+ if (Q_GRAN == QuantizationGranularity::TENSOR) {
+ Bq_zero_point_idx = 0;
+ } else if (Q_GRAN == QuantizationGranularity::GROUP) {
+ Bq_zero_point_idx = g;
+ } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ Bq_zero_point_idx = j;
+ } else {
+ assert(false && "unknown quantization granularity");
+ }
+ raw -= q_row_offsets_[i - block.row_start] *
+ Bq_zero_point_[Bq_zero_point_idx];
+ float res = raw * Aq_scale_ * Bq_scale_[Bq_zero_point_idx];
if (bias_) {
res += bias_[j];
}
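
In scalar form, the requantization that both the vectorized body and the anyarch fallback above implement reduces to a few lines. A standalone sketch with a hypothetical free function (bias and the FUSE_RELU variant, which raises the lower clamp to C_zero_point, are omitted for brevity):

#include <algorithm>
#include <cmath>
#include <cstdint>

enum class Granularity { TENSOR, GROUP, OUT_CHANNEL };

std::uint8_t requantize_one(
    std::int32_t raw,          // int32 accumulator for element (i, j)
    std::int32_t A_zero_point,
    std::int32_t col_offset_j, // sum over column j of B
    std::int32_t row_offset_i, // sum over row i of A
    const std::int32_t* B_zero_points,
    const float* C_multipliers,
    std::int32_t C_zero_point,
    Granularity gran,
    int group_of_j,
    int j) {
  // Pick which quantization parameter applies to column j.
  int idx;
  if (gran == Granularity::TENSOR) {
    idx = 0;
  } else if (gran == Granularity::GROUP) {
    idx = group_of_j;
  } else {
    idx = j; // OUT_CHANNEL: one parameter per output column
  }
  raw -= A_zero_point * col_offset_j;
  raw -= row_offset_i * B_zero_points[idx];
  float ab = raw * C_multipliers[idx];
  long rounded = std::lrintf(ab) + C_zero_point;
  return static_cast<std::uint8_t>(std::max(0L, std::min(255L, rounded)));
}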
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index 2e2035c..f1ec882 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -240,47 +240,60 @@ void ExecuteKernel<
} // for each j block
}
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- uint8_t,
- ReQuantizeOutput<false /* FUSE_RELU*/>>;
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- uint8_t,
- ReQuantizeOutput<true>>;
-
-template class ExecuteKernel<
- PackAWithQuantRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- float,
- ReQuantizeForFloat<false>>;
-
-template class ExecuteKernel<
- PackAWithQuantRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- float,
- ReQuantizeForFloat<true>>;
-
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- float,
- ReQuantizeForFloat<false>>;
-
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- float,
- ReQuantizeForFloat<true>>;
-
-template class ExecuteKernel<
- PackAMatrix<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- int32_t,
- memCopy<>>;
+////////////////////////////////////////////////////////////////////////////////
+// ReQuantizeOutput
+#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithRowOffset<uint8_t, ACC_T>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ uint8_t, \
+ ReQuantizeOutput<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU) \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_Q_GRANS(ACC_T, false); \
+ INSTANTIATE_Q_GRANS(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ uint8_t, \
+ ReQuantizeOutput<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
template class ExecuteKernel<
PackAMatrix<uint8_t, int16_t>,
@@ -288,110 +301,127 @@ template class ExecuteKernel<
uint8_t,
ReQuantizeOutput<false>>;
-template class ExecuteKernel<
- PackAMatrix<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- int32_t,
- memCopy<>>;
+////////////////////////////////////////////////////////////////////////////////
+// ReQuantizeForFloat
+#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, int32_t>, \
+ PackBMatrix<int8_t, int32_t>, \
+ float, \
+ ReQuantizeForFloat<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_RELU(PACK_A) \
+ INSTANTIATE_Q_GRANS(PACK_A, false); \
+ INSTANTIATE_Q_GRANS(PACK_A, true);
+
+INSTANTIATE_RELU(PackAWithRowOffset);
+INSTANTIATE_RELU(PackAWithQuantRowOffset);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ float, \
+ ReQuantizeForFloat<RELU, Q_GRAN>>;
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
template class ExecuteKernel<
PackAWithRowOffset<uint8_t, int16_t>,
PackBMatrix<int8_t, int16_t>,
- uint8_t,
- DoSpmdmOnInpBuffer<
- ReQuantizeOutput<false>::outType,
- int32_t,
- ReQuantizeOutput<false>>>;
+ float,
+ ReQuantizeForFloat<false /* FUSE_RELU*/>>;
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- DoSpmdmOnInpBuffer<
- ReQuantizeOutput<true>::outType,
- int32_t,
- ReQuantizeOutput<true>>>;
+////////////////////////////////////////////////////////////////////////////////
+// DoSpmdmOnInpBuffer
+#define INSTANTIATE_BASE(RELU, Q_GRAN) \
+ template class ExecuteKernel< \
+ PackAWithRowOffset<uint8_t, int16_t>, \
+ PackBMatrix<int8_t, int16_t>, \
+ uint8_t, \
+ DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<RELU, Q_GRAN>>>;
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- float,
- DoSpmdmOnInpBuffer<
- ReQuantizeForFloat<false>::outType,
- int32_t,
- ReQuantizeForFloat<false>>>;
+#define INSTANTIATE_Q_GRANS(RELU) \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL);
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+INSTANTIATE_Q_GRANS(false);
+INSTANTIATE_Q_GRANS(true);
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- ReQuantizeOutput<true>>;
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
template class ExecuteKernel<
PackAWithRowOffset<uint8_t, int16_t>,
PackBMatrix<int8_t, int16_t>,
- int32_t,
- memCopy<>>;
+ float,
+ DoSpmdmOnInpBuffer<float, int32_t, ReQuantizeForFloat<false>>>;
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- int32_t,
- memCopy<>>;
+////////////////////////////////////////////////////////////////////////////////
+// memCopy
+#define INSTANTIATE_BASE(PACK_A, ACC_T) \
+ template class ExecuteKernel< \
+ PACK_A<uint8_t, ACC_T>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ int32_t, \
+ memCopy<>>;
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int16_t, 3>,
- PackBMatrix<int8_t, int16_t>,
- int32_t,
- memCopy<>>;
+#define INSTANTIATE_ACC_T(PACK_A) \
+ INSTANTIATE_BASE(PACK_A, int32_t) \
+ INSTANTIATE_BASE(PACK_A, int16_t)
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+INSTANTIATE_ACC_T(PackAMatrix);
+INSTANTIATE_ACC_T(PackAWithRowOffset);
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int16_t, 3>,
- PackBMatrix<int8_t, int16_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+#undef INSTANTIATE_ACC_T
+#undef INSTANTIATE_BASE
-template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- int32_t,
- memCopy<>>;
-
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- int32_t,
- memCopy<>>;
+#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM) \
+ template class ExecuteKernel< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ PackBMatrix<int8_t, ACC_T>, \
+ int32_t, \
+ memCopy<>>;
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int32_t, 3>,
- PackBMatrix<int8_t, int32_t>,
- int32_t,
- memCopy<>>;
+#define INSTANTIATE_SPATIAL_DIM(ACC_T) \
+ INSTANTIATE_BASE(ACC_T, 2); \
+ INSTANTIATE_BASE(ACC_T, 3);
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int32_t>,
- PackBMatrix<int8_t, int32_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+INSTANTIATE_SPATIAL_DIM(int32_t);
+INSTANTIATE_SPATIAL_DIM(int16_t);
-template class ExecuteKernel<
- PackAWithIm2Col<uint8_t, int32_t, 3>,
- PackBMatrix<int8_t, int32_t>,
- uint8_t,
- ReQuantizeOutput<false>>;
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_BASE
template class ExecuteKernel<
PackAWithQuantRowOffset<uint8_t, int32_t>,
@@ -400,12 +430,6 @@ template class ExecuteKernel<
memCopy<>>;
template class ExecuteKernel<
- PackAWithRowOffset<uint8_t, int16_t>,
- PackBMatrix<int8_t, int16_t>,
- float,
- ReQuantizeForFloat<false>>;
-
-template class ExecuteKernel<
PackAMatrix<uint8_t, int16_t>,
PackBMatrix<int8_t, int16_t>,
int32_t,
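
The INSTANTIATE_* macros above replace the hand-enumerated template instantiations with a cross product over accumulation type, ReLU fusion, and quantization granularity (plus spatial dimension for the im2col variants). A standalone sketch of the nesting technique with a toy template; each INSTANTIATE_RELU invocation expands to 2 x 3 = 6 explicit instantiations:

#include <cstdint>

enum class Gran { TENSOR, GROUP, OUT_CHANNEL };

template <typename ACC_T, bool RELU, Gran G>
struct Kernel {
  static void run() {}
};

#define INSTANTIATE_BASE(ACC_T, RELU, G) \
  template struct Kernel<ACC_T, RELU, G>;
#define INSTANTIATE_Q_GRANS(ACC_T, RELU)      \
  INSTANTIATE_BASE(ACC_T, RELU, Gran::TENSOR) \
  INSTANTIATE_BASE(ACC_T, RELU, Gran::GROUP)  \
  INSTANTIATE_BASE(ACC_T, RELU, Gran::OUT_CHANNEL)
#define INSTANTIATE_RELU(ACC_T)     \
  INSTANTIATE_Q_GRANS(ACC_T, false) \
  INSTANTIATE_Q_GRANS(ACC_T, true)

INSTANTIATE_RELU(std::int32_t) // 6 explicit instantiations
INSTANTIATE_RELU(std::int16_t) // 6 more

#undef INSTANTIATE_RELU
#undef INSTANTIATE_Q_GRANS
#undef INSTANTIATE_BASE

int main() {}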
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
index 0039daf..a8bf02f 100644
--- a/src/Fbgemm.cc
+++ b/src/Fbgemm.cc
@@ -198,149 +198,70 @@ bool fbgemmSupportedCPU() {
return (cpuinfo_initialize() && cpuinfo_has_x86_avx2());
}
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<true>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
- packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- float* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeForFloat<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
- packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- float* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeForFloat<true>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAMatrix<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- float* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeForFloat<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- float* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeForFloat<true>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int32_t, 3>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int32_t>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int32_t, 3>, uint8_t, int32_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
- packA,
- PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
-
-// 16 bit accumulation functions
-template void fbgemmPacked(
- PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
+////////////////////////////////////////////////////////////////////////////////
+// ReQuantizeOutput
+#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \
+ template void fbgemmPacked( \
+ PackMatrix<PackAWithRowOffset<uint8_t, ACC_T>, uint8_t, ACC_T>& packA, \
+ PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+ uint8_t* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \
+ int thread_id, \
+ int num_threads);
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU) \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_Q_GRANS(ACC_T, false); \
+ INSTANTIATE_Q_GRANS(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ template void fbgemmPacked( \
+ PackMatrix< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ uint8_t, \
+ ACC_T>& packA, \
+ PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+ uint8_t* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const ReQuantizeOutput<RELU, Q_GRAN>& outProcess, \
+ int thread_id, \
+ int num_threads);
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
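For orientation, each INSTANTIATE_* layer above expands one template axis (quantization granularity, ReLU fusion, accumulation type, and, in the second block, spatial dimension), so a single INSTANTIATE_RELU(int32_t) stands for six explicit instantiations. As a sketch, the one instance produced by INSTANTIATE_BASE(int32_t, false, QuantizationGranularity::TENSOR) in the first block is:

    template void fbgemmPacked(
        PackMatrix<PackAWithRowOffset<uint8_t, int32_t>, uint8_t, int32_t>& packA,
        PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
        uint8_t* C,
        int32_t* C_buffer,
        uint32_t ldc,
        const ReQuantizeOutput<false, QuantizationGranularity::TENSOR>& outProcess,
        int thread_id,
        int num_threads);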
template void fbgemmPacked(
PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA,
@@ -352,28 +273,109 @@ template void fbgemmPacked(
int thread_id,
int num_threads);
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<false>>&
- outProcess,
- int thread_id,
- int num_threads);
+////////////////////////////////////////////////////////////////////////////////
+// ReQuantizeForFloat
+#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \
+ template void fbgemmPacked( \
+ PackMatrix<PACK_A<uint8_t, int32_t>, uint8_t, int32_t>& packA, \
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB, \
+ float* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const ReQuantizeForFloat<RELU, Q_GRAN>& outProcess, \
+ int thread_id, \
+ int num_threads);
+
+#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_RELU(PACK_A) \
+ INSTANTIATE_Q_GRANS(PACK_A, false); \
+ INSTANTIATE_Q_GRANS(PACK_A, true);
+
+INSTANTIATE_RELU(PackAWithRowOffset);
+INSTANTIATE_RELU(PackAWithQuantRowOffset);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \
+ template void fbgemmPacked( \
+ PackMatrix< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ uint8_t, \
+ ACC_T>& packA, \
+ PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+ float* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const ReQuantizeForFloat<RELU, Q_GRAN>& outProcess, \
+ int thread_id, \
+ int num_threads);
+
+#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE( \
+ ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \
+ INSTANTIATE_Q_GRANS(ACC_T, RELU, 3);
+
+#define INSTANTIATE_RELU(ACC_T) \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, false); \
+ INSTANTIATE_SPATIAL_DIM(ACC_T, true);
+
+INSTANTIATE_RELU(int32_t);
+INSTANTIATE_RELU(int16_t);
+
+#undef INSTANTIATE_RELU
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
template void fbgemmPacked(
PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- uint8_t* C,
+ float* C,
int32_t* C_buffer,
uint32_t ldc,
- const DoSpmdmOnInpBuffer<uint8_t, int32_t, ReQuantizeOutput<true>>&
- outProcess,
+ const ReQuantizeForFloat<false>& outProcess,
int thread_id,
int num_threads);
+////////////////////////////////////////////////////////////////////////////////
+// DoSpmdmOnInpBuffer
+#define INSTANTIATE_BASE(RELU, Q_GRAN) \
+ template void fbgemmPacked( \
+ PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& \
+ packA, \
+ PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB, \
+ uint8_t* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const DoSpmdmOnInpBuffer< \
+ uint8_t, \
+ int32_t, \
+ ReQuantizeOutput<RELU, Q_GRAN>>& outProcess, \
+ int thread_id, \
+ int num_threads);
+
+#define INSTANTIATE_Q_GRANS(RELU) \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \
+ INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL);
+
+INSTANTIATE_Q_GRANS(false);
+INSTANTIATE_Q_GRANS(true);
+
+#undef INSTANTIATE_Q_GRANS
+#undef INSTANTIATE_BASE
+
template void fbgemmPacked(
PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
@@ -385,49 +387,57 @@ template void fbgemmPacked(
int thread_id,
int num_threads);
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<true>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
+////////////////////////////////////////////////////////////////////////////////
+// memCopy
+#define INSTANTIATE_BASE(PACK_A, ACC_T) \
+ template void fbgemmPacked( \
+ PackMatrix<PACK_A<uint8_t, ACC_T>, uint8_t, ACC_T>& packA, \
+ PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+ int32_t* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const memCopy<>& outProcess, \
+ int thread_id, \
+ int num_threads);
+
+#define INSTANTIATE_ACC_T(PACK_A) \
+ INSTANTIATE_BASE(PACK_A, int32_t) \
+ INSTANTIATE_BASE(PACK_A, int16_t)
+
+INSTANTIATE_ACC_T(PackAMatrix);
+INSTANTIATE_ACC_T(PackAWithRowOffset);
+
+#undef INSTANTIATE_ACC_T
+#undef INSTANTIATE_BASE
+
+#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM) \
+ template void fbgemmPacked( \
+ PackMatrix< \
+ PackAWithIm2Col<uint8_t, ACC_T, SPATIAL_DIM>, \
+ uint8_t, \
+ ACC_T>& packA, \
+ PackMatrix<PackBMatrix<int8_t, ACC_T>, int8_t, ACC_T>& packB, \
+ int32_t* C, \
+ int32_t* C_buffer, \
+ uint32_t ldc, \
+ const memCopy<>& outProcess, \
+ int thread_id, \
+ int num_threads);
+
+#define INSTANTIATE_SPATIAL_DIM(ACC_T) \
+ INSTANTIATE_BASE(ACC_T, 2); \
+ INSTANTIATE_BASE(ACC_T, 3);
+
+INSTANTIATE_SPATIAL_DIM(int32_t);
+INSTANTIATE_SPATIAL_DIM(int16_t);
+
+#undef INSTANTIATE_SPATIAL_DIM
+#undef INSTANTIATE_BASE
template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- int32_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const memCopy<>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int16_t, 3>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
+ PackMatrix<PackAWithQuantRowOffset<uint8_t, int32_t>, uint8_t, int32_t>&
+ packA,
+ PackMatrix<PackBMatrix<int8_t, int32_t>, int8_t, int32_t>& packB,
int32_t* C,
int32_t* C_buffer,
uint32_t ldc,
@@ -436,26 +446,6 @@ template void fbgemmPacked(
int num_threads);
template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
- PackMatrix<PackAWithIm2Col<uint8_t, int16_t, 3>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- uint8_t* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeOutput<false>& outProcess,
- int thread_id,
- int num_threads);
-
-template void fbgemmPacked(
PackMatrix<PackAMatrix<uint8_t, int16_t>, uint8_t, int16_t>& packA,
PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
int32_t* C,
@@ -465,14 +455,4 @@ template void fbgemmPacked(
int thread_id,
int num_threads);
-template void fbgemmPacked(
- PackMatrix<PackAWithRowOffset<uint8_t, int16_t>, uint8_t, int16_t>& packA,
- PackMatrix<PackBMatrix<int8_t, int16_t>, int8_t, int16_t>& packB,
- float* C,
- int32_t* C_buffer,
- uint32_t ldc,
- const ReQuantizeForFloat<false>& outProcess,
- int thread_id,
- int num_threads);
-
} // namespace fbgemm
diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc
index 322287b..50f619a 100644
--- a/src/QuantUtils.cc
+++ b/src/QuantUtils.cc
@@ -436,13 +436,14 @@ void RequantizeAvx2(
DoNothing<> doNothingObj{};
ReQuantizeOutput<false /* FUSE_RELU */> requantizeObj(
doNothingObj,
- params.real_multiplier,
+ &params.real_multiplier,
params.target_qparams.zero_point,
0,
0,
nullptr,
nullptr,
- nullptr);
+ nullptr,
+ len);
requantizeObj.f<inst_set_t::avx2>(dst, src, {0, 1, 0, len}, 0, 0);
}
#endif
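Note the idiom in this hunk: the per-tensor path now reuses the per-group interface by passing the address of a single float as a one-element multiplier array, along with the column count len, so every column indexes quantization group 0.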
diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc
index 097e3b5..369aea3 100644
--- a/src/RefImplementations.cc
+++ b/src/RefImplementations.cc
@@ -57,24 +57,25 @@ void requantize_u8acc32_ref(
int ld,
const int32_t* inp,
uint8_t* out,
- float C_multiplier,
+ const float* C_multiplier,
int32_t C_zero_point,
int32_t A_zero_point,
- int32_t B_zero_point,
+ const int32_t* B_zero_point,
const int32_t* row_offsets,
const int32_t* col_offsets,
const int32_t* bias,
+ int ncols_per_quant_group,
bool fuse_relu) {
for (int i = 0; i < M; ++i) {
for (int j = 0; j < N; ++j) {
int32_t raw = inp[i * ld + j];
raw -= A_zero_point * col_offsets[j];
- raw -= B_zero_point * row_offsets[i];
+ raw -= B_zero_point[j / ncols_per_quant_group] * row_offsets[i];
if (bias) {
raw += bias[j];
}
- float result = raw * C_multiplier;
+ float result = raw * C_multiplier[j / ncols_per_quant_group];
long rounded = lrintf(result) + C_zero_point;
out[i * ld + j] = std::max(
fuse_relu ? static_cast<long>(C_zero_point) : 0l,
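In equation form, the reference loop above now computes, with q(j) = floor(j / ncols_per_quant_group) selecting the quantization group of column j:

\[
\mathrm{out}_{ij} = \mathrm{clamp}\Big(\mathrm{round}\big(M_{q(j)}\,(\mathrm{inp}_{ij} - z_A\,\mathrm{colOff}_j - z_{B,q(j)}\,\mathrm{rowOff}_i + b_j)\big) + z_C,\; \ell,\; 255\Big)
\]

where M is C_multiplier, z_A, z_B, and z_C are the zero points, and the lower clamp bound is \(\ell = z_C\) when fuse_relu is set and 0 otherwise.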
@@ -180,14 +181,15 @@ void col_offsets_with_zero_pt_s8acc32_ref(
int N,
int ld,
const int8_t* Bint8,
- int32_t B_zero_point,
- int32_t* col_offsets) {
+ const int32_t* B_zero_point,
+ int32_t* col_offsets,
+ int ncols_per_quant_group) {
for (int j = 0; j < N; ++j) {
int32_t sum = 0;
for (int k = 0; k < K; ++k) {
sum += Bint8[k * ld + j];
}
- col_offsets[j] = sum - B_zero_point * K;
+ col_offsets[j] = sum - B_zero_point[j / ncols_per_quant_group] * K;
}
}
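A minimal call sketch for the updated signature (all sizes and values hypothetical, not from the patch): with K = 3 rows, N = 4 columns, and ncols_per_quant_group = 2, columns 0-1 are adjusted by zps[0] and columns 2-3 by zps[1].

    // Toy data; row-major K x N int8 weight matrix.
    int8_t B[3 * 4] = {0};
    int32_t zps[2] = {-30, -25}; // one zero point per 2-column quant group
    int32_t col_offsets[4];
    col_offsets_with_zero_pt_s8acc32_ref(
        3, 4, /*ld=*/4, B, zps, col_offsets, /*ncols_per_quant_group=*/2);
    // Post-condition: col_offsets[j] == sum_k B[k * 4 + j] - zps[j / 2] * 3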
@@ -578,13 +580,14 @@ void depthwise_3x3_pad_1_ref(
1,
C_int32.data() + i * K + k,
C + i * K + k,
- C_multiplier,
+ &C_multiplier,
C_zero_point,
A_zero_point,
- B_zero_point,
+ &B_zero_point,
&row_offsets[i * K + k],
col_offsets + k,
- bias ? bias + k : nullptr);
+ bias ? bias + k : nullptr,
+ 1);
}
}
};
@@ -644,13 +647,14 @@ void depthwise_3x3_per_channel_quantization_pad_1_ref(
1,
C_int32.data() + i * K + k,
C + i * K + k,
- C_multiplier[k],
+ &C_multiplier[k],
C_zero_point,
A_zero_point,
- B_zero_point[k],
+ &B_zero_point[k],
&row_offsets[i * K + k],
col_offsets + k,
- bias ? bias + k : nullptr);
+ bias ? bias + k : nullptr,
+ 1);
}
}
};
@@ -781,13 +785,14 @@ void depthwise_3x3x3_pad_1_ref(
1,
C_int32.data() + i * K + k,
C + i * K + k,
- C_multiplier,
+ &C_multiplier,
C_zero_point,
A_zero_point,
- B_zero_point,
+ &B_zero_point,
&row_offsets[i * K + k],
col_offsets + k,
- bias ? bias + k : nullptr);
+ bias ? bias + k : nullptr,
+ 1);
}
}
};
diff --git a/src/RefImplementations.h b/src/RefImplementations.h
index cec4bff..6530eff 100644
--- a/src/RefImplementations.h
+++ b/src/RefImplementations.h
@@ -39,6 +39,11 @@ void requantize_u8acc32_ref(
* @brief Reference implementation of requantization step.
* float multiplier
* @params bias can be nullptr
+ * @params ncols_per_quant_group the number of columns that share the same
+ *         quantization parameter.
+ * ncols_per_quant_group == N : per-tensor quantization
+ * ncols_per_quant_group == N / groups : per-group quantization
+ * ncols_per_quant_group == 1 : per-channel quantization
*/
void requantize_u8acc32_ref(
int M,
@@ -46,13 +51,14 @@ void requantize_u8acc32_ref(
int ld,
const std::int32_t* inp,
std::uint8_t* out,
- float C_multiplier,
+ const float* C_multiplier,
std::int32_t C_zero_point,
std::int32_t A_zero_point,
- std::int32_t B_zero_point,
+ const std::int32_t* B_zero_point,
const std::int32_t* row_offsets,
const std::int32_t* col_offsets,
const std::int32_t* bias,
+ int ncols_per_quant_group,
bool fuse_relu = false);
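The three cases above map one-to-one onto QuantizationGranularity. A small helper of this shape (illustrative only, not part of the patch) captures the mapping that the tests below recompute inline:

    // Hypothetical helper: N = total number of output columns.
    inline int ncols_per_quant_group_for(
        QuantizationGranularity q, int N, int groups) {
      switch (q) {
        case QuantizationGranularity::TENSOR:
          return N;          // one set of qparams for the whole tensor
        case QuantizationGranularity::GROUP:
          return N / groups; // one set per group
        case QuantizationGranularity::OUT_CHANNEL:
        default:
          return 1;          // one set per output channel
      }
    }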
/**
@@ -114,14 +120,18 @@ void row_offsets_u8acc32_ref(
/**
* @brief Reference implementation to compute adjusted col_offsets (sum of
* columns of B and adjusted with B_zero_point)
+ *
+ * @params ncols_per_quant_group see ncols_per_quant_group in
+ * requantize_u8acc32_ref
*/
void col_offsets_with_zero_pt_s8acc32_ref(
int K,
int N,
int ld,
const std::int8_t* Bint8,
- std::int32_t B_zero_point,
- std::int32_t* col_offsets);
+ const std::int32_t* B_zero_point,
+ std::int32_t* col_offsets,
+ int ncols_per_quant_group);
/**
* @brief Reference implementation of SPMDM (sparse matrix times dense matrix).
diff --git a/test/FP16Test.cc b/test/FP16Test.cc
index b5e4f13..0edcc4b 100644
--- a/test/FP16Test.cc
+++ b/test/FP16Test.cc
@@ -73,19 +73,17 @@ TEST_P(FBGemmFP16Test, Test) {
}
cerr << endl;
- aligned_vector<float> A(m * k, 0.f);
- aligned_vector<float> B(k * n, 0.f);
+ // initialize with small numbers
+ aligned_vector<int> Aint(m * k);
+ aligned_vector<int> Bint(k * n);
+ randFill(Aint, 0, 4);
+ randFill(Bint, 0, 4);
+ aligned_vector<float> A(Aint.begin(), Aint.end());
+ aligned_vector<float> B(Bint.begin(), Bint.end());
+
aligned_vector<float> C(m * n, NAN);
- // initialize with small numbers
- randFill(A, 0, 4);
- randFill(B, 0, 4);
- randFill(C, 0, 4);
-
- aligned_vector<float> A_ref, B_ref, C_ref;
- A_ref = A;
- B_ref = B;
- C_ref = C;
+ aligned_vector<float> A_ref(A), B_ref(B), C_ref(C);
if (atrans == matrix_op_t::Transpose) {
transpose_matrix(A_ref.data(), k, m);
diff --git a/test/I8DepthwiseTest.cc b/test/I8DepthwiseTest.cc
index 9a19f0f..f482783 100644
--- a/test/I8DepthwiseTest.cc
+++ b/test/I8DepthwiseTest.cc
@@ -85,10 +85,10 @@ TEST(FBGemmDepthWiseTest, Test3x3) {
aligned_vector<int8_t> B(K * R * S);
aligned_vector<int32_t> C_ref(N * H_OUT * W_OUT * K), C(C_ref.size());
- randFill(A, 0, 86);
+ randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
- randFill(B, -16, 16);
+ randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;
depthwise_3x3_pad_1_ref(
@@ -211,10 +211,10 @@ TEST(FBGemmDepthWiseTest, Test3x3x3) {
aligned_vector<int32_t> C_ref(N * T_OUT * H_OUT * W_OUT * K),
C(C_ref.size());
- randFill(A, 0, 86);
+ randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
- randFill(B, -16, 16);
+ randFill<int8_t>(B, -16, 16);
int32_t B_zero_point = 5;
depthwise_3x3x3_pad_1_ref(
@@ -360,7 +360,7 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) {
int32_t C_num_rows = N * H_OUT * W_OUT;
aligned_vector<int32_t> C_ref(C_num_rows * K), C(C_ref.size());
- randFill(A, 0, 86);
+ randFill<uint8_t>(A, 0, 86);
int32_t A_zero_point = 43;
// Each row of G has a different range to really test per-channel
@@ -368,7 +368,7 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) {
vector<int32_t> B_zero_point(K);
for (auto k = 0; k < K; ++k) {
aligned_vector<int8_t> Bk(R * S);
- randFill(Bk, -16 + k, 16 + k);
+ randFill<int8_t>(Bk, -16 + k, 16 + k);
copy(Bk.begin(), Bk.end(), B.begin() + k * R * S);
B_zero_point[k] = 5 + k;
diff --git a/test/I8SpmdmTest.cc b/test/I8SpmdmTest.cc
index 5bb7703..2090b63 100644
--- a/test/I8SpmdmTest.cc
+++ b/test/I8SpmdmTest.cc
@@ -66,7 +66,7 @@ TEST_P(fbgemmSPMDMTest, TestsSpMDM) {
}
aligned_vector<uint8_t> A(M * K);
- randFill(A, 0, 255);
+ randFill<uint8_t>(A, 0, 255);
CompressedSparseColumn B_csc(K_adjusted, N_adjusted);
vector<int32_t> C(M * N);
@@ -127,13 +127,8 @@ TEST_P(fbgemmSPMDMTest, TestsSpMDM) {
#pragma omp parallel
#endif
{
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
int i_per_thread = (M + num_threads - 1) / num_threads;
int i_begin = std::min(tid * i_per_thread, M);
int i_end = std::min(i_begin + i_per_thread, M);
diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc
index c7de1a8..d8f3f7a 100644
--- a/test/Im2ColFusedRequantizeTest.cc
+++ b/test/Im2ColFusedRequantizeTest.cc
@@ -20,8 +20,22 @@
#include "src/RefImplementations.h"
using namespace std;
+using namespace fbgemm;
-namespace fbgemm {
+vector<QuantizationGranularity> qGranularityVals{
+ QuantizationGranularity::TENSOR,
+ QuantizationGranularity::GROUP,
+ QuantizationGranularity::OUT_CHANNEL};
+
+namespace {
+class fbgemmIm2colTest
+ : public testing::TestWithParam<QuantizationGranularity> {};
+}; // namespace
+
+INSTANTIATE_TEST_CASE_P(
+ InstantiationName,
+ fbgemmIm2colTest,
+ ::testing::ValuesIn(qGranularityVals));
// From Faster-RCNN with ShuffleNet
static vector<conv_param_t<>> shapes = {
@@ -71,7 +85,7 @@ static vector<conv_param_t<>> shapes = {
conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 0, 0}),
};
-template <typename ACC_T>
+template <typename ACC_T, QuantizationGranularity Q_GRAN>
static void Im2colTest() {
for (auto conv_p : shapes) {
for (int groups : {1, 4}) {
@@ -80,29 +94,38 @@ static void Im2colTest() {
}
conv_p.G = groups;
aligned_vector<uint8_t> Aint8(
- conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0);
+ conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC);
aligned_vector<int8_t> Bint8(
- conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0);
+ conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC);
aligned_vector<int32_t> Cint32_ref(
- conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0);
- aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
- aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
-
- int32_t Aint8_zero_point, Bint8_zero_point;
+ conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+
+ int ncols_per_quant_group = conv_p.OC;
+ if (Q_GRAN == QuantizationGranularity::GROUP) {
+ ncols_per_quant_group = conv_p.OC / conv_p.G;
+ } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ ncols_per_quant_group = 1;
+ }
+ int32_t Aint8_zero_point;
+ aligned_vector<int32_t> Bint8_zero_point(
+ conv_p.OC / ncols_per_quant_group);
if (is_same<ACC_T, int32_t>::value) {
- randFill(Aint8, 0, 80);
+ randFill<uint8_t>(Aint8, 0, 80);
Aint8_zero_point = 43;
- randFill(Bint8, -16, 16);
- Bint8_zero_point = -30;
+ randFill<int8_t>(Bint8, -16, 16);
+ randFill(Bint8_zero_point, -50, -10);
} else {
- randFill(Aint8, 0, 5);
+ randFill<uint8_t>(Aint8, 0, 5);
Aint8_zero_point = 4;
- randFill(Bint8, -4, 4);
- Bint8_zero_point = -2;
+ randFill<int8_t>(Bint8, -4, 4);
+ randFill(Bint8_zero_point, -3, -1);
}
- float C_multiplier = 0.1234;
+ aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+ randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
int32_t C_zero_pt = 5;
int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1];
@@ -116,16 +139,16 @@ static void Im2colTest() {
im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(groups * NDim);
+ vector<int32_t> col_offsets(groups * NDim);
for (int g = 0; g < groups; ++g) {
col_offsets_with_zero_pt_s8acc32_ref(
KDimPerGroup,
NDim,
NDim,
Bint8.data() + g * KDimPerGroup * NDim,
- Bint8_zero_point,
- col_offsets.data() + g * NDim);
+ Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
+ col_offsets.data() + g * NDim,
+ ncols_per_quant_group);
}
conv_ref(
@@ -149,13 +172,14 @@ static void Im2colTest() {
conv_p.G * NDim,
Cint32_ref.data() + g * NDim,
Cint8_ref.data() + g * NDim,
- C_multiplier,
+ C_multiplier.data() + g * NDim / ncols_per_quant_group,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
row_offsets.data(),
col_offsets.data() + g * NDim,
- nullptr);
+ nullptr,
+ ncols_per_quant_group);
}
PackBMatrix<int8_t, ACC_T> packedB(
@@ -171,8 +195,7 @@ static void Im2colTest() {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithIm2Col<uint8_t, ACC_T>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, ACC_T> packA(
@@ -183,23 +206,20 @@ static void Im2colTest() {
row_offset_buf.data());
DoNothing<> doNothingObj{};
- ReQuantizeOutput<false> outputProcObj(
+ ReQuantizeOutput<false, Q_GRAN> outputProcObj(
doNothingObj,
- C_multiplier,
+ C_multiplier.data(),
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ Bint8_zero_point.data(),
packA.getRowOffsetBuffer(),
col_offsets.data(),
- nullptr);
+ nullptr,
+ conv_p.G * NDim,
+ conv_p.G);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
fbgemmPacked(
packA,
@@ -236,12 +256,26 @@ static void Im2colTest() {
} // for each shape
}
-TEST(FBGemmIm2colTest, Acc32Test) {
- Im2colTest<int32_t>();
+TEST_P(fbgemmIm2colTest, Acc32Test) {
+ QuantizationGranularity q_granularity = GetParam();
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ Im2colTest<int32_t, QuantizationGranularity::TENSOR>();
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ Im2colTest<int32_t, QuantizationGranularity::GROUP>();
+ } else {
+ Im2colTest<int32_t, QuantizationGranularity::OUT_CHANNEL>();
+ }
}
-TEST(FBGemmIm2colTest, Acc16Test) {
- Im2colTest<int16_t>();
+TEST_P(fbgemmIm2colTest, Acc16Test) {
+ QuantizationGranularity q_granularity = GetParam();
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ Im2colTest<int16_t, QuantizationGranularity::TENSOR>();
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ Im2colTest<int16_t, QuantizationGranularity::GROUP>();
+ } else {
+ Im2colTest<int16_t, QuantizationGranularity::OUT_CHANNEL>();
+ }
}
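The two TEST_P bodies above (and the 3D variants below) repeat the same runtime-enum to compile-time-template dispatch. Assuming nothing else varies per case, they could share a sketch like:

    template <typename ACC_T>
    static void runIm2colTestFor(QuantizationGranularity q) {
      if (q == QuantizationGranularity::TENSOR) {
        Im2colTest<ACC_T, QuantizationGranularity::TENSOR>();
      } else if (q == QuantizationGranularity::GROUP) {
        Im2colTest<ACC_T, QuantizationGranularity::GROUP>();
      } else {
        Im2colTest<ACC_T, QuantizationGranularity::OUT_CHANNEL>();
      }
    }
    // Usage: TEST_P(fbgemmIm2colTest, Acc32Test) { runIm2colTestFor<int32_t>(GetParam()); }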
static vector<conv_param_t<3>> shapes_3d = {
@@ -319,7 +353,7 @@ static vector<conv_param_t<3>> shapes_3d = {
3>(1, 8, 16, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0, 0, 0}),
};
-template <typename ACC_T>
+template <typename ACC_T, QuantizationGranularity Q_GRAN>
static void Im2col3DTest() {
for (auto conv_p : shapes_3d) {
for (int groups : {1, 4}) {
@@ -329,32 +363,39 @@ static void Im2col3DTest() {
conv_p.G = groups;
aligned_vector<uint8_t> Aint8(
conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IN_DIM[2] *
- conv_p.IC,
- 0);
+ conv_p.IC);
aligned_vector<int8_t> Bint8(
- conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC * conv_p.OC, 0);
+ conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC * conv_p.OC);
aligned_vector<int32_t> Cint32_ref(
conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] *
- conv_p.OUT_DIM[2] * conv_p.OC,
- 0);
- aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
- aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
-
- int32_t Aint8_zero_point, Bint8_zero_point;
+ conv_p.OUT_DIM[2] * conv_p.OC);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+
+ int ncols_per_quant_group = conv_p.OC;
+ if (Q_GRAN == QuantizationGranularity::GROUP) {
+ ncols_per_quant_group = conv_p.OC / conv_p.G;
+ } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) {
+ ncols_per_quant_group = 1;
+ }
+ int32_t Aint8_zero_point;
+ aligned_vector<int32_t> Bint8_zero_point(
+ conv_p.OC / ncols_per_quant_group);
if (is_same<ACC_T, int32_t>::value) {
- randFill(Aint8, 0, 80);
+ randFill<uint8_t>(Aint8, 0, 80);
Aint8_zero_point = 43;
- randFill(Bint8, -16, 16);
- Bint8_zero_point = -30;
+ randFill<int8_t>(Bint8, -16, 16);
+ randFill(Bint8_zero_point, -50, -10);
} else {
- randFill(Aint8, 0, 5);
+ randFill<uint8_t>(Aint8, 0, 5);
Aint8_zero_point = 4;
- randFill(Bint8, -4, 4);
- Bint8_zero_point = -2;
+ randFill<int8_t>(Bint8, -4, 4);
+ randFill(Bint8_zero_point, -3, -1);
}
- float C_multiplier = 0.1234;
+ aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+ randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
int32_t C_zero_pt = 5;
int MDim =
@@ -369,16 +410,16 @@ static void Im2col3DTest() {
im2col3d_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data());
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(groups * NDim);
+ vector<int32_t> col_offsets(groups * NDim);
for (int g = 0; g < groups; ++g) {
col_offsets_with_zero_pt_s8acc32_ref(
KDimPerGroup,
NDim,
NDim,
Bint8.data() + g * KDimPerGroup * NDim,
- Bint8_zero_point,
- col_offsets.data() + g * NDim);
+ Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
+ col_offsets.data() + g * NDim,
+ ncols_per_quant_group);
}
conv3d_ref(
@@ -402,13 +443,14 @@ static void Im2col3DTest() {
conv_p.G * NDim,
Cint32_ref.data() + g * NDim,
Cint8_ref.data() + g * NDim,
- C_multiplier,
+ C_multiplier.data() + g * NDim / ncols_per_quant_group,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ Bint8_zero_point.data() + g * NDim / ncols_per_quant_group,
row_offsets.data(),
col_offsets.data() + g * NDim,
- nullptr);
+ nullptr,
+ ncols_per_quant_group);
}
PackBMatrix<int8_t, ACC_T> packedB(
@@ -424,8 +466,7 @@ static void Im2col3DTest() {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithIm2Col<uint8_t, ACC_T, 3>::rowOffsetBufferSize());
PackAWithIm2Col<uint8_t, ACC_T, 3> packA(
@@ -436,23 +477,20 @@ static void Im2col3DTest() {
row_offset_buf.data());
DoNothing<> doNothingObj{};
- ReQuantizeOutput<false> outputProcObj(
+ ReQuantizeOutput<false, Q_GRAN> outputProcObj(
doNothingObj,
- C_multiplier,
+ C_multiplier.data(),
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ Bint8_zero_point.data(),
packA.getRowOffsetBuffer(),
col_offsets.data(),
- nullptr);
+ nullptr,
+ conv_p.G * NDim,
+ conv_p.G);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
fbgemmPacked(
packA,
@@ -495,12 +533,24 @@ static void Im2col3DTest() {
} // for each shape
}
-TEST(FBGemmIm2colTest, 3DAcc32Test) {
- Im2col3DTest<int32_t>();
+TEST_P(fbgemmIm2colTest, 3DAcc32Test) {
+ QuantizationGranularity q_granularity = GetParam();
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ Im2col3DTest<int32_t, QuantizationGranularity::TENSOR>();
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ Im2col3DTest<int32_t, QuantizationGranularity::GROUP>();
+ } else {
+ Im2col3DTest<int32_t, QuantizationGranularity::OUT_CHANNEL>();
+ }
}
-TEST(FBGemmIm2colTest, 3DAcc16Test) {
- Im2col3DTest<int16_t>();
+TEST_P(fbgemmIm2colTest, 3DAcc16Test) {
+ QuantizationGranularity q_granularity = GetParam();
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ Im2col3DTest<int16_t, QuantizationGranularity::TENSOR>();
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ Im2col3DTest<int16_t, QuantizationGranularity::GROUP>();
+ } else {
+ Im2col3DTest<int16_t, QuantizationGranularity::OUT_CHANNEL>();
+ }
}
-
-} // namespace fbgemm
diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc
index cb614cd..55f6e7f 100644
--- a/test/PackedRequantizeAcc16Test.cc
+++ b/test/PackedRequantizeAcc16Test.cc
@@ -25,17 +25,34 @@
using namespace std;
using namespace fbgemm;
-std::vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
+vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
matrix_op_t::Transpose};
+vector<QuantizationGranularity> qGranularityVals{
+ QuantizationGranularity::TENSOR,
+ QuantizationGranularity::GROUP,
+ QuantizationGranularity::OUT_CHANNEL};
+
namespace {
-class fbgemmu8s8acc16test : public testing::TestWithParam<
- std::tuple<matrix_op_t, matrix_op_t, bool>> {};
+class fbgemmu8s8acc16WithQuantGranularityTest
+ : public testing::TestWithParam<
+ tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {};
+class fbgemmu8s8acc16Test
+ : public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t, bool>> {};
}; // namespace
INSTANTIATE_TEST_CASE_P(
InstantiationName,
- fbgemmu8s8acc16test,
+ fbgemmu8s8acc16WithQuantGranularityTest,
+ ::testing::Combine(
+ ::testing::Values(matrix_op_t::NoTranspose),
+ ::testing::ValuesIn(transposeVals),
+ ::testing::Bool(),
+ ::testing::ValuesIn(qGranularityVals)));
+
+INSTANTIATE_TEST_CASE_P(
+ InstantiationName,
+ fbgemmu8s8acc16Test,
::testing::Combine(
::testing::Values(matrix_op_t::NoTranspose),
::testing::ValuesIn(transposeVals),
@@ -77,11 +94,12 @@ static vector<vector<int>> GetShapes_() {
* @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit
* accumulation. Output processing: requantization -> nothing
*/
-TEST_P(fbgemmu8s8acc16test, Test) {
+TEST_P(fbgemmu8s8acc16WithQuantGranularityTest, Test) {
vector<vector<int>> shapes(GetShapes_());
matrix_op_t atrans, btrans;
bool test_ld;
- tie(atrans, btrans, test_ld) = GetParam();
+ QuantizationGranularity q_granularity;
+ tie(atrans, btrans, test_ld, q_granularity) = GetParam();
for (auto shape : shapes) {
for (int groups : {1, 3, 4}) {
@@ -93,22 +111,21 @@ TEST_P(fbgemmu8s8acc16test, Test) {
}
int k_per_group = k / groups;
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ aligned_vector<uint8_t> Aint8(m * k);
- aligned_vector<int8_t> Bint8(k * n, 0);
- aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+ aligned_vector<int8_t> Bint8_ref(k * n);
- aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
- aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
- aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_ref(m * n * groups);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
int32_t Aint8_zero_point = 43;
- randFill(Bint8_ref, -128, 127);
- Bint8 = Bint8_ref;
+ randFill<int8_t>(Bint8_ref, -128, 127);
+ aligned_vector<int8_t> Bint8(Bint8_ref);
if (btrans == matrix_op_t::Transpose) {
aligned_vector<int8_t> Bint8_temp(Bint8.size());
@@ -124,7 +141,6 @@ TEST_P(fbgemmu8s8acc16test, Test) {
Bint8 = Bint8_temp;
}
- int32_t Bint8_zero_point = -30;
// To test lda != k , we just reduce k by half and use the original k
// as lda.
int n_adjusted = n;
@@ -137,23 +153,33 @@ TEST_P(fbgemmu8s8acc16test, Test) {
}
}
+ int ncols_per_quant_group = groups * n_adjusted;
+ if (q_granularity == QuantizationGranularity::GROUP) {
+ ncols_per_quant_group = n_adjusted;
+ } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
+ ncols_per_quant_group = 1;
+ }
+ aligned_vector<int32_t> Bint8_zero_point(
+ groups * n_adjusted / ncols_per_quant_group);
+ randFill(Bint8_zero_point, -60, 0);
+
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(groups * n_adjusted);
+ vector<int32_t> col_offsets(groups * n_adjusted);
for (int g = 0; g < groups; ++g) {
col_offsets_with_zero_pt_s8acc32_ref(
k_per_group,
n_adjusted,
n,
Bint8_ref.data() + g * k_per_group * n,
- Bint8_zero_point,
- col_offsets.data() + g * n_adjusted);
+ Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
+ col_offsets.data() + g * n_adjusted,
+ ncols_per_quant_group);
}
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
+ vector<int32_t> row_offsets(m);
- float C_multiplier = 0.1234;
+ aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+ randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
int32_t C_zero_pt = 5;
int brow = 256;
@@ -183,13 +209,14 @@ TEST_P(fbgemmu8s8acc16test, Test) {
groups * n,
Cint32_ref.data() + g * n_adjusted,
Cint8_ref.data() + g * n_adjusted,
- C_multiplier,
+ C_multiplier.data() + g * n_adjusted / ncols_per_quant_group,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
row_offsets.data(),
col_offsets.data() + g * n_adjusted,
- nullptr);
+ nullptr,
+ ncols_per_quant_group);
}
PackBMatrix<int8_t, int16_t> packedBN(
@@ -205,8 +232,7 @@ TEST_P(fbgemmu8s8acc16test, Test) {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
PackAWithRowOffset<uint8_t, int16_t> packAN(
@@ -219,34 +245,79 @@ TEST_P(fbgemmu8s8acc16test, Test) {
groups,
row_offset_buf.data());
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
+
DoNothing<> doNothingObj{};
- ReQuantizeOutput<false> outputProcObj(
- doNothingObj,
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- nullptr);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ ReQuantizeOutput<false> outputProcObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
- fbgemmPacked(
- packAN,
- packedBN,
- Cint8_fb.data(),
- Cint32_buffer.data(),
- groups * n,
- outputProcObj,
- tid,
- num_threads);
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ ReQuantizeOutput<false, QuantizationGranularity::GROUP> outputProcObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
+
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ } else {
+ ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>
+ outputProcObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
+
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ }
} // omp parallel
compare_validate_buffers(
@@ -264,11 +335,12 @@ TEST_P(fbgemmu8s8acc16test, Test) {
* @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit
* accumulation. Output processing: spmdm -> requantization -> nothing
*/
-TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
+TEST_P(fbgemmu8s8acc16WithQuantGranularityTest, SpMDMTest) {
vector<vector<int>> shapes(GetShapes_());
matrix_op_t atrans, btrans;
bool test_ld;
- tie(atrans, btrans, test_ld) = GetParam();
+ QuantizationGranularity q_granularity;
+ tie(atrans, btrans, test_ld, q_granularity) = GetParam();
for (auto shape : shapes) {
for (int groups : {1, 3, 4}) {
@@ -283,21 +355,19 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
}
int k_per_group = k / groups;
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ aligned_vector<uint8_t> Aint8(m * k);
+ aligned_vector<int8_t> Bint8(k * n);
- aligned_vector<int8_t> Bint8(k * n, 0);
- aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+ aligned_vector<int32_t> Cint32_ref(m * n * groups);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());
- aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
- aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
- aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
-
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
int32_t Aint8_zero_point = 43;
- randFill(Bint8, -128, 127);
+ randFill<int8_t>(Bint8, -128, 127);
// To test lda != k , we just reduce k by half and use the original k
// as lda.
@@ -311,18 +381,27 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
}
}
- int32_t Bint8_zero_point = -30;
+ int ncols_per_quant_group = groups * n_adjusted;
+ if (q_granularity == QuantizationGranularity::GROUP) {
+ ncols_per_quant_group = n_adjusted;
+ } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
+ ncols_per_quant_group = 1;
+ }
+ aligned_vector<int32_t> Bint8_zero_point(
+ groups * n_adjusted / ncols_per_quant_group);
+ randFill(Bint8_zero_point, -50, -10);
+
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(groups * n_adjusted);
+ vector<int32_t> col_offsets(groups * n_adjusted);
for (int g = 0; g < groups; ++g) {
col_offsets_with_zero_pt_s8acc32_ref(
k_per_group,
n_adjusted,
n,
- Bint8_ref.data() + g * k_per_group * n,
- Bint8_zero_point,
- col_offsets.data() + g * n_adjusted);
+ Bint8.data() + g * k_per_group * n,
+ Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
+ col_offsets.data() + g * n_adjusted,
+ ncols_per_quant_group);
}
CompressedSparseColumn B_csc(k_per_group, groups * n_adjusted);
@@ -366,7 +445,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
}
B_csc.ColPtr()[groups * n_adjusted] = total_nnz;
- Bint8_ref = Bint8;
+ aligned_vector<int8_t> Bint8_ref(Bint8);
if (btrans == matrix_op_t::Transpose) {
aligned_vector<int8_t> Bint8_temp(Bint8.size());
@@ -382,10 +461,10 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
Bint8 = Bint8_temp;
}
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
+ vector<int32_t> row_offsets(m);
- float C_multiplier = 0.1234;
+ aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+ randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
int32_t C_zero_pt = 5;
int brow = 256;
@@ -428,13 +507,14 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
groups * n,
Cint32_ref.data() + g * n_adjusted,
Cint8_ref.data() + g * n_adjusted,
- C_multiplier,
+ C_multiplier.data() + g * n_adjusted / ncols_per_quant_group,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
row_offsets.data(),
col_offsets.data() + g * n_adjusted,
- nullptr);
+ nullptr,
+ ncols_per_quant_group);
}
PackBMatrix<int8_t, int16_t> packedB(
@@ -450,8 +530,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
PackAWithRowOffset<uint8_t, int16_t> packAN(
@@ -464,51 +543,106 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
groups,
row_offset_buf.data());
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
+
// spmdm -> requantization -> nothing
// construct an output processing pipeline in reverse order
// i.e. last output operation first
// Last operation should always be DoNothing with
// correct input and output type.
DoNothing<> doNothingObj{};
- // The second last operation is requantization back
- // to int8
- ReQuantizeOutput<false> reqObj(
- doNothingObj,
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- nullptr);
- // the top most (first) operation in the output processing
- // pipeline is spmdm
- // outType = final output type after fullly processing through
- // pipeline inType = initial input type at the first call to the whole
- // pipeline
- DoSpmdmOnInpBuffer<
- ReQuantizeOutput<false>::outType,
- int32_t,
- ReQuantizeOutput<false>>
- spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);
-
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
- fbgemmPacked(
- packAN,
- packedB,
- Cint8_fb.data(),
- Cint32_fb.data(),
- groups * n,
- spmdmObj,
- tid,
- num_threads);
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ // The second last operation is requantization back
+ // to int8
+ ReQuantizeOutput<false> reqObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
+          // the topmost (first) operation in the output processing
+          // pipeline is spmdm
+          // outType = final output type after fully processing through the
+          // pipeline; inType = initial input type at the first call to the
+          // whole pipeline
+ DoSpmdmOnInpBuffer<
+ ReQuantizeOutput<false>::outType,
+ int32_t,
+ ReQuantizeOutput<false>>
+ spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);
+
+ fbgemmPacked(
+ packAN,
+ packedB,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ groups * n,
+ spmdmObj,
+ tid,
+ num_threads);
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ ReQuantizeOutput<false, QuantizationGranularity::GROUP> reqObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
+ DoSpmdmOnInpBuffer<
+ ReQuantizeOutput<false>::outType,
+ int32_t,
+ ReQuantizeOutput<false, QuantizationGranularity::GROUP>>
+ spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);
+
+ fbgemmPacked(
+ packAN,
+ packedB,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ groups * n,
+ spmdmObj,
+ tid,
+ num_threads);
+ } else {
+ ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>
+ reqObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
+ DoSpmdmOnInpBuffer<
+ ReQuantizeOutput<false>::outType,
+ int32_t,
+ ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>>
+ spmdmObj(reqObj, Aint8.data(), k, B_csc, groups);
+
+ fbgemmPacked(
+ packAN,
+ packedB,
+ Cint8_fb.data(),
+ Cint32_fb.data(),
+ groups * n,
+ spmdmObj,
+ tid,
+ num_threads);
+ }
}
compare_validate_buffers(
@@ -527,7 +661,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
* @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit
* accumulation. Output processing: nothing
*/
-TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
+TEST_P(fbgemmu8s8acc16Test, NoRequantizeTest) {
vector<vector<int>> shapes(GetShapes_());
matrix_op_t atrans, btrans;
bool test_ld;
@@ -543,20 +677,19 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
}
int k_per_group = k / groups;
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ aligned_vector<uint8_t> Aint8(m * k);
- aligned_vector<int8_t> Bint8(k * n, 0);
- aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+ aligned_vector<int8_t> Bint8_ref(k * n);
- aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
- aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_ref(m * n * groups);
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
int32_t Aint8_zero_point = 43;
- randFill(Bint8_ref, -128, 127);
- Bint8 = Bint8_ref;
+ randFill<int8_t>(Bint8_ref, -128, 127);
+ aligned_vector<int8_t> Bint8(Bint8_ref);
if (btrans == matrix_op_t::Transpose) {
aligned_vector<int8_t> Bint8_temp(Bint8.size());
@@ -586,20 +719,19 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
}
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(groups * n_adjusted);
+ vector<int32_t> col_offsets(groups * n_adjusted);
for (int g = 0; g < groups; ++g) {
col_offsets_with_zero_pt_s8acc32_ref(
k_per_group,
n_adjusted,
n,
Bint8_ref.data() + g * k_per_group * n,
- Bint8_zero_point,
- col_offsets.data() + g * n_adjusted);
+ &Bint8_zero_point,
+ col_offsets.data() + g * n_adjusted,
+ n_adjusted);
}
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
+ vector<int32_t> row_offsets(m);
int brow = 256;
for (int g = 0; g < groups; ++g) {
@@ -636,8 +768,7 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize());
PackAWithRowOffset<uint8_t, int16_t> packAN(
@@ -654,13 +785,8 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
DoNothing<int32_t, int32_t> doNothingObj{};
memCopy<> outputProcObj(doNothingObj);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
fbgemmPacked(
packAN,
diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc
index a5744c0..9873e3f 100644
--- a/test/PackedRequantizeTest.cc
+++ b/test/PackedRequantizeTest.cc
@@ -25,17 +25,35 @@
using namespace std;
using namespace fbgemm;
-std::vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
- matrix_op_t::Transpose};
+vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
+ matrix_op_t::Transpose};
+
+vector<QuantizationGranularity> qGranularityVals{
+ QuantizationGranularity::TENSOR,
+ QuantizationGranularity::GROUP,
+ QuantizationGranularity::OUT_CHANNEL};
namespace {
-class fbgemmu8s8acc32test : public testing::TestWithParam<
- std::tuple<matrix_op_t, matrix_op_t, bool>> {};
+class fbgemmu8s8acc32WithQuantGranularityTest
+ : public testing::TestWithParam<
+ tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {};
+class fbgemmu8s8acc32Test
+ : public testing::TestWithParam<
+ tuple<matrix_op_t, matrix_op_t, bool>> {};
}; // namespace
INSTANTIATE_TEST_CASE_P(
InstantiationName,
- fbgemmu8s8acc32test,
+ fbgemmu8s8acc32WithQuantGranularityTest,
+ ::testing::Combine(
+ ::testing::Values(matrix_op_t::NoTranspose),
+ ::testing::ValuesIn(transposeVals),
+ ::testing::Bool(),
+ ::testing::ValuesIn(qGranularityVals)));
+
+INSTANTIATE_TEST_CASE_P(
+ InstantiationName,
+ fbgemmu8s8acc32Test,
::testing::Combine(
::testing::Values(matrix_op_t::NoTranspose),
::testing::ValuesIn(transposeVals),
@@ -77,11 +95,12 @@ static vector<vector<int>> GetShapes_() {
* @brief Unit test for uint8 matrix A, int8 matrix B, and 32-bit
* accumulation. Output processing: requantization -> nothing
*/
-TEST_P(fbgemmu8s8acc32test, Test) {
+TEST_P(fbgemmu8s8acc32WithQuantGranularityTest, Test) {
vector<vector<int>> shapes(GetShapes_());
matrix_op_t atrans, btrans;
bool test_ld;
- tie(atrans, btrans, test_ld) = GetParam();
+ QuantizationGranularity q_granularity;
+ tie(atrans, btrans, test_ld, q_granularity) = GetParam();
for (auto shape : shapes) {
for (int groups : {1, 3, 4}) {
@@ -95,22 +114,21 @@ TEST_P(fbgemmu8s8acc32test, Test) {
int k_per_group = k / groups;
// mxk matrix
- aligned_vector<uint8_t> Aint8(m * k, 0);
+ aligned_vector<uint8_t> Aint8(m * k);
// kxn matrix
- aligned_vector<int8_t> Bint8(k * n, 0);
- aligned_vector<int8_t> Bint8_ref(Bint8.size(), 0);
+ aligned_vector<int8_t> Bint8_ref(k * n);
- aligned_vector<int32_t> Cint32_ref(m * n * groups, 0);
- aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_fb(Cint32_ref.size(), 0);
- aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size(), 0);
+ aligned_vector<int32_t> Cint32_ref(m * n * groups);
+ aligned_vector<uint8_t> Cint8_ref(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_fb(Cint32_ref.size());
+ aligned_vector<uint8_t> Cint8_fb(Cint32_ref.size());
+ aligned_vector<int32_t> Cint32_buffer(Cint32_ref.size());
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
int32_t Aint8_zero_point = 43;
- randFill(Bint8_ref, -128, 127);
+ randFill<int8_t>(Bint8_ref, -128, 127);
for (int g = 0; g < groups; ++g) {
avoidOverflow(
m,
@@ -122,7 +140,7 @@ TEST_P(fbgemmu8s8acc32test, Test) {
n);
}
- Bint8 = Bint8_ref;
+ aligned_vector<int8_t> Bint8(Bint8_ref);
// initialize bias
aligned_vector<int32_t> bias_int32(groups * n);
@@ -146,7 +164,6 @@ TEST_P(fbgemmu8s8acc32test, Test) {
Bint8 = Bint8_temp;
}
- int32_t Bint8_zero_point = -30;
// To test lda != k , we just reduce k by half and use the original k
// as lda.
int n_adjusted = n;
@@ -159,23 +176,33 @@ TEST_P(fbgemmu8s8acc32test, Test) {
}
}
+ int ncols_per_quant_group = groups * n_adjusted;
+ if (q_granularity == QuantizationGranularity::GROUP) {
+ ncols_per_quant_group = n_adjusted;
+ } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
+ ncols_per_quant_group = 1;
+ }
+ aligned_vector<int32_t> Bint8_zero_point(
+ groups * n_adjusted / ncols_per_quant_group);
+ randFill(Bint8_zero_point, -50, -10);
+
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(groups * n_adjusted);
+ vector<int32_t> col_offsets(groups * n_adjusted);
for (int g = 0; g < groups; ++g) {
col_offsets_with_zero_pt_s8acc32_ref(
k_per_group,
n_adjusted,
n,
Bint8_ref.data() + g * k_per_group * n,
- Bint8_zero_point,
- col_offsets.data() + g * n_adjusted);
+ Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
+ col_offsets.data() + g * n_adjusted,
+ ncols_per_quant_group);
}
- vector<int32_t> row_offsets;
- row_offsets.resize(m);
+ vector<int32_t> row_offsets(m);
- float C_multiplier = 0.001234;
+ aligned_vector<float> C_multiplier(Bint8_zero_point.size());
+ randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2);
int32_t C_zero_pt = 5;
for (int g = 0; g < groups; ++g) {
@@ -203,13 +230,14 @@ TEST_P(fbgemmu8s8acc32test, Test) {
groups * n,
Cint32_ref.data() + g * n_adjusted,
Cint8_ref.data() + g * n_adjusted,
- C_multiplier,
+ C_multiplier.data() + g * n_adjusted / ncols_per_quant_group,
C_zero_pt,
Aint8_zero_point,
- Bint8_zero_point,
+ Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
row_offsets.data(),
col_offsets.data() + g * n_adjusted,
- bias ? (bias + g * n_adjusted) : nullptr);
+ bias ? (bias + g * n_adjusted) : nullptr,
+ ncols_per_quant_group);
}
PackBMatrix<int8_t> packedBN(
@@ -225,8 +253,7 @@ TEST_P(fbgemmu8s8acc32test, Test) {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
PackAWithRowOffset<uint8_t> packAN(
@@ -239,34 +266,80 @@ TEST_P(fbgemmu8s8acc32test, Test) {
groups,
row_offset_buf.data());
- DoNothing<> doNothingObj{};
- ReQuantizeOutput<false> outputProcObj(
- doNothingObj,
- C_multiplier,
- C_zero_pt,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- bias);
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ DoNothing<> doNothingObj{};
- fbgemmPacked(
- packAN,
- packedBN,
- Cint8_fb.data(),
- Cint32_buffer.data(),
- groups * n,
- outputProcObj,
- tid,
- num_threads);
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ ReQuantizeOutput<false> outputProcObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ bias,
+ groups * n_adjusted,
+ groups);
+
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ ReQuantizeOutput<false, QuantizationGranularity::GROUP>
+ outputProcObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ bias,
+ groups * n_adjusted,
+ groups);
+
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ } else {
+ ReQuantizeOutput<false, QuantizationGranularity::OUT_CHANNEL>
+ outputProcObj(
+ doNothingObj,
+ C_multiplier.data(),
+ C_zero_pt,
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ bias,
+ groups * n_adjusted,
+ groups);
+
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cint8_fb.data(),
+ Cint32_buffer.data(),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ }
}
// printMatrix(matrix_op_t::NoTranspose, Cint32_local.data(),
// m, n_adjusted, n, "C local");
@@ -287,11 +360,12 @@ TEST_P(fbgemmu8s8acc32test, Test) {
* accumulation. Directly output fp32 matrix C. Output processing:
* requantization -> nothing
*/
-TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
+TEST_P(fbgemmu8s8acc32WithQuantGranularityTest, TestFloatInputOutput) {
vector<vector<int>> shapes(GetShapes_());
matrix_op_t atrans, btrans;
bool test_ld;
- tie(atrans, btrans, test_ld) = GetParam();
+ QuantizationGranularity q_granularity;
+ tie(atrans, btrans, test_ld, q_granularity) = GetParam();
for (auto shape : shapes) {
for (int groups : {1, 3, 4}) {
@@ -303,26 +377,26 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
}
int k_per_group = k / groups;
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(Afp32.size(), 0);
+ aligned_vector<float> Afp32(m * k);
+ aligned_vector<uint8_t> Aint8(Afp32.size());
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(Bfp32.size(), 0);
+ aligned_vector<float> Bfp32(k * n);
+ aligned_vector<int8_t> Bint8(Bfp32.size());
- aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f);
- aligned_vector<float> Cfp32_fb(Cfp32_ref.size(), 0.0f);
+ aligned_vector<float> Cfp32_ref(m * n * groups);
+ aligned_vector<float> Cfp32_fb(Cfp32_ref.size());
- aligned_vector<uint8_t> Cint8_fb(Cfp32_ref.size(), 0);
- aligned_vector<int32_t> Cint32_buffer(Cfp32_ref.size(), 0);
+ aligned_vector<uint8_t> Cint8_fb(Cfp32_ref.size());
+ aligned_vector<int32_t> Cint32_buffer(Cfp32_ref.size());
- randFill(Aint8, 0, 255);
+ randFill<uint8_t>(Aint8, 0, 255);
int32_t Aint8_zero_point = 43;
float Aint8_scale = 0.11;
for (auto i = 0; i < Afp32.size(); ++i) {
Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point);
}
- randFill(Bint8, -128, 127);
+ randFill<int8_t>(Bint8, -128, 127);
for (int g = 0; g < groups; ++g) {
avoidOverflow(
m,
@@ -333,11 +407,6 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
Bint8.data() + g * k_per_group * n,
n);
}
- int32_t Bint8_zero_point = -30;
- float Bint8_scale = 0.49;
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bfp32[i] = Bint8_scale * (Bint8[i] - Bint8_zero_point);
- }
// To test lda != k , we just reduce k by half and use the original k
// as lda.
@@ -351,17 +420,37 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
}
}
+ int ncols_per_quant_group = groups * n_adjusted;
+ if (q_granularity == QuantizationGranularity::GROUP) {
+ ncols_per_quant_group = n_adjusted;
+ } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) {
+ ncols_per_quant_group = 1;
+ }
+ aligned_vector<int32_t> Bint8_zero_point(
+ groups * n_adjusted / ncols_per_quant_group);
+ randFill(Bint8_zero_point, -50, -10);
+ aligned_vector<float> Bint8_scale(Bint8_zero_point.size());
+ randFill(Bint8_scale, 0.49f / 2, 0.49f * 3 / 2);
+ for (int i = 0; i < k; ++i) {
+ int g = i / k_per_group;
+ for (int j = 0; j < n_adjusted; ++j) {
+ int quant_group = (g * n_adjusted + j) / ncols_per_quant_group;
+ Bfp32[i * n + j] = Bint8_scale[quant_group] *
+ (Bint8[i * n + j] - Bint8_zero_point[quant_group]);
+ }
+ }
+
// computing column offset
- vector<int32_t> col_offsets;
- col_offsets.resize(groups * n_adjusted);
+ vector<int32_t> col_offsets(groups * n_adjusted);
for (int g = 0; g < groups; ++g) {
col_offsets_with_zero_pt_s8acc32_ref(
k_per_group,
n_adjusted,
n,
Bint8.data() + g * k_per_group * n,
- Bint8_zero_point,
- col_offsets.data() + g * n_adjusted);
+ Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group,
+ col_offsets.data() + g * n_adjusted,
+ ncols_per_quant_group);
}
if (btrans == matrix_op_t::Transpose) {
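The reference helper called above now takes an array of per-quant-group zero points plus ncols_per_quant_group. Its expected behavior, read off the call site (a sketch of my interpretation, not the FBGEMM reference implementation itself):

#include <cstdint>

// Sketch of the expected column-offset computation (assumed from the call
// site above): per column j, sum B's K entries in that column, then fold
// in the quant group's zero point so the output pipe can subtract
// A_zp * col_offsets[j] directly.
void col_offsets_sketch(
    int K, int N, int ld, const int8_t* B,
    const int32_t* B_zero_point, int ncols_per_quant_group,
    int32_t* col_offsets) {
  for (int j = 0; j < N; ++j) {
    int32_t sum = 0;
    for (int k = 0; k < K; ++k) {
      sum += B[k * ld + j];
    }
    col_offsets[j] = sum - B_zero_point[j / ncols_per_quant_group] * K;
  }
}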
@@ -404,8 +493,7 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
#pragma omp parallel
#endif
{
- vector<int32_t> row_offset_buf;
- row_offset_buf.resize(
+ vector<int32_t> row_offset_buf(
PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
PackAWithQuantRowOffset<uint8_t> packAN(
@@ -420,47 +508,96 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
groups,
row_offset_buf.data());
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
+
DoNothing<float, float> doNothingObj{};
- ReQuantizeForFloat<false> outputProcObj(
- doNothingObj,
- Aint8_scale,
- Bint8_scale,
- Aint8_zero_point,
- Bint8_zero_point,
- packAN.getRowOffsetBuffer(),
- col_offsets.data(),
- nullptr);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ if (q_granularity == QuantizationGranularity::TENSOR) {
+ ReQuantizeForFloat<false> outputProcObj(
+ doNothingObj,
+ Aint8_scale,
+ Bint8_scale.data(),
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
- fbgemmPacked(
- packAN,
- packedBN,
- Cfp32_fb.data(),
- reinterpret_cast<int32_t*>(Cfp32_fb.data()),
- groups * n,
- outputProcObj,
- tid,
- num_threads);
- }
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cfp32_fb.data(),
+ reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ } else if (q_granularity == QuantizationGranularity::GROUP) {
+ ReQuantizeForFloat<false, QuantizationGranularity::GROUP>
+ outputProcObj(
+ doNothingObj,
+ Aint8_scale,
+ Bint8_scale.data(),
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
- float maximum = *max_element(Cfp32_ref.begin(), Cfp32_ref.end());
- float minimum = *min_element(Cfp32_ref.begin(), Cfp32_ref.end());
- float atol = (maximum - minimum) / 255 / 1.9;
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cfp32_fb.data(),
+ reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ } else {
+ ReQuantizeForFloat<false, QuantizationGranularity::OUT_CHANNEL>
+ outputProcObj(
+ doNothingObj,
+ Aint8_scale,
+ Bint8_scale.data(),
+ Aint8_zero_point,
+ Bint8_zero_point.data(),
+ packAN.getRowOffsetBuffer(),
+ col_offsets.data(),
+ nullptr,
+ groups * n_adjusted,
+ groups);
+ fbgemmPacked(
+ packAN,
+ packedBN,
+ Cfp32_fb.data(),
+ reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+ groups * n,
+ outputProcObj,
+ tid,
+ num_threads);
+ }
+ }
+
+ float maximum = 0;
+ for (int i = 0; i < m; ++i) {
+ for (int j = 0; j < groups * n_adjusted; ++j) {
+ float c = Cfp32_ref[i * groups * n + j];
+ maximum = std::max(maximum, std::abs(c));
+ }
+ }
compare_validate_buffers(
Cfp32_ref.data(),
Cfp32_fb.data(),
m,
groups * n_adjusted,
groups * n,
- atol);
+ maximum * 1e-5f);
} // for each groups
} // for each shape
}
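Note the tolerance change at the end of this hunk: the old absolute bound (maximum - minimum) / 255 / 1.9 was sized to one int8 quantization step, while the fp32 output path now checks against a bound relative to the largest reference magnitude. A hypothetical element-wise check equivalent to the compare_validate_buffers call above:

#include <cmath>

// Hypothetical helper (name is mine): the bound scales with the reference
// output's magnitude rather than with a fixed quantization step.
bool close_enough(float ref, float test, float max_abs_ref) {
  return std::fabs(ref - test) <= max_abs_ref * 1e-5f;
}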
@@ -470,7 +607,7 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
* accumulation. Output processing: requantization -> nothing. Symmetric: the
* zero point is 0.
*/
-TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
+TEST_P(fbgemmu8s8acc32Test, TestSymmetricQuantizedInputOutput) {
vector<vector<int>> shapes(GetShapes_());
matrix_op_t atrans, btrans;
bool test_ld;
@@ -486,22 +623,17 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
}
int k_per_group = k / groups;
- aligned_vector<float> Afp32(m * k, 0.0f);
- aligned_vector<uint8_t> Aint8(Afp32.size(), 0);
+ aligned_vector<uint8_t> Aint8(m * k);
+ aligned_vector<int8_t> Bint8(k * n);
- aligned_vector<float> Bfp32(k * n, 0.0f);
- aligned_vector<int8_t> Bint8(Bfp32.size(), 0);
+ aligned_vector<float> Cfp32_ref(m * n * groups);
+ aligned_vector<int32_t> Cint32_fb(Cfp32_ref.size());
- aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f);
- aligned_vector<int32_t> Cint32_fb(Cfp32_ref.size(), 0);
-
- randFill(Afp32, 0, 255);
- for (auto i = 0; i < Afp32.size(); i++) {
- Aint8[i] = (uint8_t)Afp32[i];
- }
+ randFill<uint8_t>(Aint8, 0, 255);
+ aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
// initialize B matrix
- randFill(Bfp32, -128, 127);
+ randFill<int8_t>(Bint8, -128, 127);
for (int g = 0; g < groups; ++g) {
avoidOverflow(
m,
@@ -509,13 +641,11 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
k_per_group,
Aint8.data() + g * k_per_group,
k,
- Bfp32.data() + g * k_per_group * n,
+ Bint8.data() + g * k_per_group * n,
n);
}
- for (auto i = 0; i < Bfp32.size(); ++i) {
- Bint8[i] = (int8_t)Bfp32[i];
- }
+ aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
      // To test lda != k, we just reduce k by half and use the original k
// as lda.
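The same threading pattern recurs throughout this diff: each OpenMP thread asks the new BenchUtils helpers for its id and the thread count, then passes the (tid, num_threads) pair to fbgemmPacked so the kernel can partition the output among threads. In outline — this is a fragment, with packAN, packedBN, and outputProcObj standing in for the test's locals:

// Per-thread call pattern used by these tests (fragment, not self-contained):
#pragma omp parallel
{
  int num_threads = fbgemm_get_num_threads(); // 1 when built without OpenMP
  int tid = fbgemm_get_thread_num();          // 0 when built without OpenMP
  fbgemmPacked(
      packAN, packedBN, Cint32_fb.data(), Cint32_fb.data(),
      groups * n, outputProcObj, tid, num_threads);
}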
@@ -577,13 +707,8 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
DoNothing<int32_t, int32_t> doNothingObj{};
memCopy<> outputProcObj(doNothingObj);
-#ifdef _OPENMP
- int num_threads = omp_get_num_threads();
- int tid = omp_get_thread_num();
-#else
- int num_threads = 1;
- int tid = 0;
-#endif
+ int num_threads = fbgemm_get_num_threads();
+ int tid = fbgemm_get_thread_num();
fbgemmPacked(
packAN,