12 files changed, 122 insertions, 69 deletions
diff --git a/bench/GEMMsTunableBenchmark.cc b/bench/GEMMsTunableBenchmark.cc
index 04eed8a..a65b51f 100644
--- a/bench/GEMMsTunableBenchmark.cc
+++ b/bench/GEMMsTunableBenchmark.cc
@@ -291,6 +291,8 @@ int main(int /* unused */, char** /* unused */) {
       assert(0 && "architecture not supported");
       return 0;
     }
+  } else {
+    throw std::runtime_error("Failed to initialize cpuinfo!");
   }
 
   set<vector<int>> incorrect_configs;
diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc
index 4175d65..f7292fd 100644
--- a/src/ExecuteKernelU8S8.cc
+++ b/src/ExecuteKernelU8S8.cc
@@ -45,11 +45,19 @@ ExecuteKernel<
       outputProcess_(outputProcess),
       thread_id_(thread_id),
       num_threads_(num_threads) {
+  if (!cpuinfo_initialize()) {
+    throw std::runtime_error("Failed to initialize cpuinfo!");
+  }
   if (params) {
-    mbSize_ = params->MCB;
-    nbSize_ = params->NCB;
-    nrMinSize_ = params->NR_MIN;
-    nrSize_ = params->NR;
+    if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
+      mbSize_ = params->MCB;
+      nbSize_ = params->NCB;
+      nrMinSize_ = params->NR_MIN;
+      nrSize_ = params->NR;
+    } else {
+      // TODO: Have default slower path
+      assert(0 && "unsupported architecture");
+    }
   } else {
     if (fbgemmHasAvx512Support()) {
       mbSize_ = PackingTraits<
@@ -110,28 +118,24 @@ void ExecuteKernel<
 
   typename BaseType::jit_micro_kernel_fp fn;
 
-  if (cpuinfo_initialize()) {
-    if (fbgemmHasAvx512Support()) {
-      fn = BaseType::template getOrCreate<inst_set_t::avx512>(
-          accum,
-          packed_rows_A,
-          packedB_.blockColSize(),
-          packedA_.numPackedCols(),
-          nbSize_);
-    } else if (fbgemmHasAvx2Support()) {
-      fn = BaseType::template getOrCreate<inst_set_t::avx2>(
-          accum,
-          packed_rows_A,
-          packedB_.blockColSize(),
-          packedA_.numPackedCols(),
-          nbSize_);
-    } else {
-      // TODO: Have default slower path
-      assert(0 && "unsupported architecture");
-      return;
-    }
+  if (fbgemmHasAvx512Support()) {
+    fn = BaseType::template getOrCreate<inst_set_t::avx512>(
+        accum,
+        packed_rows_A,
+        packedB_.blockColSize(),
+        packedA_.numPackedCols(),
+        nbSize_);
+  } else if (fbgemmHasAvx2Support()) {
+    fn = BaseType::template getOrCreate<inst_set_t::avx2>(
+        accum,
+        packed_rows_A,
+        packedB_.blockColSize(),
+        packedA_.numPackedCols(),
+        nbSize_);
   } else {
-    throw std::runtime_error("Failed to initialize cpuinfo!");
+    // TODO: Have default slower path
+    assert(0 && "unsupported architecture");
+    return;
   }
 
 #ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
@@ -144,20 +148,16 @@ void ExecuteKernel<
     if (jb == bColBlocks - 1) {
       int nc = ((packedB_.lastBcol() - 1) / nrMinSize_ + 1) * nrMinSize_;
       if (nc != nbSize_) {
-        if (cpuinfo_initialize()) {
-          if (fbgemmHasAvx512Support()) {
-            fn = BaseType::template getOrCreate<inst_set_t::avx512>(
-                accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
-          } else if (fbgemmHasAvx2Support()) {
-            fn = BaseType::template getOrCreate<inst_set_t::avx2>(
-                accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
-          } else {
-            // TODO: Have default slower path
-            assert(0 && "unsupported architecture");
-            return;
-          }
+        if (fbgemmHasAvx512Support()) {
+          fn = BaseType::template getOrCreate<inst_set_t::avx512>(
+              accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
+        } else if (fbgemmHasAvx2Support()) {
+          fn = BaseType::template getOrCreate<inst_set_t::avx2>(
+              accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
         } else {
-          throw std::runtime_error("Failed to initialize cpuinfo!");
+          // TODO: Have default slower path
+          assert(0 && "unsupported architecture");
+          return;
         }
       }
     }
diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc
index a40f38a..523f556 100644
--- a/src/Fbgemm.cc
+++ b/src/Fbgemm.cc
@@ -50,9 +50,11 @@ void fbgemmPacked(
   // Run time CPU detection
   if (cpuinfo_initialize()) {
     if (blocking_params) {
-      MCB = blocking_params->MCB;
-      KCB = blocking_params->KCB;
-      MR = blocking_params->MR;
+      if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
+        MCB = blocking_params->MCB;
+        KCB = blocking_params->KCB;
+        MR = blocking_params->MR;
+      }
     } else {
       if (fbgemmHasAvx512Support()) {
         MCB = PackingTraits<
diff --git a/src/GroupwiseConvAcc32Avx2.cc b/src/GroupwiseConvAcc32Avx2.cc
index 0e1b85b..0032e72 100644
--- a/src/GroupwiseConvAcc32Avx2.cc
+++ b/src/GroupwiseConvAcc32Avx2.cc
@@ -1789,6 +1789,9 @@ void fbgemmGroupwiseConv(
     int num_threads) {
   typedef ReQuantizeOutput<FUSE_RELU, Q_GRAN> processOutputType;
 
+  if (!cpuinfo_initialize()) {
+    throw std::runtime_error("Failed to initialize cpuinfo!");
+  }
   if (!fbgemmOptimizedGConv<SPATIAL_DIM>(conv_param) ||
       (!fbgemmHasAvx512Support() && !fbgemmHasAvx2Support())) {
     return fbgemmGroupwiseConvBase_<
diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc
index 89ec13e..87adaba 100644
--- a/src/PackAMatrix.cc
+++ b/src/PackAMatrix.cc
@@ -31,10 +31,18 @@ PackAMatrix<T, accT>::PackAMatrix(
       trans_(trans),
       smat_(smat),
       ld_(ld) {
+  if (!cpuinfo_initialize()) {
+    throw std::runtime_error("Failed to initialize cpuinfo!");
+  }
   if (params) {
+    if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
       BaseType::brow_ = params->MCB;
       BaseType::bcol_ = params->KCB;
       row_interleave_B_ = params->ROW_INTERLEAVE;
+    } else {
+      // TODO: Have default slower path
+      assert(0 && "unsupported architecture");
+    }
   } else {
     if (fbgemmHasAvx512Support()) {
       BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc
index fb4556c..e55dd4e 100644
--- a/src/PackAWithIm2Col.cc
+++ b/src/PackAWithIm2Col.cc
@@ -46,11 +46,19 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
       a_zero_pt_(a_zero_pt) {
   static_assert(
       SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "unsupported conv dimension ");
+  if (!cpuinfo_initialize()) {
+    throw std::runtime_error("Failed to initialize cpuinfo!");
+  }
 
   if (params) {
+    if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
       BaseType::brow_ = params->MCB;
       BaseType::bcol_ = params->KCB;
       row_interleave_B_ = params->ROW_INTERLEAVE;
+    } else {
+      // TODO: Have default slower path
+      assert(0 && "unsupported architecture");
+    }
   } else {
     if (fbgemmHasAvx512Support()) {
       BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
@@ -470,19 +478,19 @@ template <typename T, typename accT, int SPATIAL_DIM>
 int PackAWithIm2Col<T, accT, SPATIAL_DIM>::rowOffsetBufferSize(
     const BlockingFactors* params) {
   if (cpuinfo_initialize()) {
-    if (params){
+    if (params) {
       return params->MCB;
     } else {
-    if (fbgemmHasAvx512Support()) {
-          return PackingTraits<T, accT, inst_set_t::avx512>::MCB;
-    } else if (fbgemmHasAvx2Support()) {
+      if (fbgemmHasAvx512Support()) {
+        return PackingTraits<T, accT, inst_set_t::avx512>::MCB;
+      } else if (fbgemmHasAvx2Support()) {
         return PackingTraits<T, accT, inst_set_t::avx2>::MCB;
       } else {
-      // TODO: Have default slower path
-      assert(0 && "unsupported architecture");
-      return -1;
+        // TODO: Have default slower path
+        assert(0 && "unsupported architecture");
+        return -1;
+      }
     }
-  }
   } else {
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
diff --git a/src/PackAWithQuantRowOffset.cc b/src/PackAWithQuantRowOffset.cc
index 175425f..7572a51 100644
--- a/src/PackAWithQuantRowOffset.cc
+++ b/src/PackAWithQuantRowOffset.cc
@@ -42,6 +42,9 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset(
       scale_(scale),
       zero_pt_(zero_pt),
       row_offset_(row_offset) {
+  if (!cpuinfo_initialize()) {
+    throw std::runtime_error("Failed to initialize cpuinfo!");
+  }
   rowOffsetAllocatedHere = false;
   if (params) {
     if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
diff --git a/src/PackAWithRowOffset.cc b/src/PackAWithRowOffset.cc
index 139a6d3..b791817 100644
--- a/src/PackAWithRowOffset.cc
+++ b/src/PackAWithRowOffset.cc
@@ -36,11 +36,19 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset(
       smat_(smat),
       ld_(ld),
       row_offset_(row_offset) {
+  if (!cpuinfo_initialize()) {
+    throw std::runtime_error("Failed to initialize cpuinfo!");
+  }
   rowOffsetAllocatedHere = false;
   if (params) {
+    if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
       BaseType::brow_ = params->MCB;
       BaseType::bcol_ = params->KCB;
       row_interleave_B_ = params->ROW_INTERLEAVE;
+    } else {
+      // TODO: Have default slower path
+      assert(0 && "unsupported architecture");
+    }
   } else {
     if (fbgemmHasAvx512Support()) {
       BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::MCB;
@@ -179,19 +187,19 @@ template <typename T, typename accT>
 int PackAWithRowOffset<T, accT>::rowOffsetBufferSize(
     const BlockingFactors* params) {
   if (cpuinfo_initialize()) {
-    if (params){
+    if (params) {
       return params->MCB;
     } else {
-    if (fbgemmHasAvx512Support()) {
+      if (fbgemmHasAvx512Support()) {
         return PackingTraits<T, accT, inst_set_t::avx512>::MCB;
       } else if (fbgemmHasAvx2Support()) {
         return PackingTraits<T, accT, inst_set_t::avx2>::MCB;
-    } else {
-      // TODO: Have default slower path
-      assert(0 && "unsupported architecture");
-      return -1;
+      } else {
+        // TODO: Have default slower path
+        assert(0 && "unsupported architecture");
+        return -1;
+      }
     }
-  }
   } else {
     throw std::runtime_error("Failed to initialize cpuinfo!");
   }
diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc
index 472c802..970a741 100644
--- a/src/PackBMatrix.cc
+++ b/src/PackBMatrix.cc
@@ -185,10 +185,18 @@ PackBMatrix<T, accT>::PackBMatrix(
       trans_(trans),
       smat_(smat),
       ld_(ld) {
+  if (!cpuinfo_initialize()) {
+    throw std::runtime_error("Failed to initialize cpuinfo!");
+  }
   if (params) {
+    if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
       BaseType::brow_ = params->KCB;
       BaseType::bcol_ = params->NCB;
       row_interleave_ = params->ROW_INTERLEAVE;
+    } else {
+      // TODO: Have default slower path
+      assert(0 && "unsupported architecture");
+    }
   } else {
     if (fbgemmHasAvx512Support()) {
       BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc
index e93b97c..c9a68a6 100644
--- a/src/PackMatrix.cc
+++ b/src/PackMatrix.cc
@@ -33,11 +33,19 @@ int PackMatrix<PT, inpType, accType>::packedBufferSize(
     int rows,
     int cols,
     const BlockingFactors* params) {
+  if (!cpuinfo_initialize()) {
+    throw std::runtime_error("Failed to initialize cpuinfo!");
+  }
   int MCB, KCB, NCB;
   if (params) {
-    MCB = params->MCB;
-    NCB = params->NCB;
-    KCB = params->KCB;
+    if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
+      MCB = params->MCB;
+      NCB = params->NCB;
+      KCB = params->KCB;
+    } else {
+      // TODO: Have default slower path
+      assert(0 && "unsupported architecture");
+    }
   } else {
     if (fbgemmHasAvx512Support()) {
       MCB = PackingTraits<inpType, accType, inst_set_t::avx512>::MCB;
diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc
index 82f8c2a..1ab00d1 100644
--- a/src/QuantUtils.cc
+++ b/src/QuantUtils.cc
@@ -176,7 +176,7 @@ void Quantize<uint8_t>(
     uint8_t* dst,
     int len,
     const TensorQuantizationParams& qparams) {
-  bool avx2_support = fbgemmHasAvx2Support();
+  bool avx2_support = cpuinfo_initialize() && fbgemmHasAvx2Support();
   bool fma_support = cpuinfo_has_x86_fma3();
   if (avx2_support && fma_support && qparams.precision == 8) {
     // fast path
@@ -221,7 +221,8 @@ void Requantize<uint8_t>(
     uint8_t* dst,
     const int len,
     const RequantizationParams& params) {
-  if (params.target_qparams.precision == 8 && fbgemmHasAvx2Support()) {
+  if (params.target_qparams.precision == 8 && cpuinfo_initialize() &&
+      fbgemmHasAvx2Support()) {
     RequantizeAvx2(src, dst, len, params);
   } else {
     for (int i = 0; i < len; ++i) {
@@ -237,7 +238,7 @@ void RequantizeFixedPoint(
     int len,
     const RequantizationParams& params) {
   if (std::is_same<T, uint8_t>::value && params.target_qparams.precision == 8 &&
-      fbgemmHasAvx2Support()) {
+      cpuinfo_initialize() && fbgemmHasAvx2Support()) {
     RequantizeFixedPointAvx2(src, dst, len, params);
   } else {
     for (int i = 0; i < len; ++i) {
@@ -267,7 +268,8 @@ void RequantizeFixedPoint<uint8_t>(
     uint8_t* dst,
     const int len,
     const RequantizationParams& params) {
-  if (params.target_qparams.precision == 8 && fbgemmHasAvx2Support()) {
+  if (params.target_qparams.precision == 8 && cpuinfo_initialize() &&
+      fbgemmHasAvx2Support()) {
     RequantizeFixedPointAvx2(src, dst, len, params);
   } else {
     for (int i = 0; i < len; ++i) {
diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc
index 875c9e1..7f43ced 100644
--- a/src/QuantUtilsAvx2.cc
+++ b/src/QuantUtilsAvx2.cc
@@ -142,16 +142,17 @@ void RequantizeAvx2(
     int len,
     const RequantizationParams& params) {
   DoNothing<> doNothingObj{};
+  int32_t Bq_zero_point[] = { 0 };
   ReQuantizeOutput<false /* FUSE_RELU */> requantizeObj(
       doNothingObj,
       &params.real_multiplier,
       params.target_qparams.zero_point,
-      0,
-      nullptr,
-      nullptr,
-      nullptr,
-      nullptr,
-      len);
+      0, // Aq_zero_point
+      Bq_zero_point, // Bq_zero_point
+      nullptr, // row_offsets
+      nullptr, // col_offsets
+      nullptr, // bias
+      len); // ncol
   requantizeObj.f<inst_set_t::avx2>(dst, src, {0, 1, 0, len}, 0, 0);
 }