10 files changed, 94 insertions, 32 deletions
diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h
index c5f9563..8f9f6cd 100644
--- a/include/fbgemm/Fbgemm.h
+++ b/include/fbgemm/Fbgemm.h
@@ -413,6 +413,18 @@ class FBGEMM_API PackBMatrix final
       const BlockingFactors* params = nullptr);
 
   /**
+   * Wraps a B matrix that is already in packed form: prepackedmat is forwarded
+   * to the base class and the packing step is skipped.
+   */
+  PackBMatrix(matrix_op_t trans,
+              std::int32_t nRow,
+              std::int32_t nCol,
+              inpType* prepackedmat,
+              std::int32_t ld,
+              int groups = 1,
+              const BlockingFactors* params = nullptr);
+
+  /**
    * Weight matrices are usually constant so worth pre-packing.
    */
   bool isPrePacked() const {
diff --git a/include/fbgemm/FbgemmFP16.h b/include/fbgemm/FbgemmFP16.h
index ed56e31..7d1b031 100644
--- a/include/fbgemm/FbgemmFP16.h
+++ b/include/fbgemm/FbgemmFP16.h
@@ -223,7 +223,7 @@ class PackedGemmMatrixFP16 {
   }
 
   int matSize() const {
-    return size_;
+    return static_cast<int>(size_); // explicit conversion of size_ to the int return type
   }
   int numRows() const {
     return nrow_;
diff --git a/include/fbgemm/Types.h b/include/fbgemm/Types.h
index 370a29a..fda5262 100644
--- a/include/fbgemm/Types.h
+++ b/include/fbgemm/Types.h
@@ -36,11 +36,11 @@ static inline float16 cpu_float2half_rn(float f) {
 
   // Get rid of +Inf/-Inf, +0/-0.
   if (u > 0x477fefff) {
-    ret = sign | 0x7c00U;
+    ret = static_cast<fbgemm::float16>(sign | 0x7c00U);
     return ret;
   }
   if (u < 0x33000001) {
-    ret = (sign | 0x0000);
+    ret = static_cast<fbgemm::float16>(sign | 0x0000);
     return ret;
   }
 
@@ -70,7 +70,7 @@ static inline float16 cpu_float2half_rn(float f) {
     }
   }
 
-  ret = (sign | (exponent << 10) | mantissa);
+  ret = static_cast<fbgemm::float16>(sign | (exponent << 10) | mantissa);
 
   return ret;
 }
diff --git a/src/FbgemmI8Depthwise2DAvx2-inl.h b/src/FbgemmI8Depthwise2DAvx2-inl.h
index 7488a3c..a07dcd7 100644
--- a/src/FbgemmI8Depthwise2DAvx2-inl.h
+++ b/src/FbgemmI8Depthwise2DAvx2-inl.h
@@ -318,7 +318,7 @@ static ALWAYS_INLINE void inner_prod_2d_packed_(
     std::int32_t* C,
     int remainder,
     std::int32_t* row_offsets) {
-  if (S == 3) {
+  if (S == 3) { // S picks the 3x3 or the 5x5 packed inner product
     inner_prod_3x3_packed_<SUM_A, REMAINDER, PER_CHANNEL_QUANTIZATION>(
         H,
         W,
@@ -332,22 +332,22 @@ static ALWAYS_INLINE void inner_prod_2d_packed_(
         C,
         remainder,
         row_offsets);
-  } else {
-    assert(S == 5);
-    inner_prod_5x5_packed_<SUM_A, REMAINDER, PER_CHANNEL_QUANTIZATION>(
-        H,
-        W,
-        K,
-        h_in,
-        w_in,
-        A,
-        A_zero_point,
-        Bp,
-        B_zero_point,
-        C,
-        remainder,
-        row_offsets);
-  }
+  } else {
+    assert(S == 5);
+    inner_prod_5x5_packed_<SUM_A, REMAINDER, PER_CHANNEL_QUANTIZATION>(
+        H,
+        W,
+        K,
+        h_in,
+        w_in,
+        A,
+        A_zero_point,
+        Bp,
+        B_zero_point,
+        C,
+        remainder,
+        row_offsets);
+  }
 }
 
 template <
diff --git a/src/FbgemmI8Depthwise3x3Avx2.cc b/src/FbgemmI8Depthwise3x3Avx2.cc
index b2ad049..523af73 100644
--- a/src/FbgemmI8Depthwise3x3Avx2.cc
+++ b/src/FbgemmI8Depthwise3x3Avx2.cc
@@ -4,7 +4,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#define FBGEMM_EXPORTS
+//#define FBGEMM_EXPORTS
 #include "fbgemm/FbgemmI8DepthwiseAvx2.h"
 
 #include <stdexcept> // for logic_error
diff --git a/src/FbgemmI8DepthwiseAvx2-inl.h b/src/FbgemmI8DepthwiseAvx2-inl.h
index 18dda3b..7128916 100644
--- a/src/FbgemmI8DepthwiseAvx2-inl.h
+++ b/src/FbgemmI8DepthwiseAvx2-inl.h
@@ -230,7 +230,7 @@ static ALWAYS_INLINE void inner_prod_packed_(
   __m256i a_sum_temp[2] = {0, 0};
 
   int k = 0;
-  if (K >= 4) {
+  if (K >= 4) { // K < 4 skips the x4 step; the else branch zeroes c instead
     madd_epi16x4_packed<SUM_A>(
         a_v[0],
         a_v[1],
@@ -261,12 +261,12 @@ static ALWAYS_INLINE void inner_prod_packed_(
       c[2] = _mm256_add_epi32(c[2], c_temp[2]);
       c[3] = _mm256_add_epi32(c[3], c_temp[3]);
     }
-  } else {
-    c[0] = _mm256_setzero_si256();
-    c[1] = _mm256_setzero_si256();
-    c[2] = _mm256_setzero_si256();
-    c[3] = _mm256_setzero_si256();
-  }
+  } else {
+    c[0] = _mm256_setzero_si256();
+    c[1] = _mm256_setzero_si256();
+    c[2] = _mm256_setzero_si256();
+    c[3] = _mm256_setzero_si256();
+  }
 
   if (K - k == 3) {
     madd_epi16x3_packed<SUM_A>(
diff --git a/src/FbgemmI8DepthwisePerChannelQuantAvx2.cc b/src/FbgemmI8DepthwisePerChannelQuantAvx2.cc
index ec26287..883558b 100644
--- a/src/FbgemmI8DepthwisePerChannelQuantAvx2.cc
+++ b/src/FbgemmI8DepthwisePerChannelQuantAvx2.cc
@@ -4,7 +4,7 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
-#define FBGEMM_EXPORTS
+//#define FBGEMM_EXPORTS
 #include "fbgemm/FbgemmI8DepthwiseAvx2.h"
 
 #include <stdexcept> // for logic_error
diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc
index c271c4c..dc8ce5c 100644
--- a/src/PackBMatrix.cc
+++ b/src/PackBMatrix.cc
@@ -239,6 +239,54 @@ PackBMatrix<T, accT>::PackBMatrix(
 }
 
 template <typename T, typename accT>
+PackBMatrix<T, accT>::PackBMatrix(matrix_op_t trans,
+                                  int32_t nRow,
+                                  int32_t nCol,
+                                  inpType* prepackedmat,
+                                  int32_t ld,
+                                  int groups,
+                                  const BlockingFactors* params)
+    : PackMatrix<PackBMatrix<T, accT>, T, accT>(nRow, nCol, prepackedmat, groups, params),
+      trans_(trans),
+      smat_(nullptr),
+      ld_(ld) {
+  // Pre-packed variant: prepackedmat is forwarded to the base class, there is no
+  // source matrix (smat_ is null) and no packing pass is run in this constructor.
+  // Throws std::runtime_error when cpuinfo fails to initialize, when neither AVX512
+  // nor AVX2 support is reported, or when groups is not a positive divisor of numRows().
+  if (!cpuinfo_initialize()) {
+    throw std::runtime_error("Failed to initialize cpuinfo!");
+  }
+  const bool hasAvx512 = fbgemmHasAvx512Support();
+  if (!hasAvx512 && !fbgemmHasAvx2Support()) {
+    // An assert here vanishes under NDEBUG and leaves the blocking factors unset.
+    throw std::runtime_error("unsupported architecture");
+  }
+  if (params) {
+    // Caller-supplied blocking factors take precedence over the per-ISA defaults.
+    BaseType::brow_ = params->KCB;
+    BaseType::bcol_ = params->NCB;
+    row_interleave_ = params->ROW_INTERLEAVE;
+  } else if (hasAvx512) {
+    BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx512>::KCB;
+    BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx512>::NCB;
+    row_interleave_ = PackingTraits<T, accT, inst_set_t::avx512>::ROW_INTERLEAVE;
+  } else {
+    BaseType::brow_ = PackingTraits<T, accT, inst_set_t::avx2>::KCB;
+    BaseType::bcol_ = PackingTraits<T, accT, inst_set_t::avx2>::NCB;
+    row_interleave_ = PackingTraits<T, accT, inst_set_t::avx2>::ROW_INTERLEAVE;
+  }
+  // Check groups <= 0 first so the modulo below can never divide by zero.
+  if (groups <= 0 || BaseType::numRows() % groups != 0) {
+    throw std::runtime_error("groups = " + std::to_string(groups)
+                             + " does not divide numRows = " + std::to_string(BaseType::numRows()));
+  }
+  // blocking for one group
+  block_type_t block{0, BaseType::numRows() / BaseType::numGroups(), 0, BaseType::numCols()};
+  BaseType::packedBlock(block);
+}
+
+template <typename T, typename accT>
 void PackBMatrix<T, accT>::pack_unpack_(
     const block_type_t& block,
     T* unpack_buf,
diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc
index 8e36855..a1e6858 100644
--- a/src/QuantUtilsAvx2.cc
+++ b/src/QuantUtilsAvx2.cc
@@ -163,7 +163,8 @@ void FindMinMax(const float* a, float* min, float* max, int len) {
 ////////////////////////////////////////////////////////////////////////////////
 // Requantization (with floats)
 
-#ifdef __AVX2__
+// MSVC defines __AVX2__ only under /arch:AVX2 or higher, yet compiles AVX2 intrinsics regardless.
+#if defined(__AVX2__) || defined(_MSC_VER)
 void RequantizeAvx2(
     const int32_t* src,
     uint8_t* dst,
diff --git a/src/TransposeUtilsAvx2.h b/src/TransposeUtilsAvx2.h
index a405745..9925eb6 100644
--- a/src/TransposeUtilsAvx2.h
+++ b/src/TransposeUtilsAvx2.h
@@ -13,7 +13,8 @@ namespace fbgemm {
 
 namespace internal {
 
-#ifdef __AVX2__
+// MSVC defines __AVX2__ only under /arch:AVX2 or higher, yet compiles AVX2 intrinsics regardless.
+#if defined(__AVX2__) || defined(_MSC_VER)
 // NOTE: Make sure every function defined in here has static linkage because
 // this header file is included by UtilsAvx512.cc compiled with -mavx512f option