1 files changed, 38 insertions, 42 deletions
diff --git a/src/FbgemmI8Depthwise.cc b/src/FbgemmI8Depthwise.cc
index 7bca6c8..d8fe3a8 100644
--- a/src/FbgemmI8Depthwise.cc
+++ b/src/FbgemmI8Depthwise.cc
@@ -6,21 +6,17 @@
  */
 #include "FbgemmI8Depthwise.h"
 
-#include <algorithm>
-#include <array>
 #include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <tuple>
-#include <vector>
+#include <cmath> // for lrintf and sqrt
+#include <tuple> // for tie
 
-#include <x86intrin.h>
+#include <immintrin.h>
 
 using namespace std;
 
 namespace fbgemm {
 
-static array<array<int, 8>, 8> masks = {{
+static int masks[8][8] = {
   // NOTE: clang-format wants to use a different formatting but the current
   // formatting should be easier to read.
   {  0,  0,  0,  0,  0,  0,  0,  0,  },
@@ -31,7 +27,7 @@ static array<array<int, 8>, 8> masks = {{
   { -1, -1, -1, -1, -1,  0,  0,  0,  },
   { -1, -1, -1, -1, -1, -1,  0,  0,  },
   { -1, -1, -1, -1, -1, -1, -1,  0,  },
-}};
+};
 
 template <int KERNEL_PROD>
 PackedDepthWiseConvMatrix<KERNEL_PROD>::PackedDepthWiseConvMatrix(
@@ -39,7 +35,7 @@ PackedDepthWiseConvMatrix<KERNEL_PROD>::PackedDepthWiseConvMatrix(
     const int8_t* smat)
     : K_(K) {
   // Transpose the input matrix to make packing faster.
-  vector<int8_t> smat_transposed(K * KERNEL_PROD);
+  alignas(64) int8_t smat_transposed[K * KERNEL_PROD];
   for (int i = 0; i < KERNEL_PROD; ++i) {
     for (int j = 0; j < K; ++j) {
       smat_transposed[i * K + j] = smat[i + j * KERNEL_PROD];
@@ -108,25 +104,25 @@ PackedDepthWiseConvMatrix<KERNEL_PROD>::PackedDepthWiseConvMatrix(
   // (12, 8), (12, 9), (12, 10), zero, ..., (15, 8), (15, 9), (15, 10), zero
   // (28, 8), (28, 9), (28, 10), zero, ..., (31, 8), (31, 9), (31, 10), zero
   for (int k1 = 0; k1 < K; k1 += 32) {
-    array<__m256i, KERNEL_PROD> b_v;
+    __m256i b_v[KERNEL_PROD];
     int remainder = K - k1;
     if (remainder < 32) {
       __m256i mask_v = _mm256_loadu_si256(
-          reinterpret_cast<const __m256i*>(masks[remainder / 4].data()));
+          reinterpret_cast<const __m256i*>(masks[remainder / 4]));
       for (int i = 0; i < KERNEL_PROD; ++i) {
         b_v[i] = _mm256_maskload_epi32(
-            reinterpret_cast<const int*>(smat_transposed.data() + i * K + k1),
+            reinterpret_cast<const int*>(smat_transposed + i * K + k1),
             mask_v);
       }
     } else {
       for (int i = 0; i < KERNEL_PROD; ++i) {
         b_v[i] = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(
-            smat_transposed.data() + i * K + k1));
+            smat_transposed + i * K + k1));
       }
     }
 
     // Interleave 2 SIMD registers
-    array<__m256i, KERNEL_PROD_ALIGNED> b_interleaved_epi16;
+    __m256i b_interleaved_epi16[KERNEL_PROD_ALIGNED];
     __m256i zero_v = _mm256_setzero_si256();
     for (int i = 0; i < KERNEL_PROD_ALIGNED / 2; ++i) {
       if (2 * i + 1 >= KERNEL_PROD) {
@@ -142,7 +138,7 @@ PackedDepthWiseConvMatrix<KERNEL_PROD>::PackedDepthWiseConvMatrix(
     }
 
     // Interleave 4 SIMD registers
-    array<__m256i, KERNEL_PROD_ALIGNED> b_interleaved_epi32;
+    __m256i b_interleaved_epi32[KERNEL_PROD_ALIGNED];
     for (int i = 0; i < KERNEL_PROD_ALIGNED / 4; ++i) {
       b_interleaved_epi32[4 * i] = _mm256_unpacklo_epi16(
           b_interleaved_epi16[4 * i], b_interleaved_epi16[4 * i + 2]);
@@ -384,8 +380,8 @@ static inline __attribute__((always_inline)) void inner_prod_packed_(
     int32_t* C,
     int remainder,
     __m256i* a_sum = nullptr) {
-  array<__m256i, 4> c, c_temp;
-  array<__m256i, 2> a_sum_temp{};
+  __m256i c[4], c_temp[4];
+  __m256i a_sum_temp[2] = {0, 0};
 
   int k = 0;
   if (K >= 4) {
@@ -399,7 +395,7 @@ static inline __attribute__((always_inline)) void inner_prod_packed_(
         &c[1],
         &c[2],
         &c[3],
-        a_sum_temp.data());
+        a_sum_temp);
 
     for (k = 4; k < K / 4 * 4; k += 4) {
       madd_epi16x4_packed<SUM_A>(
@@ -412,7 +408,7 @@ static inline __attribute__((always_inline)) void inner_prod_packed_(
           &c_temp[1],
           &c_temp[2],
           &c_temp[3],
-          a_sum_temp.data());
+          a_sum_temp);
 
       c[0] = _mm256_add_epi32(c[0], c_temp[0]);
       c[1] = _mm256_add_epi32(c[1], c_temp[1]);
@@ -436,7 +432,7 @@ static inline __attribute__((always_inline)) void inner_prod_packed_(
         &c_temp[1],
         &c_temp[2],
         &c_temp[3],
-        a_sum_temp.data());
+        a_sum_temp);
 
     c[0] = _mm256_add_epi32(c[0], c_temp[0]);
     c[1] = _mm256_add_epi32(c[1], c_temp[1]);
@@ -457,7 +453,7 @@ static inline __attribute__((always_inline)) void inner_prod_packed_(
   } else {
     if (K - k == 1) {
       madd_epi16_packed<SUM_A>(
-          a_v[k], Bp + k, &c[0], &c[1], &c[2], &c[3], a_sum_temp.data());
+          a_v[k], Bp + k, &c[0], &c[1], &c[2], &c[3], a_sum_temp);
     } else if (K - k == 2) {
       madd_epi16x2_packed<SUM_A>(
           a_v[k],
@@ -467,7 +463,7 @@ static inline __attribute__((always_inline)) void inner_prod_packed_(
           &c[1],
           &c[2],
           &c[3],
-          a_sum_temp.data());
+          a_sum_temp);
     }
 
     c[0] = _mm256_add_epi32(c[0], c_temp[0]);
@@ -552,8 +548,8 @@ static inline __attribute__((always_inline)) void requantize_(
     multiplier_v = _mm256_set1_ps(*C_multiplier);
   }
 
-  __m256i min_v = _mm256_set1_epi8(numeric_limits<uint8_t>::min());
-  __m256i max_v = _mm256_set1_epi8(numeric_limits<uint8_t>::max());
+  __m256i min_v = _mm256_set1_epi8(static_cast<uint8_t>(0));
+  __m256i max_v = _mm256_set1_epi8(static_cast<uint8_t>(255));
 
   __m256i A_zero_point_v = _mm256_set1_epi32(A_zero_point);
   __m256i C_zero_point_epi16_v = _mm256_set1_epi16(C_zero_point);
@@ -790,7 +786,7 @@ static inline __attribute__((always_inline)) void inner_prod_3x3_packed_(
   __m256i mask_v = _mm256_setzero_si256();
   if (REMAINDER) {
     mask_v = _mm256_loadu_si256(
-        reinterpret_cast<const __m256i*>(masks[remainder / 4].data()));
+        reinterpret_cast<const __m256i*>(masks[remainder / 4]));
   }
 
   // The code below can be written as a simple R*S loop but the compiler
@@ -813,7 +809,7 @@ static inline __attribute__((always_inline)) void inner_prod_3x3_packed_(
   //     }
   //   }
   // }
-  array<__m256i, 9> a_v = {
+  __m256i a_v[9] = {
       A_zero_point_v,
       A_zero_point_v,
       A_zero_point_v,
@@ -861,13 +857,13 @@ static inline __attribute__((always_inline)) void inner_prod_3x3_packed_(
     }
   }
 
-  array<__m256i, 4> a_sum;
+  __m256i a_sum[4];
   inner_prod_3x3_packed_<SUM_A, REMAINDER>(
-      a_v.data(),
+      a_v,
       reinterpret_cast<const __m256i*>(Bp),
       C,
       remainder,
-      a_sum.data());
+      a_sum);
   if (SUM_A) {
     __m256i B_zero_point_v;
     for (int i = 0; i < (REMAINDER ? (remainder / 8) : 4); ++i) {
@@ -907,7 +903,7 @@ static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_(
   __m256i mask_v = _mm256_setzero_si256();
   if (REMAINDER) {
     mask_v = _mm256_loadu_si256(
-        reinterpret_cast<const __m256i*>(masks[remainder / 4].data()));
+        reinterpret_cast<const __m256i*>(masks[remainder / 4]));
   }
 
   // The code below can be written as a simple R*S loop but the compiler
@@ -930,7 +926,7 @@ static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_(
   //     }
   //   }
   // }
-  array<__m256i, 8> a_v;
+  __m256i a_v[8];
   a_v[0] = A_zero_point_v;
   a_v[1] = A_zero_point_v;
   a_v[2] = A_zero_point_v;
@@ -975,13 +971,13 @@ static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_(
     }
   }
 
-  array<__m256i, 4> a_sum;
+  __m256i a_sum[4];
   inner_prod_packed_<8, SUM_A, REMAINDER>(
-      a_v.data(),
+      a_v,
       reinterpret_cast<const __m256i*>(Bp),
       C,
       remainder,
-      a_sum.data());
+      a_sum);
 
   a_v[0] = A_zero_point_v;
   a_v[1] = A_zero_point_v;
@@ -1032,13 +1028,13 @@ static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_(
     }
   }
 
-  array<__m256i, 4> a_sum_temp;
+  __m256i a_sum_temp[4];
   inner_prod_packed_<8, SUM_A, REMAINDER, true /* acc */>(
-      a_v.data(),
+      a_v,
       reinterpret_cast<const __m256i*>(Bp) + 8,
       C,
       remainder,
-      a_sum_temp.data());
+      a_sum_temp);
   if (SUM_A) {
     a_sum[0] = _mm256_add_epi32(a_sum[0], a_sum_temp[0]);
     a_sum[1] = _mm256_add_epi32(a_sum[1], a_sum_temp[1]);
@@ -1093,11 +1089,11 @@ static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_(
   }
 
   inner_prod_packed_<8, SUM_A, REMAINDER, true /* acc */>(
-      a_v.data(),
+      a_v,
       reinterpret_cast<const __m256i*>(Bp) + 16,
       C,
       remainder,
-      a_sum_temp.data());
+      a_sum_temp);
   if (SUM_A) {
     a_sum[0] = _mm256_add_epi32(a_sum[0], a_sum_temp[0]);
     a_sum[1] = _mm256_add_epi32(a_sum[1], a_sum_temp[1]);
@@ -1124,11 +1120,11 @@ static inline __attribute__((always_inline)) void inner_prod_3x3x3_packed_(
   }
 
   inner_prod_packed_<3, SUM_A, REMAINDER, true /* acc */>(
-      a_v.data(),
+      a_v,
       reinterpret_cast<const __m256i*>(Bp) + 24,
       C,
       remainder,
-      a_sum_temp.data());
+      a_sum_temp);
 
   if (SUM_A) {
     a_sum[0] = _mm256_add_epi32(a_sum[0], a_sum_temp[0]);