8 files changed, 45 insertions, 42 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7bfedcf..0004e01 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,7 +19,7 @@ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
   # Working around https://bugs.llvm.org/show_bug.cgi?id=41482
   # Anything compiled with clang might not work properly in SSE2/SSSE3 world
   message("${Orange}Compiling with Clang and using -mavx2 due to https://bugs.llvm.org/show_bug.cgi?id=41482. Support for SSE2/SSSE3 is likely broken.${ColourReset}")
-  SET(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -mavx2")
+  set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -mavx2")
 endif()
 
 
diff --git a/avx512_gemm.h b/avx512_gemm.h
index b991cf5..4719fae 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -233,9 +233,9 @@ struct AVX512_8bit {
   // Added for AVX512.
   Integer zeros = setzero_si<Integer>();
   // Go over 8 columns of B at a time.
-  for (int B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) {
+  for (Index B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) {
     // Process one row of A at a time.  Doesn't seem to be faster to do multiple rows of A at once.
-    for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) {
+    for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) {
       // Iterate over shared (inner) dimension.
       const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width);
       const Integer *A_end = A_live + simd_width;
diff --git a/benchmark.cc b/benchmark.cc
index b16a04e..2f50045 100644
--- a/benchmark.cc
+++ b/benchmark.cc
@@ -27,7 +27,7 @@ float MaxAbsoluteBaseline(const float *begin, const float *end) {
 void BenchmarkMaxAbsolute() {
   const Index size = 4096 * 4096;
   AlignedVector<float> v(size);
-  for (int i = 0; i < size; ++i) {
+  for (Index i = 0; i < size; ++i) {
     v[i] = (float)rand() / (float)RAND_MAX;
   }
   std::vector<uint64_t> stats;
@@ -49,11 +49,11 @@ struct RandomMatrices {
   RandomMatrices(Index A_rows_in, Index width_in, Index B_cols_in) :
     A_rows(A_rows_in), width(width_in), B_cols(B_cols_in),
     A(A_rows * width), B(width * B_cols) {
-    for (int i = 0; i < A_rows * width; i++) {
+    for (Index i = 0; i < A_rows * width; i++) {
         A[i] = ((float)rand()/(float)RAND_MAX)*2.0f - 1.0f;
     }
     
-    for (int i = 0; i < B_cols * width; i++) {
+    for (Index i = 0; i < B_cols * width; i++) {
         B[i] = ((float)rand()/(float)RAND_MAX)*2.0f - 1.0f;
     }
   }
diff --git a/cops.h b/cops.h
index 60dd206..67796f5 100644
--- a/cops.h
+++ b/cops.h
@@ -141,11 +141,13 @@ class ReLU {
     class OnSSE2 {
       public:
         INTGEMM_SSE2 explicit OnSSE2(const ReLU& from)
-          : C_(from.C_), zeros_(setzero_ps<__m128>()), unquant_mult_(set1_ps<__m128>(from.unquant_mult_)) {
+          : C_(from.C_), unquant_mult_(set1_ps<__m128>(from.unquant_mult_)) {
           assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m128i) == 0);
         }
 
         INTGEMM_SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) {
+          static const auto zeros_ = setzero_ps<__m128>();
+
           auto unquantized0123 = unquantize(result.pack0123, unquant_mult_);
           auto nonnegative0123 = max_ps(zeros_, unquantized0123);
           storeu_ps(C_ + rowIDX*cols + colIDX, nonnegative0123);
@@ -158,19 +160,20 @@ class ReLU {
       private:
         float* C_;
         __m128 unquant_mult_;
-        __m128 zeros_;
     };
 
-    using OnSSSE2 = OnSSE2;
+    using OnSSSE3 = OnSSE2;
 
     class OnAVX2 {
       public:
         INTGEMM_AVX2 explicit OnAVX2(const ReLU& from)
-          : C_(from.C_), zeros_(setzero_ps<__m256>()), unquant_mult_(set1_ps<__m256>(from.unquant_mult_)) {
+          : C_(from.C_), unquant_mult_(set1_ps<__m256>(from.unquant_mult_)) {
           assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m256i) == 0);
         }
 
         INTGEMM_AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) {
+          static const auto zeros_ = setzero_ps<__m256>();
+
           auto nonnegative = max_ps(zeros_, unquantize(result, unquant_mult_));
           storeu_ps(C_ + rowIDX*cols + colIDX, nonnegative);
         }
@@ -178,18 +181,19 @@ class ReLU {
       private:
         float* C_;
         __m256 unquant_mult_;
-        __m256 zeros_;
     };
 
 #ifndef INTGEMM_NO_AVX512
     class OnAVX512 {
       public:
         INTGEMM_AVX512BW explicit OnAVX512(const ReLU& from)
-          : C_(from.C_), zeros_(setzero_ps<__m512>()), unquant_mult_(set1_ps<__m512>(from.unquant_mult_)) {
+          : C_(from.C_), unquant_mult_(set1_ps<__m512>(from.unquant_mult_)) {
           assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m512i) == 0);
         }
 
         INTGEMM_AVX512BW inline void operator()(Index rowIDX, Index cols, Index colIDX, __m512i result) {
+          static const auto zeros_ = setzero_ps<__m512>();
+
           auto nonnegative = max_ps(zeros_, unquantize(result, unquant_mult_));
           storeu_ps(C_ + rowIDX*cols + colIDX, nonnegative);
         }
@@ -197,7 +201,6 @@ class ReLU {
       private:
         float* C_;
         __m512 unquant_mult_;
-        __m512 zeros_;
     };
 #endif
 
diff --git a/example.cc b/example.cc
index f4dbef1..e517f29 100644
--- a/example.cc
+++ b/example.cc
@@ -22,16 +22,16 @@ int main() {
 
   // Fill with random values in range [-2, 2].
   srand(1);
-  for (int i = 0; i < A_rows * width; ++i) {
+  for (Index i = 0; i < A_rows * width; ++i) {
     A[i] = ((float)rand()/(float)RAND_MAX)*4.0f - 2.0f;
   }
-  for (int i = 0; i < width * B_cols; ++i) {
+  for (Index i = 0; i < width * B_cols; ++i) {
     B[i] = ((float)rand()/(float)RAND_MAX)*4.0f - 2.0f;
   }
 
   // Compute the top left corner of C as a sanity check.
   float top_left_reference = 0.0;
-  for (int w = 0; w < width; ++w) {
+  for (Index w = 0; w < width; ++w) {
     top_left_reference += A[w] * B[w * B_cols];
   }
 
diff --git a/interleave.h b/interleave.h
index 8b7484e..76e8a5a 100644
--- a/interleave.h
+++ b/interleave.h
@@ -221,8 +221,8 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl
   assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
   Register *output = reinterpret_cast<Register*>(output_shadow); \
   assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
-  for (int c = 0; c < cols; c += kColStride) { \
-    for (int r = 0; r < rows; r += sizeof(Register), output += 8) { \
+  for (Index c = 0; c < cols; c += kColStride) { \
+    for (Index r = 0; r < rows; r += sizeof(Register), output += 8) { \
       /* Quantize and perform a transpose with height sizeof(Register) and width 8. \
          This isn't quite Transpose8InLane because it's half the number of columns, \
          so each register starts with two rows instead of being one row. \
@@ -254,8 +254,8 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f
   assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
   Register *output = reinterpret_cast<Register*>(output_shadow); \
   assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
-  for (int c = 0; c < cols; c += 8) { \
-    for (int r = 0; r < rows; r += (sizeof(Register) / sizeof(int16_t)), output += 8) { \
+  for (Index c = 0; c < cols; c += 8) { \
+    for (Index r = 0; r < rows; r += (sizeof(Register) / sizeof(int16_t)), output += 8) { \
       /* gcc unrolls this loop and uses registers for output[k]*/ \
       for (int k = 0; k < 8; ++k) { \
         output[k] = q.ForReshape(input + cols * (r + k) + c, cols); \
diff --git a/multiply.h b/multiply.h
index bf908eb..4642616 100644
--- a/multiply.h
+++ b/multiply.h
@@ -135,9 +135,9 @@ INTGEMM_PACK0123(INTGEMM_AVX512BW, __m512i)
   const int simd_width = width / (sizeof(Integer) / sizeof(int16_t)); \
   typename WriteC::WriteCSubType write_C(C); \
   const Integer *B0_col = reinterpret_cast<const Integer *>(B); \
-  for (int B0_colidx = 0; B0_colidx < B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \
+  for (Index B0_colidx = 0; B0_colidx < B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \
     /* Process one row of A at a time.  Doesn't seem to be faster to do multiple rows of A at once.*/ \
-    for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
+    for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
       const Integer *A_row = reinterpret_cast<const Integer*>(A + A_rowidx * width); \
       /* These will be packed 32-bit integers containing sums for each row of B multiplied by the row of A. \
          Iterate over shared (inner) dimension.*/ \
@@ -340,9 +340,9 @@ template <class WriteC> target static void Multiply(const int8_t *A, const int8_
   const Integer *B0_col = reinterpret_cast<const Integer*>(B); \
   typename WriteC::WriteCSubType c_writer(C); \
   /*Go over 8 columns of B at a time.*/ \
-  for (int B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \
+  for (Index B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \
     /*Process one row of A at a time.  Doesn't seem to be faster to do multiple rows of A at once.*/ \
-    for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
+    for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
       /*Iterate over shared (inner) dimension.*/ \
       const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width); \
       const Integer *A_end = A_live + simd_width; \
diff --git a/test/multiply_test.cc b/test/multiply_test.cc
index 6feb79f..f364ca6 100644
--- a/test/multiply_test.cc
+++ b/test/multiply_test.cc
@@ -34,8 +34,8 @@ template <class V> void SlowRearrangeTile(const V *from, V *to, int simd, int un
 }
 
 template <class V> void SlowRearrange(const V *from, V *to, int simd, int unroll, Index rows, Index cols) {
-  for (int c = 0; c < cols; c += unroll) {
-    for (int r = 0; r < rows; r += simd) {
+  for (Index c = 0; c < cols; c += unroll) {
+    for (Index r = 0; r < rows; r += simd) {
       SlowRearrangeTile(from + cols * r + c, to, simd, unroll, cols);
       to += unroll * simd;
     }
@@ -43,8 +43,8 @@ template <class V> void SlowRearrange(const V *from, V *to, int simd, int unroll
 }
 
 template <class V> void SlowTranspose(const V *from, V *to, Index rows, Index cols) {
-  for (int r = 0; r < rows; ++r) {
-    for (int c = 0; c < cols; ++c) {
+  for (Index r = 0; r < rows; ++r) {
+    for (Index c = 0; c < cols; ++c) {
       to[rows * c + r] = from[cols * r + c];
     }
   }
@@ -89,8 +89,8 @@ INTGEMM_SSSE3 TEST_CASE("Transpose 8", "[transpose]") {
 
 template <class T> std::string PrintMatrix(const T *mem, Index rows, Index cols) {
   std::ostringstream out;
-  for (int r = 0; r < rows; ++r) {
-    for (int c = 0; c < cols; ++c) {
+  for (Index r = 0; r < rows; ++r) {
+    for (Index c = 0; c < cols; ++c) {
       out << std::setw(4) << (int64_t) mem[r * cols + c] << ' ';
     }
     out << '\n';
@@ -104,7 +104,7 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) {
   std::uniform_real_distribution<float> dist(-129.0, 129.0);
   // Create array.
   AlignedVector<float> input(rows * cols);
-  for (int i = 0; i < rows * cols; ++i) {
+  for (Index i = 0; i < rows * cols; ++i) {
     input.get()[i] = dist(gen);
   }
 
@@ -158,7 +158,7 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1
   // Go somewhat out of range too.
   std::uniform_real_distribution<float> dist(-129.0, 129.0);
   AlignedVector<float> input(rows * cols);
-  for (int i = 0; i < rows * cols; ++i) {
+  for (Index i = 0; i < rows * cols; ++i) {
     input.get()[i] = dist(gen);
   }
   typedef typename Routine::Integer Integer;
@@ -177,7 +177,7 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1
 
   // Select columns manually in float space.
   AlignedVector<float> selected(rows * kSelectCols);
-  for (int r = 0; r < rows; ++r) {
+  for (Index r = 0; r < rows; ++r) {
     for (int c = 0; c < kSelectCols; ++c) {
       assert(c + r * kSelectCols < rows * kSelectCols);
       selected[c + r * kSelectCols] = input[select_cols[c] + r * cols];
@@ -217,7 +217,7 @@ TEST_CASE("SelectColumnsB SSE2", "[select]") {
 
 template <class Register> void TestMax() {
   Register r = set1_ps<Register>(-2.0);
-  for (int i = 0; i < sizeof(Register) / sizeof(float); ++i) {
+  for (std::size_t i = 0; i < sizeof(Register) / sizeof(float); ++i) {
     Register c = r;
     reinterpret_cast<float*>(&c)[i] = -1.0;
     CHECK_MESSAGE((MaxFloat32(c) == -1.0), "MaxFloat32 produced " << MaxFloat32(c));
@@ -294,10 +294,10 @@ TEST_CASE("MaxAbsolute AVX512F", "[max]") {
 // SOFTWARE.
 // Compute A*B slowly in floats.
 void SlowRefFloat(const float *A, const float *B, float *C, Index A_rows, Index width, Index B_cols, const float *bias=nullptr) {
-  for (int r = 0; r < A_rows; ++r) {
-    for (int c = 0; c < B_cols; ++c) {
+  for (Index r = 0; r < A_rows; ++r) {
+    for (Index c = 0; c < B_cols; ++c) {
       float sum = 0.0f;
-      for (int k = 0; k < width; ++k) {
+      for (Index k = 0; k < width; ++k) {
         sum += A[r * width + k] * B[k * B_cols + c];
       }
       if (bias) {
@@ -311,10 +311,10 @@ void SlowRefFloat(const float *A, const float *B, float *C, Index A_rows, Index
 
 // Compute A*B slowly from integers.
 template <class Integer> void SlowRefInt(const Integer *A, const Integer *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols, const float *bias=nullptr) {
-  for (int r = 0; r < A_rows; ++r) {
-    for (int c = 0; c < B_cols; ++c) {
+  for (Index r = 0; r < A_rows; ++r) {
+    for (Index c = 0; c < B_cols; ++c) {
       int32_t sum = 0;
-      for (int k = 0; k < width; ++k) {
+      for (Index k = 0; k < width; ++k) {
         sum += static_cast<int16_t>(A[r * width + k]) * static_cast<int16_t>(B[k * B_cols + c]);
       }
       if (bias) {
@@ -353,10 +353,10 @@ template <class Routine, class WriteC> void TestMultiply(Index A_rows, Index wid
   AlignedVector<float> B(width * B_cols);
   std::mt19937 gen;
   std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
-  for (int i = 0; i < A_rows * width; i++) {
+  for (Index i = 0; i < A_rows * width; i++) {
     A.get()[i] = dist(gen);
   }
-  for (int i = 0; i < width * B_cols; ++i) {
+  for (Index i = 0; i < width * B_cols; ++i) {
     B.get()[i] = dist(gen);
   }