diff options
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | avx512_gemm.h | 4 | ||||
-rw-r--r-- | benchmark.cc | 6 | ||||
-rw-r--r-- | cops.h | 17 | ||||
-rw-r--r-- | example.cc | 6 | ||||
-rw-r--r-- | interleave.h | 8 | ||||
-rw-r--r-- | multiply.h | 8 | ||||
-rw-r--r-- | test/multiply_test.cc | 36 |
8 files changed, 45 insertions, 42 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 7bfedcf..0004e01 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,7 @@ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") # Working around https://bugs.llvm.org/show_bug.cgi?id=41482 # Anything compiled with clang might not work properly in SSE2/SSSE3 world message("${Orange}Compiling with Clang and using -mavx2 due to https://bugs.llvm.org/show_bug.cgi?id=41482. Support for SSE2/SSSE3 is likely broken.${ColourReset}") - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") endif() diff --git a/avx512_gemm.h b/avx512_gemm.h index b991cf5..4719fae 100644 --- a/avx512_gemm.h +++ b/avx512_gemm.h @@ -233,9 +233,9 @@ struct AVX512_8bit { // Added for AVX512. Integer zeros = setzero_si<Integer>(); // Go over 8 columns of B at a time. - for (int B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { + for (Index B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { // Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once. - for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { + for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { // Iterate over shared (inner) dimension. const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width); const Integer *A_end = A_live + simd_width; diff --git a/benchmark.cc b/benchmark.cc index b16a04e..2f50045 100644 --- a/benchmark.cc +++ b/benchmark.cc @@ -27,7 +27,7 @@ float MaxAbsoluteBaseline(const float *begin, const float *end) { void BenchmarkMaxAbsolute() { const Index size = 4096 * 4096; AlignedVector<float> v(size); - for (int i = 0; i < size; ++i) { + for (Index i = 0; i < size; ++i) { v[i] = (float)rand() / (float)RAND_MAX; } std::vector<uint64_t> stats; @@ -49,11 +49,11 @@ struct RandomMatrices { RandomMatrices(Index A_rows_in, Index width_in, Index B_cols_in) : A_rows(A_rows_in), width(width_in), B_cols(B_cols_in), A(A_rows * width), B(width * B_cols) { - for (int i = 0; i < A_rows * width; i++) { + for (Index i = 0; i < A_rows * width; i++) { A[i] = ((float)rand()/(float)RAND_MAX)*2.0f - 1.0f; } - for (int i = 0; i < B_cols * width; i++) { + for (Index i = 0; i < B_cols * width; i++) { B[i] = ((float)rand()/(float)RAND_MAX)*2.0f - 1.0f; } } @@ -141,11 +141,13 @@ class ReLU { class OnSSE2 { public: INTGEMM_SSE2 explicit OnSSE2(const ReLU& from) - : C_(from.C_), zeros_(setzero_ps<__m128>()), unquant_mult_(set1_ps<__m128>(from.unquant_mult_)) { + : C_(from.C_), unquant_mult_(set1_ps<__m128>(from.unquant_mult_)) { assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m128i) == 0); } INTGEMM_SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) { + static const auto zeros_ = setzero_ps<__m128>(); + auto unquantized0123 = unquantize(result.pack0123, unquant_mult_); auto nonnegative0123 = max_ps(zeros_, unquantized0123); storeu_ps(C_ + rowIDX*cols + colIDX, nonnegative0123); @@ -158,19 +160,20 @@ class ReLU { private: float* C_; __m128 unquant_mult_; - __m128 zeros_; }; - using OnSSSE2 = OnSSE2; + using OnSSSE3 = OnSSE2; class OnAVX2 { public: INTGEMM_AVX2 explicit OnAVX2(const ReLU& from) - : C_(from.C_), zeros_(setzero_ps<__m256>()), unquant_mult_(set1_ps<__m256>(from.unquant_mult_)) { + : C_(from.C_), unquant_mult_(set1_ps<__m256>(from.unquant_mult_)) { assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m256i) == 0); } INTGEMM_AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) { + static const auto zeros_ = setzero_ps<__m256>(); + auto nonnegative = max_ps(zeros_, unquantize(result, unquant_mult_)); storeu_ps(C_ + rowIDX*cols + colIDX, nonnegative); } @@ -178,18 +181,19 @@ class ReLU { private: float* C_; __m256 unquant_mult_; - __m256 zeros_; }; #ifndef INTGEMM_NO_AVX512 class OnAVX512 { public: INTGEMM_AVX512BW explicit OnAVX512(const ReLU& from) - : C_(from.C_), zeros_(setzero_ps<__m512>()), unquant_mult_(set1_ps<__m512>(from.unquant_mult_)) { + : C_(from.C_), unquant_mult_(set1_ps<__m512>(from.unquant_mult_)) { assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m512i) == 0); } INTGEMM_AVX512BW inline void operator()(Index rowIDX, Index cols, Index colIDX, __m512i result) { + static const auto zeros_ = setzero_ps<__m512>(); + auto nonnegative = max_ps(zeros_, unquantize(result, unquant_mult_)); storeu_ps(C_ + rowIDX*cols + colIDX, nonnegative); } @@ -197,7 +201,6 @@ class ReLU { private: float* C_; __m512 unquant_mult_; - __m512 zeros_; }; #endif @@ -22,16 +22,16 @@ int main() { // Fill with random values in range [-2, 2]. srand(1); - for (int i = 0; i < A_rows * width; ++i) { + for (Index i = 0; i < A_rows * width; ++i) { A[i] = ((float)rand()/(float)RAND_MAX)*4.0f - 2.0f; } - for (int i = 0; i < width * B_cols; ++i) { + for (Index i = 0; i < width * B_cols; ++i) { B[i] = ((float)rand()/(float)RAND_MAX)*4.0f - 2.0f; } // Compute the top left corner of C as a sanity check. float top_left_reference = 0.0; - for (int w = 0; w < width; ++w) { + for (Index w = 0; w < width; ++w) { top_left_reference += A[w] * B[w * B_cols]; } diff --git a/interleave.h b/interleave.h index 8b7484e..76e8a5a 100644 --- a/interleave.h +++ b/interleave.h @@ -221,8 +221,8 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \ Register *output = reinterpret_cast<Register*>(output_shadow); \ assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \ - for (int c = 0; c < cols; c += kColStride) { \ - for (int r = 0; r < rows; r += sizeof(Register), output += 8) { \ + for (Index c = 0; c < cols; c += kColStride) { \ + for (Index r = 0; r < rows; r += sizeof(Register), output += 8) { \ /* Quantize and perform a transpose with height sizeof(Register) and width 8. \ This isn't quite Transpose8InLane because it's half the number of columns, \ so each register starts with two rows instead of being one row. \ @@ -254,8 +254,8 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \ Register *output = reinterpret_cast<Register*>(output_shadow); \ assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \ - for (int c = 0; c < cols; c += 8) { \ - for (int r = 0; r < rows; r += (sizeof(Register) / sizeof(int16_t)), output += 8) { \ + for (Index c = 0; c < cols; c += 8) { \ + for (Index r = 0; r < rows; r += (sizeof(Register) / sizeof(int16_t)), output += 8) { \ /* gcc unrolls this loop and uses registers for output[k]*/ \ for (int k = 0; k < 8; ++k) { \ output[k] = q.ForReshape(input + cols * (r + k) + c, cols); \ @@ -135,9 +135,9 @@ INTGEMM_PACK0123(INTGEMM_AVX512BW, __m512i) const int simd_width = width / (sizeof(Integer) / sizeof(int16_t)); \ typename WriteC::WriteCSubType write_C(C); \ const Integer *B0_col = reinterpret_cast<const Integer *>(B); \ - for (int B0_colidx = 0; B0_colidx < B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \ + for (Index B0_colidx = 0; B0_colidx < B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \ /* Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \ - for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \ + for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \ const Integer *A_row = reinterpret_cast<const Integer*>(A + A_rowidx * width); \ /* These will be packed 32-bit integers containing sums for each row of B multiplied by the row of A. \ Iterate over shared (inner) dimension.*/ \ @@ -340,9 +340,9 @@ template <class WriteC> target static void Multiply(const int8_t *A, const int8_ const Integer *B0_col = reinterpret_cast<const Integer*>(B); \ typename WriteC::WriteCSubType c_writer(C); \ /*Go over 8 columns of B at a time.*/ \ - for (int B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \ + for (Index B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \ /*Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \ - for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \ + for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \ /*Iterate over shared (inner) dimension.*/ \ const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width); \ const Integer *A_end = A_live + simd_width; \ diff --git a/test/multiply_test.cc b/test/multiply_test.cc index 6feb79f..f364ca6 100644 --- a/test/multiply_test.cc +++ b/test/multiply_test.cc @@ -34,8 +34,8 @@ template <class V> void SlowRearrangeTile(const V *from, V *to, int simd, int un } template <class V> void SlowRearrange(const V *from, V *to, int simd, int unroll, Index rows, Index cols) { - for (int c = 0; c < cols; c += unroll) { - for (int r = 0; r < rows; r += simd) { + for (Index c = 0; c < cols; c += unroll) { + for (Index r = 0; r < rows; r += simd) { SlowRearrangeTile(from + cols * r + c, to, simd, unroll, cols); to += unroll * simd; } @@ -43,8 +43,8 @@ template <class V> void SlowRearrange(const V *from, V *to, int simd, int unroll } template <class V> void SlowTranspose(const V *from, V *to, Index rows, Index cols) { - for (int r = 0; r < rows; ++r) { - for (int c = 0; c < cols; ++c) { + for (Index r = 0; r < rows; ++r) { + for (Index c = 0; c < cols; ++c) { to[rows * c + r] = from[cols * r + c]; } } @@ -89,8 +89,8 @@ INTGEMM_SSSE3 TEST_CASE("Transpose 8", "[transpose]") { template <class T> std::string PrintMatrix(const T *mem, Index rows, Index cols) { std::ostringstream out; - for (int r = 0; r < rows; ++r) { - for (int c = 0; c < cols; ++c) { + for (Index r = 0; r < rows; ++r) { + for (Index c = 0; c < cols; ++c) { out << std::setw(4) << (int64_t) mem[r * cols + c] << ' '; } out << '\n'; @@ -104,7 +104,7 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) { std::uniform_real_distribution<float> dist(-129.0, 129.0); // Create array. AlignedVector<float> input(rows * cols); - for (int i = 0; i < rows * cols; ++i) { + for (Index i = 0; i < rows * cols; ++i) { input.get()[i] = dist(gen); } @@ -158,7 +158,7 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1 // Go somewhat out of range too. std::uniform_real_distribution<float> dist(-129.0, 129.0); AlignedVector<float> input(rows * cols); - for (int i = 0; i < rows * cols; ++i) { + for (Index i = 0; i < rows * cols; ++i) { input.get()[i] = dist(gen); } typedef typename Routine::Integer Integer; @@ -177,7 +177,7 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1 // Select columns manually in float space. AlignedVector<float> selected(rows * kSelectCols); - for (int r = 0; r < rows; ++r) { + for (Index r = 0; r < rows; ++r) { for (int c = 0; c < kSelectCols; ++c) { assert(c + r * kSelectCols < rows * kSelectCols); selected[c + r * kSelectCols] = input[select_cols[c] + r * cols]; @@ -217,7 +217,7 @@ TEST_CASE("SelectColumnsB SSE2", "[select]") { template <class Register> void TestMax() { Register r = set1_ps<Register>(-2.0); - for (int i = 0; i < sizeof(Register) / sizeof(float); ++i) { + for (std::size_t i = 0; i < sizeof(Register) / sizeof(float); ++i) { Register c = r; reinterpret_cast<float*>(&c)[i] = -1.0; CHECK_MESSAGE((MaxFloat32(c) == -1.0), "MaxFloat32 produced " << MaxFloat32(c)); @@ -294,10 +294,10 @@ TEST_CASE("MaxAbsolute AVX512F", "[max]") { // SOFTWARE. // Compute A*B slowly in floats. void SlowRefFloat(const float *A, const float *B, float *C, Index A_rows, Index width, Index B_cols, const float *bias=nullptr) { - for (int r = 0; r < A_rows; ++r) { - for (int c = 0; c < B_cols; ++c) { + for (Index r = 0; r < A_rows; ++r) { + for (Index c = 0; c < B_cols; ++c) { float sum = 0.0f; - for (int k = 0; k < width; ++k) { + for (Index k = 0; k < width; ++k) { sum += A[r * width + k] * B[k * B_cols + c]; } if (bias) { @@ -311,10 +311,10 @@ void SlowRefFloat(const float *A, const float *B, float *C, Index A_rows, Index // Compute A*B slowly from integers. template <class Integer> void SlowRefInt(const Integer *A, const Integer *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols, const float *bias=nullptr) { - for (int r = 0; r < A_rows; ++r) { - for (int c = 0; c < B_cols; ++c) { + for (Index r = 0; r < A_rows; ++r) { + for (Index c = 0; c < B_cols; ++c) { int32_t sum = 0; - for (int k = 0; k < width; ++k) { + for (Index k = 0; k < width; ++k) { sum += static_cast<int16_t>(A[r * width + k]) * static_cast<int16_t>(B[k * B_cols + c]); } if (bias) { @@ -353,10 +353,10 @@ template <class Routine, class WriteC> void TestMultiply(Index A_rows, Index wid AlignedVector<float> B(width * B_cols); std::mt19937 gen; std::uniform_real_distribution<float> dist(-1.0f, 1.0f); - for (int i = 0; i < A_rows * width; i++) { + for (Index i = 0; i < A_rows * width; i++) { A.get()[i] = dist(gen); } - for (int i = 0; i < width * B_cols; ++i) { + for (Index i = 0; i < width * B_cols; ++i) { B.get()[i] = dist(gen); } |