github.com/marian-nmt/intgemm/intgemm.git
-rw-r--r--  CMakeLists.txt          2
-rw-r--r--  avx512_gemm.h           4
-rw-r--r--  benchmark.cc            6
-rw-r--r--  cops.h                 17
-rw-r--r--  example.cc              6
-rw-r--r--  interleave.h            8
-rw-r--r--  multiply.h              8
-rw-r--r--  test/multiply_test.cc  36
8 files changed, 45 insertions, 42 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7bfedcf..0004e01 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,7 +19,7 @@ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
# Working around https://bugs.llvm.org/show_bug.cgi?id=41482
# Anything compiled with clang might not work properly in SSE2/SSSE3 world
message("${Orange}Compiling with Clang and using -mavx2 due to https://bugs.llvm.org/show_bug.cgi?id=41482. Support for SSE2/SSSE3 is likely broken.${ColourReset}")
- SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
endif()
diff --git a/avx512_gemm.h b/avx512_gemm.h
index b991cf5..4719fae 100644
--- a/avx512_gemm.h
+++ b/avx512_gemm.h
@@ -233,9 +233,9 @@ struct AVX512_8bit {
// Added for AVX512.
Integer zeros = setzero_si<Integer>();
// Go over 8 columns of B at a time.
- for (int B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) {
+ for (Index B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) {
// Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.
- for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) {
+ for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) {
// Iterate over shared (inner) dimension.
const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width);
const Integer *A_end = A_live + simd_width;
diff --git a/benchmark.cc b/benchmark.cc
index b16a04e..2f50045 100644
--- a/benchmark.cc
+++ b/benchmark.cc
@@ -27,7 +27,7 @@ float MaxAbsoluteBaseline(const float *begin, const float *end) {
void BenchmarkMaxAbsolute() {
const Index size = 4096 * 4096;
AlignedVector<float> v(size);
- for (int i = 0; i < size; ++i) {
+ for (Index i = 0; i < size; ++i) {
v[i] = (float)rand() / (float)RAND_MAX;
}
std::vector<uint64_t> stats;
@@ -49,11 +49,11 @@ struct RandomMatrices {
RandomMatrices(Index A_rows_in, Index width_in, Index B_cols_in) :
A_rows(A_rows_in), width(width_in), B_cols(B_cols_in),
A(A_rows * width), B(width * B_cols) {
- for (int i = 0; i < A_rows * width; i++) {
+ for (Index i = 0; i < A_rows * width; i++) {
A[i] = ((float)rand()/(float)RAND_MAX)*2.0f - 1.0f;
}
- for (int i = 0; i < B_cols * width; i++) {
+ for (Index i = 0; i < B_cols * width; i++) {
B[i] = ((float)rand()/(float)RAND_MAX)*2.0f - 1.0f;
}
}
diff --git a/cops.h b/cops.h
index 60dd206..67796f5 100644
--- a/cops.h
+++ b/cops.h
@@ -141,11 +141,13 @@ class ReLU {
class OnSSE2 {
public:
INTGEMM_SSE2 explicit OnSSE2(const ReLU& from)
- : C_(from.C_), zeros_(setzero_ps<__m128>()), unquant_mult_(set1_ps<__m128>(from.unquant_mult_)) {
+ : C_(from.C_), unquant_mult_(set1_ps<__m128>(from.unquant_mult_)) {
assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m128i) == 0);
}
INTGEMM_SSE2 inline void operator()(Index rowIDX, Index cols, Index colIDX, MultiplyResult128 result) {
+ static const auto zeros_ = setzero_ps<__m128>();
+
auto unquantized0123 = unquantize(result.pack0123, unquant_mult_);
auto nonnegative0123 = max_ps(zeros_, unquantized0123);
storeu_ps(C_ + rowIDX*cols + colIDX, nonnegative0123);
@@ -158,19 +160,20 @@ class ReLU {
private:
float* C_;
__m128 unquant_mult_;
- __m128 zeros_;
};
- using OnSSSE2 = OnSSE2;
+ using OnSSSE3 = OnSSE2;
class OnAVX2 {
public:
INTGEMM_AVX2 explicit OnAVX2(const ReLU& from)
- : C_(from.C_), zeros_(setzero_ps<__m256>()), unquant_mult_(set1_ps<__m256>(from.unquant_mult_)) {
+ : C_(from.C_), unquant_mult_(set1_ps<__m256>(from.unquant_mult_)) {
assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m256i) == 0);
}
INTGEMM_AVX2 inline void operator()(Index rowIDX, Index cols, Index colIDX, __m256i result) {
+ static const auto zeros_ = setzero_ps<__m256>();
+
auto nonnegative = max_ps(zeros_, unquantize(result, unquant_mult_));
storeu_ps(C_ + rowIDX*cols + colIDX, nonnegative);
}
@@ -178,18 +181,19 @@ class ReLU {
private:
float* C_;
__m256 unquant_mult_;
- __m256 zeros_;
};
#ifndef INTGEMM_NO_AVX512
class OnAVX512 {
public:
INTGEMM_AVX512BW explicit OnAVX512(const ReLU& from)
- : C_(from.C_), zeros_(setzero_ps<__m512>()), unquant_mult_(set1_ps<__m512>(from.unquant_mult_)) {
+ : C_(from.C_), unquant_mult_(set1_ps<__m512>(from.unquant_mult_)) {
assert(reinterpret_cast<uintptr_t>(C_) % sizeof(__m512i) == 0);
}
INTGEMM_AVX512BW inline void operator()(Index rowIDX, Index cols, Index colIDX, __m512i result) {
+ static const auto zeros_ = setzero_ps<__m512>();
+
auto nonnegative = max_ps(zeros_, unquantize(result, unquant_mult_));
storeu_ps(C_ + rowIDX*cols + colIDX, nonnegative);
}
@@ -197,7 +201,6 @@ class ReLU {
private:
float* C_;
__m512 unquant_mult_;
- __m512 zeros_;
};
#endif
diff --git a/example.cc b/example.cc
index f4dbef1..e517f29 100644
--- a/example.cc
+++ b/example.cc
@@ -22,16 +22,16 @@ int main() {
// Fill with random values in range [-2, 2].
srand(1);
- for (int i = 0; i < A_rows * width; ++i) {
+ for (Index i = 0; i < A_rows * width; ++i) {
A[i] = ((float)rand()/(float)RAND_MAX)*4.0f - 2.0f;
}
- for (int i = 0; i < width * B_cols; ++i) {
+ for (Index i = 0; i < width * B_cols; ++i) {
B[i] = ((float)rand()/(float)RAND_MAX)*4.0f - 2.0f;
}
// Compute the top left corner of C as a sanity check.
float top_left_reference = 0.0;
- for (int w = 0; w < width; ++w) {
+ for (Index w = 0; w < width; ++w) {
top_left_reference += A[w] * B[w * B_cols];
}
diff --git a/interleave.h b/interleave.h
index 8b7484e..76e8a5a 100644
--- a/interleave.h
+++ b/interleave.h
@@ -221,8 +221,8 @@ target static inline void PrepareB(const float *input, int8_t *output_shadow, fl
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
Register *output = reinterpret_cast<Register*>(output_shadow); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
- for (int c = 0; c < cols; c += kColStride) { \
- for (int r = 0; r < rows; r += sizeof(Register), output += 8) { \
+ for (Index c = 0; c < cols; c += kColStride) { \
+ for (Index r = 0; r < rows; r += sizeof(Register), output += 8) { \
/* Quantize and perform a transpose with height sizeof(Register) and width 8. \
This isn't quite Transpose8InLane because it's half the number of columns, \
so each register starts with two rows instead of being one row. \
@@ -254,8 +254,8 @@ target static inline void PrepareB(const float *input, int16_t *output_shadow, f
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
Register *output = reinterpret_cast<Register*>(output_shadow); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
- for (int c = 0; c < cols; c += 8) { \
- for (int r = 0; r < rows; r += (sizeof(Register) / sizeof(int16_t)), output += 8) { \
+ for (Index c = 0; c < cols; c += 8) { \
+ for (Index r = 0; r < rows; r += (sizeof(Register) / sizeof(int16_t)), output += 8) { \
/* gcc unrolls this loop and uses registers for output[k]*/ \
for (int k = 0; k < 8; ++k) { \
output[k] = q.ForReshape(input + cols * (r + k) + c, cols); \
diff --git a/multiply.h b/multiply.h
index bf908eb..4642616 100644
--- a/multiply.h
+++ b/multiply.h
@@ -135,9 +135,9 @@ INTGEMM_PACK0123(INTGEMM_AVX512BW, __m512i)
const int simd_width = width / (sizeof(Integer) / sizeof(int16_t)); \
typename WriteC::WriteCSubType write_C(C); \
const Integer *B0_col = reinterpret_cast<const Integer *>(B); \
- for (int B0_colidx = 0; B0_colidx < B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \
+ for (Index B0_colidx = 0; B0_colidx < B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \
/* Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \
- for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
+ for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
const Integer *A_row = reinterpret_cast<const Integer*>(A + A_rowidx * width); \
/* These will be packed 32-bit integers containing sums for each row of B multiplied by the row of A. \
Iterate over shared (inner) dimension.*/ \
@@ -340,9 +340,9 @@ template <class WriteC> target static void Multiply(const int8_t *A, const int8_
const Integer *B0_col = reinterpret_cast<const Integer*>(B); \
typename WriteC::WriteCSubType c_writer(C); \
/*Go over 8 columns of B at a time.*/ \
- for (int B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \
+ for (Index B0_colidx = 0; B0_colidx != B_cols; B0_col += 8 * simd_width, B0_colidx += 8) { \
/*Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \
- for (int A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
+ for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
/*Iterate over shared (inner) dimension.*/ \
const Integer *A_live = reinterpret_cast<const Integer *>(A + A_rowidx * width); \
const Integer *A_end = A_live + simd_width; \
diff --git a/test/multiply_test.cc b/test/multiply_test.cc
index 6feb79f..f364ca6 100644
--- a/test/multiply_test.cc
+++ b/test/multiply_test.cc
@@ -34,8 +34,8 @@ template <class V> void SlowRearrangeTile(const V *from, V *to, int simd, int un
}
template <class V> void SlowRearrange(const V *from, V *to, int simd, int unroll, Index rows, Index cols) {
- for (int c = 0; c < cols; c += unroll) {
- for (int r = 0; r < rows; r += simd) {
+ for (Index c = 0; c < cols; c += unroll) {
+ for (Index r = 0; r < rows; r += simd) {
SlowRearrangeTile(from + cols * r + c, to, simd, unroll, cols);
to += unroll * simd;
}
@@ -43,8 +43,8 @@ template <class V> void SlowRearrange(const V *from, V *to, int simd, int unroll
}
template <class V> void SlowTranspose(const V *from, V *to, Index rows, Index cols) {
- for (int r = 0; r < rows; ++r) {
- for (int c = 0; c < cols; ++c) {
+ for (Index r = 0; r < rows; ++r) {
+ for (Index c = 0; c < cols; ++c) {
to[rows * c + r] = from[cols * r + c];
}
}
@@ -89,8 +89,8 @@ INTGEMM_SSSE3 TEST_CASE("Transpose 8", "[transpose]") {
template <class T> std::string PrintMatrix(const T *mem, Index rows, Index cols) {
std::ostringstream out;
- for (int r = 0; r < rows; ++r) {
- for (int c = 0; c < cols; ++c) {
+ for (Index r = 0; r < rows; ++r) {
+ for (Index c = 0; c < cols; ++c) {
out << std::setw(4) << (int64_t) mem[r * cols + c] << ' ';
}
out << '\n';
@@ -104,7 +104,7 @@ template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) {
std::uniform_real_distribution<float> dist(-129.0, 129.0);
// Create array.
AlignedVector<float> input(rows * cols);
- for (int i = 0; i < rows * cols; ++i) {
+ for (Index i = 0; i < rows * cols; ++i) {
input.get()[i] = dist(gen);
}
@@ -158,7 +158,7 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1
// Go somewhat out of range too.
std::uniform_real_distribution<float> dist(-129.0, 129.0);
AlignedVector<float> input(rows * cols);
- for (int i = 0; i < rows * cols; ++i) {
+ for (Index i = 0; i < rows * cols; ++i) {
input.get()[i] = dist(gen);
}
typedef typename Routine::Integer Integer;
@@ -177,7 +177,7 @@ template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 1
// Select columns manually in float space.
AlignedVector<float> selected(rows * kSelectCols);
- for (int r = 0; r < rows; ++r) {
+ for (Index r = 0; r < rows; ++r) {
for (int c = 0; c < kSelectCols; ++c) {
assert(c + r * kSelectCols < rows * kSelectCols);
selected[c + r * kSelectCols] = input[select_cols[c] + r * cols];
@@ -217,7 +217,7 @@ TEST_CASE("SelectColumnsB SSE2", "[select]") {
template <class Register> void TestMax() {
Register r = set1_ps<Register>(-2.0);
- for (int i = 0; i < sizeof(Register) / sizeof(float); ++i) {
+ for (std::size_t i = 0; i < sizeof(Register) / sizeof(float); ++i) {
Register c = r;
reinterpret_cast<float*>(&c)[i] = -1.0;
CHECK_MESSAGE((MaxFloat32(c) == -1.0), "MaxFloat32 produced " << MaxFloat32(c));
@@ -294,10 +294,10 @@ TEST_CASE("MaxAbsolute AVX512F", "[max]") {
// SOFTWARE.
// Compute A*B slowly in floats.
void SlowRefFloat(const float *A, const float *B, float *C, Index A_rows, Index width, Index B_cols, const float *bias=nullptr) {
- for (int r = 0; r < A_rows; ++r) {
- for (int c = 0; c < B_cols; ++c) {
+ for (Index r = 0; r < A_rows; ++r) {
+ for (Index c = 0; c < B_cols; ++c) {
float sum = 0.0f;
- for (int k = 0; k < width; ++k) {
+ for (Index k = 0; k < width; ++k) {
sum += A[r * width + k] * B[k * B_cols + c];
}
if (bias) {
@@ -311,10 +311,10 @@ void SlowRefFloat(const float *A, const float *B, float *C, Index A_rows, Index
// Compute A*B slowly from integers.
template <class Integer> void SlowRefInt(const Integer *A, const Integer *B, float *C, float unquant_mult, Index A_rows, Index width, Index B_cols, const float *bias=nullptr) {
- for (int r = 0; r < A_rows; ++r) {
- for (int c = 0; c < B_cols; ++c) {
+ for (Index r = 0; r < A_rows; ++r) {
+ for (Index c = 0; c < B_cols; ++c) {
int32_t sum = 0;
- for (int k = 0; k < width; ++k) {
+ for (Index k = 0; k < width; ++k) {
sum += static_cast<int16_t>(A[r * width + k]) * static_cast<int16_t>(B[k * B_cols + c]);
}
if (bias) {
@@ -353,10 +353,10 @@ template <class Routine, class WriteC> void TestMultiply(Index A_rows, Index wid
AlignedVector<float> B(width * B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
- for (int i = 0; i < A_rows * width; i++) {
+ for (Index i = 0; i < A_rows * width; i++) {
A.get()[i] = dist(gen);
}
- for (int i = 0; i < width * B_cols; ++i) {
+ for (Index i = 0; i < width * B_cols; ++i) {
B.get()[i] = dist(gen);
}