
github.com/marian-nmt/intgemm.git
author    kpu <github@kheafield.com>  2018-06-23 16:10:22 +0300
committer kpu <github@kheafield.com>  2018-06-23 16:10:22 +0300
commit    a50bc12aa8516e17a1258f58d75d1d0db942631f (patch)
tree      e9370f2ed91569ee68bf8f67ccf68ab9096016d5 /test.cc
parent    4fb8d5451384e736cddc6f6fd8bcf6ef9c2ce11c (diff)
Use lowercase and _ for file names
Diffstat (limited to 'test.cc')
-rw-r--r--  test.cc  260
1 file changed, 260 insertions, 0 deletions
diff --git a/test.cc b/test.cc
new file mode 100644
index 0000000..e111448
--- /dev/null
+++ b/test.cc
@@ -0,0 +1,260 @@
+// Based on https://arxiv.org/abs/1705.01991
+
+// Copyright (c) 2017 Microsoft Corporation
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "avx512_gemm.h"
+#include "avx2_gemm.h"
+#include "sse2_gemm.h"
+#include "aligned.h"
+#include "interleave.h"
+#include "stop_watch.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+
+#include <iostream>
+#include <iomanip>
+
+namespace intgemm {
+
+// Rearrange a tile of simd x unroll entries.
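+// For example, with simd = 2 and unroll = 2, the tile
+//   [a b]
+//   [c d]
+// of a row-major matrix with `cols` columns is written out as a c b d,
+// i.e. column-major within the tile.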
+template <class V> void SlowRearrangeTile(const V *from, V *to, int simd, int unroll, int cols) {
+  for (int i = 0; i < unroll; ++i) {
+    for (int j = 0; j < simd; ++j) {
+      to[simd * i + j] = from[cols * j + i];
+    }
+  }
+}
+
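+// Rearrange an entire rows x cols matrix into consecutive simd x unroll
+// tiles, walking down the row tiles within each block of `unroll` columns.
+// Assumes rows is a multiple of simd and cols is a multiple of unroll.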
+template <class V> void SlowRearrange(const V *from, V *to, int simd, int unroll, int rows, int cols) {
+  for (int c = 0; c < cols; c += unroll) {
+    for (int r = 0; r < rows; r += simd) {
+      SlowRearrangeTile(from + cols * r + c, to, simd, unroll, cols);
+      to += unroll * simd;
+    }
+  }
+}
+
+template <class V> void SlowTranspose(const V *from, V *to, int rows, int cols) {
+  for (int r = 0; r < rows; ++r) {
+    for (int c = 0; c < cols; ++c) {
+      to[rows * c + r] = from[cols * r + c];
+    }
+  }
+}
+
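+// Check the in-register transpose of an 8x8 block of 16-bit integers against
+// the scalar reference.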
+void TestTranspose16() {
+  free_ptr<int16_t> input(AlignedArray<int16_t>(8 * 8));
+  for (int16_t i = 0; i < 64; ++i) {
+    input.get()[i] = i;
+  }
+  free_ptr<int16_t> ref(AlignedArray<int16_t>(8 * 8));
+  SlowTranspose(input.get(), ref.get(), 8, 8);
+
+  // Overwrite input.
+  __m128i *t = reinterpret_cast<__m128i*>(input.get());
+  Transpose16InLane(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7]);
+
+  for (int16_t i = 0; i < 64; ++i) {
+    if (ref.get()[i] != input.get()[i]) {
+      std::cerr << "16-bit transpose failure at " << i << ": " << ref.get()[i] << " != " << input.get()[i] << '\n';
+    }
+  }
+}
+
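+// Same check for the 16x16 transpose of 8-bit integers.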
+void TestTranspose8() {
+  free_ptr<int8_t> input(AlignedArray<int8_t>(16 * 16));
+  for (int i = 0; i < 16 * 16; ++i) {
+    input.get()[i] = i;
+  }
+  free_ptr<int8_t> ref(AlignedArray<int8_t>(16 * 16));
+  SlowTranspose(input.get(), ref.get(), 16, 16);
+
+  // Overwrite input.
+  __m128i *t = reinterpret_cast<__m128i*>(input.get());
+  Transpose8InLane(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8], t[9], t[10], t[11], t[12], t[13], t[14], t[15]);
+
+  for (int i = 0; i < 16 * 16; ++i) {
+    if (ref.get()[i] != input.get()[i]) {
+      std::cerr << "8-bit transpose failure at " << i << ": " << (int16_t)ref.get()[i] << " != " << (int16_t)input.get()[i] << '\n';
+    }
+  }
+}
+
+// Print a matrix to stderr, so output interleaves with the error labels.
+template <class T> void PrintMatrix(const T *mem, int rows, int cols) {
+  for (int r = 0; r < rows; ++r) {
+    for (int c = 0; c < cols; ++c) {
+      std::cerr << std::setw(4) << (int64_t) mem[r * cols + c] << ' ';
+    }
+    std::cerr << '\n';
+  }
+}
+
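+// PrepareB should be equivalent to quantizing and then rearranging into the
+// routine's kBTileRow x kBTileCol tile format; check it against the slow
+// reference.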
+template <class Routine> void TestPrepare(int rows = 32, int cols = 16) {
+  // Create a random input array with entries roughly in [-127, 129].
+  free_ptr<float> input(AlignedArray<float>(rows * cols));
+  for (int i = 0; i < rows * cols; ++i) {
+    input.get()[i] = (float)rand() / (float)RAND_MAX * 256.0f - 127.0f;
+  }
+
+  typedef typename Routine::Integer Integer;
+  // Call PrepareB with a quantization multiplier of 1.
+  free_ptr<Integer> test(AlignedArray<Integer>(rows * cols));
+  Routine::PrepareB(input.get(), test.get(), 1, rows, cols);
+
+  // Compute the reference output: quantize, then rearrange slowly.
+  free_ptr<Integer> quantized(AlignedArray<Integer>(rows * cols));
+  Routine::Quantize(input.get(), quantized.get(), 1, rows * cols);
+  free_ptr<Integer> reference(AlignedArray<Integer>(rows * cols));
+  SlowRearrange<Integer>(quantized.get(), reference.get(), Routine::kBTileRow, Routine::kBTileCol, rows, cols);
+
+  if (memcmp(reference.get(), test.get(), rows * cols * sizeof(Integer))) {
+    std::cerr << "TestPrepare " << Routine::Name() << " Mismatch:\n";
+    std::cerr << "Quantized Input" << '\n';
+    PrintMatrix(quantized.get(), rows, cols);
+    std::cerr << "Reference" << '\n';
+    PrintMatrix(reference.get(), rows, cols);
+    std::cerr << "Routine" << '\n';
+    PrintMatrix(test.get(), rows, cols);
+  }
+}
+
+// Compute A*B slowly in floats.
+void SlowRefFloat(const float *A, const float *B, float *C, int A_rows, int width, int B_cols) {
+  for (int r = 0; r < A_rows; ++r) {
+    for (int c = 0; c < B_cols; ++c) {
+      float sum = 0.0f;
+      for (int k = 0; k < width; ++k) {
+        sum += A[r * width + k] * B[k * B_cols + c];
+      }
+      C[r * B_cols + c] = sum;
+    }
+  }
+}
+
+// Compute A*B slowly from integers.
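+// The casts to int16_t sign-extend 8-bit inputs; sums are accumulated in a
+// 32-bit integer, so this reference does not model any intermediate 16-bit
+// saturation the SIMD routines may perform.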
+template <class Integer> void SlowRefInt(const Integer *A, const Integer *B, float *C, float unquant_mult, int A_rows, int width, int B_cols) {
+  for (int r = 0; r < A_rows; ++r) {
+    for (int c = 0; c < B_cols; ++c) {
+      int32_t sum = 0;
+      for (int k = 0; k < width; ++k) {
+        sum += static_cast<int16_t>(A[r * width + k]) * static_cast<int16_t>(B[k * B_cols + c]);
+      }
+      C[r * B_cols + c] = sum * unquant_mult;
+    }
+  }
+}
+
+
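+// Report root-mean-square error of the SIMD result against both the slow
+// integer reference and the float reference.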
+void Compare(const float *float_ref, const float *int_ref, const float *int_test, std::size_t size) {
+  float int_sum = 0.0f, float_sum = 0.0f;
+  for (std::size_t i = 0; i < size; ++i) {
+    float int_diff = int_ref[i] - int_test[i];
+    float float_diff = float_ref[i] - int_test[i];
+/*    if (fabs(int_diff) > .1 || fabs(float_diff) > 1) {
+      std::cerr << "Inaccurate at " << i << ' ' << float_ref[i] << ' ' << int_ref[i] << ' ' << int_test[i] << '\n';
+    }*/
+    int_sum += int_diff * int_diff;
+    float_sum += float_diff * float_diff;
+  }
+  std::cerr << "Float RMSE = " << sqrt(float_sum / size) << "\tInt RMSE = " << sqrt(int_sum / size) << std::endl;
+}
+
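+// Multiply A (A_rows x width) by B (width x B_cols) with the SIMD routine and
+// compare against the slow references.  quant_mult scales the [-1, 1] floats
+// into integer range (1024 for 16-bit; 64 for 8-bit, so products of two
+// quantized values fit in 16 bits); unquant_mult undoes both scalings.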
+template <class Routine> void TestMultiply(int A_rows, int width, int B_cols) {
+  typedef typename Routine::Integer Integer;
+  std::cout << Routine::Name() << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
+
+  // Initialize A and B with random floats in [-1, 1].
+  free_ptr<float> A(AlignedArray<float>(A_rows * width));
+  free_ptr<float> B(AlignedArray<float>(width * B_cols));
+  for (int i = 0; i < A_rows * width; ++i) {
+    A.get()[i] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
+  }
+  for (int i = 0; i < width * B_cols; ++i) {
+    B.get()[i] = ((float)rand() / (float)RAND_MAX) * 2.0f - 1.0f;
+  }
+
+  float quant_mult = (sizeof(Integer) == 2) ? 1024 : 64;
+  float unquant_mult = 1.0f / (quant_mult * quant_mult);
+
+  free_ptr<Integer> A_prep(AlignedArray<Integer>(A_rows * width)), B_prep(AlignedArray<Integer>(width * B_cols));
+  Routine::PrepareA(A.get(), A_prep.get(), quant_mult, A_rows, width);
+  Routine::PrepareB(B.get(), B_prep.get(), quant_mult, width, B_cols);
+
+  free_ptr<float> test_C(AlignedArray<float>(A_rows * B_cols));
+  Routine::Multiply(A_prep.get(), B_prep.get(), test_C.get(), unquant_mult, A_rows, width, B_cols);
+
+  free_ptr<Integer> B_quant(AlignedArray<Integer>(width * B_cols));
+  Routine::Quantize(B.get(), B_quant.get(), quant_mult, width * B_cols);
+  free_ptr<float> slowint_C(AlignedArray<float>(A_rows * B_cols));
+  // This assumes PrepareA only quantizes A without rearranging it.
+  SlowRefInt(A_prep.get(), B_quant.get(), slowint_C.get(), unquant_mult, A_rows, width, B_cols);
+
+  free_ptr<float> float_C(AlignedArray<float>(A_rows * B_cols));
+  SlowRefFloat(A.get(), B.get(), float_C.get(), A_rows, width, B_cols);
+
+  Compare(float_C.get(), slowint_C.get(), test_C.get(), A_rows * B_cols);
+}
+
+void TestBoth(int A_rows, int width, int B_cols) {
+  TestMultiply<AVX512_16bit>(A_rows, width, B_cols);
+  TestMultiply<AVX2_16bit>(A_rows, width, B_cols);
+  TestMultiply<SSE2_16bit>(A_rows, width, B_cols);
+  TestMultiply<AVX512_8bit>(A_rows, width, B_cols);
+  TestMultiply<AVX2_8bit>(A_rows, width, B_cols);
+  TestMultiply<SSE2_8bit>(A_rows, width, B_cols);
+}
+
+} // namespace intgemm
+
+// The program takes no arguments.
+int main() {
+  std::srand(45678);
+  using namespace intgemm;
+  TestTranspose16();
+  TestTranspose8();
+  TestPrepare<AVX512_8bit>(64, 8);
+  TestPrepare<AVX512_8bit>(256, 32);
+  TestPrepare<AVX512_16bit>(32, 8);
+  TestPrepare<AVX512_16bit>(256, 32);
+  TestPrepare<AVX2_8bit>(64, 32);
+  TestPrepare<AVX2_16bit>(64, 32);
+  TestPrepare<SSE2_16bit>(8, 8);
+  TestPrepare<SSE2_8bit>(16, 8);
+  TestPrepare<SSE2_8bit>(32, 16);
+  TestPrepare<SSE2_8bit>(32, 32);
+  // Top matrix sizes from Marian.
+  TestBoth(8, 256, 256);
+  TestBoth(8, 2048, 256);
+  TestBoth(320, 256, 256);
+  TestBoth(472, 256, 256);
+  TestBoth(248, 256, 256);
+  TestBoth(200, 256, 256);
+  return 0;
+}
+
+