github.com/marian-nmt/intgemm.git
author     kpu <github@kheafield.com>  2018-06-23 19:43:02 +0300
committer  kpu <github@kheafield.com>  2018-06-23 19:43:02 +0300
commit     faf1ac0293ce8c5c2b5e4eb25b785bdafd77fdd0 (patch)
tree       f03981a2b1c95e50f1dd3ab259db3e00d7ad9bee
parent     559fb9b392bcf52e5b77c08fcd49d0f439582413 (diff)
Simplify aligned allocation, reduce C++ version
-rw-r--r--  Makefile           2
-rw-r--r--  aligned.h         33
-rw-r--r--  dispatch.cc        2
-rw-r--r--  dispatch.h         2
-rw-r--r--  quantize_test.cc   6
-rw-r--r--  test.cc           30
6 files changed, 32 insertions, 43 deletions
diff --git a/Makefile b/Makefile
index 0f73af9..a6f0750 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
CXX := g++
-CXXFLAGS := -Wall -Werror -fPIC -O3 -march=native -std=c++11 -DNDEBUG
+CXXFLAGS := -Wall -Werror -fPIC -O3 -march=native -DNDEBUG
SRC := avx512_gemm.cc avx2_gemm.cc sse2_gemm.cc stop_watch.cc dispatch.cc
OBJ := ${SRC:.cc=.o}
diff --git a/aligned.h b/aligned.h
index a1a3ba1..733477a 100644
--- a/aligned.h
+++ b/aligned.h
@@ -1,38 +1,27 @@
#pragma once
-// Define allocation like:
-// free_ptr<Integer> quantized(AlignedArray<Integer>(rows * cols));
+// Aligned vector of things.
// This is only used by tests.
#include <cstdlib>
-#include <memory>
namespace intgemm {
-struct DeleteWithFree {
- template <class T> void operator() (T *t) const {
-// This requires newer C++
-// std::free(const_cast<std::remove_const_t<T>* >(t));
- std::free(t);
- }
-};
-template <class T> using free_ptr = std::unique_ptr<T, DeleteWithFree>;
-// Return memory suitably aligned for SIMD.
-template <class T> T* AlignedArray(std::size_t size) {
- return static_cast<T*>(aligned_alloc(64, size * sizeof(T)));
-}
-
template <class T> class AlignedVector {
public:
- explicit AlignedVector(std::size_t size) : mem_(AlignedArray<T>(size)) {}
+ explicit AlignedVector(std::size_t size)
+ : mem_(static_cast<T*>(aligned_alloc(64, size * sizeof(T)))) {}
+
+ ~AlignedVector() { std::free(mem_); }
+
+ T &operator[](std::size_t offset) { return mem_[offset]; }
+ const T &operator[](std::size_t offset) const { return mem_[offset]; }
- T &operator[](std::size_t offset) { return mem_.get()[offset]; }
- const T &operator[](std::size_t offset) const { return mem_.get()[offset]; }
+ T *get() { return mem_; }
+ const T *get() const { return mem_; }
- T *get() { return mem_.get(); }
- const T *get() const { return mem_.get(); }
private:
- free_ptr<T> mem_;
+ T *mem_;
};
} // namespace intgemm
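
For reference, a standalone sketch (not part of the commit) of the allocation pattern the tests switch to, assuming glibc's aligned_alloc is visible through <cstdlib> when building without -std=c++11:

// sketch.cc -- hypothetical example; mirrors the AlignedVector added above.
#include <cstdlib>
#include <cstdio>

namespace intgemm {
template <class T> class AlignedVector {
  public:
    explicit AlignedVector(std::size_t size)
      : mem_(static_cast<T*>(aligned_alloc(64, size * sizeof(T)))) {}
    ~AlignedVector() { std::free(mem_); }
    T &operator[](std::size_t offset) { return mem_[offset]; }
    const T &operator[](std::size_t offset) const { return mem_[offset]; }
    T *get() { return mem_; }
    const T *get() const { return mem_; }
  private:
    T *mem_;
};
} // namespace intgemm

int main() {
  // Allocation and free are handled by the constructor and destructor,
  // replacing the old free_ptr<T>(AlignedArray<T>(n)) pairing.
  intgemm::AlignedVector<float> input(64);  // 64 * sizeof(float) is a multiple of the 64-byte alignment
  for (std::size_t i = 0; i < 64; ++i) input[i] = static_cast<float>(i);
  std::printf("%f\n", input.get()[10]);
  return 0;
}

Note that aligned_alloc (C11) requires the requested size to be a multiple of the alignment, so callers are presumably expected to pass SIMD-friendly sizes, as the tests below do.
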
diff --git a/dispatch.cc b/dispatch.cc
index 318242a..e4b40e0 100644
--- a/dispatch.cc
+++ b/dispatch.cc
@@ -8,7 +8,7 @@ namespace intgemm {
UnsupportedCPU::UnsupportedCPU() {}
-UnsupportedCPU::~UnsupportedCPU() {}
+UnsupportedCPU::~UnsupportedCPU() throw() {}
const char *UnsupportedCPU::what() const throw() {
return "Integer matrix multiplication has not been efficiently implemented for your CPU.";
diff --git a/dispatch.h b/dispatch.h
index a08ad8e..f191ed3 100644
--- a/dispatch.h
+++ b/dispatch.h
@@ -50,7 +50,7 @@ class UnsupportedCPU : public std::exception {
public:
UnsupportedCPU();
- ~UnsupportedCPU();
+ ~UnsupportedCPU() throw();
const char *what() const throw();
};
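
The throw() added to the destructor above is presumably needed once -std=c++11 is dropped: in C++98, std::exception's virtual destructor is declared throw(), and an override may not have a looser exception specification. A minimal illustration (hypothetical Example class, not from this repository):

#include <exception>

class Example : public std::exception {
  public:
    // Matches the base class's throw() specification, so this compiles
    // under pre-C++11 dialects as well as C++11/14.
    ~Example() throw() {}
    const char *what() const throw() { return "example"; }
};

int main() { Example e; (void)e; return 0; }

(Dynamic exception specifications such as throw() were deprecated in C++11 and removed in C++17.)
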
diff --git a/quantize_test.cc b/quantize_test.cc
index 36745de..21bb48b 100644
--- a/quantize_test.cc
+++ b/quantize_test.cc
@@ -42,11 +42,11 @@ template <class I> bool IsOff(float from, I ref, I test) {
template <class Backend> bool Test(const float *input_unaligned, float quant_mult, std::size_t size) {
typedef typename Backend::Integer Integer;
bool success = true;
- free_ptr<float> input(AlignedArray<float>(size));
+ AlignedVector<float> input(size);
std::memcpy(input.get(), input_unaligned, sizeof(float) * size);
- free_ptr<Integer> ref(AlignedArray<Integer>(size));
- free_ptr<Integer> test(AlignedArray<Integer>(size));
+ AlignedVector<Integer> ref(size);
+ AlignedVector<Integer> test(size);
QuantizeRef(input.get(), ref.get(), quant_mult, size);
Backend::Quantize(input.get(), test.get(), quant_mult, size);
for (std::size_t i = 0; i < size; ++i) {
diff --git a/test.cc b/test.cc
index 9ceb583..e263698 100644
--- a/test.cc
+++ b/test.cc
@@ -45,11 +45,11 @@ template <class V> void SlowTranspose(const V *from, V *to, int rows, int cols)
}
void TestTranspose16() {
- free_ptr<int16_t> input(AlignedArray<int16_t>(8 * 8));
+ AlignedVector<int16_t> input(8 * 8);
for (int16_t i = 0; i < 64; ++i) {
input.get()[i] = i;
}
- free_ptr<int16_t> ref(AlignedArray<int16_t>(8 * 8));
+ AlignedVector<int16_t> ref(8 * 8);
SlowTranspose(input.get(), ref.get(), 8, 8);
// Overwrite input.
@@ -64,11 +64,11 @@ void TestTranspose16() {
}
void TestTranspose8() {
- free_ptr<int8_t> input(AlignedArray<int8_t>(16 * 16));
+ AlignedVector<int8_t> input(16 * 16);
for (int i = 0; i < 16 * 16; ++i) {
input.get()[i] = i;
}
- free_ptr<int8_t> ref(AlignedArray<int8_t>(16 * 16));
+ AlignedVector<int8_t> ref(16 * 16);
SlowTranspose(input.get(), ref.get(), 16, 16);
// Overwrite input.
@@ -93,7 +93,7 @@ template <class T> void PrintMatrix(const T *mem, int rows, int cols) {
template <class Routine> void TestPrepare(int rows = 32, int cols = 16) {
// Create array.
- free_ptr<float> input(AlignedArray<float>(rows * cols));
+ AlignedVector<float> input(rows * cols);
for (int i = 0; i < rows * cols; ++i) {
input.get()[i] = //(i > 127) ? (i - 256) : i;
(float)rand() / (float)RAND_MAX * 256.0 - 127.0;
@@ -101,13 +101,13 @@ template <class Routine> void TestPrepare(int rows = 32, int cols = 16) {
typedef typename Routine::Integer Integer;
// Call Prepare
- free_ptr<Integer> test(AlignedArray<Integer>(rows * cols));
+ AlignedVector<Integer> test(rows * cols);
Routine::PrepareB(input.get(), test.get(), 1, rows, cols);
// Compute reference output.
- free_ptr<Integer> quantized(AlignedArray<Integer>(rows * cols));
+ AlignedVector<Integer> quantized(rows * cols);
Routine::Quantize(input.get(), quantized.get(), 1, rows * cols);
- free_ptr<Integer> reference(AlignedArray<Integer>(rows * cols));
+ AlignedVector<Integer> reference(rows * cols);
SlowRearrange<Integer>(quantized.get(), reference.get(), Routine::kBTileRow, Routine::kBTileCol, rows, cols);
if (memcmp(reference.get(), test.get(), rows * cols * sizeof(Integer))) {
@@ -188,8 +188,8 @@ template <class Routine> void TestMultiply(int A_rows, int width, int B_cols) {
std::cout << Routine::Name() << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
- free_ptr<float> A(AlignedArray<float>(A_rows * width));
- free_ptr<float> B(AlignedArray<float>(width * B_cols));
+ AlignedVector<float> A(A_rows * width);
+ AlignedVector<float> B(width * B_cols);
for (int i = 0; i < A_rows * width; i++) {
A.get()[i] = ((float)rand()/(float)RAND_MAX)*2.0f - 1.0f;
}
@@ -200,20 +200,20 @@ template <class Routine> void TestMultiply(int A_rows, int width, int B_cols) {
float quant_mult = (sizeof(Integer) == 2) ? 1024 : 64;
float unquant_mult = 1.0/(quant_mult*quant_mult);
- free_ptr<Integer> A_prep(AlignedArray<Integer>(A_rows * width)), B_prep(AlignedArray<Integer>(width * B_cols));
+ AlignedVector<Integer> A_prep(A_rows * width), B_prep(width * B_cols);
Routine::PrepareA(A.get(), A_prep.get(), quant_mult, A_rows, width);
Routine::PrepareB(B.get(), B_prep.get(), quant_mult, width, B_cols);
- free_ptr<float> test_C(AlignedArray<float>(A_rows * B_cols));
+ AlignedVector<float> test_C(A_rows * B_cols);
Routine::Multiply(A_prep.get(), B_prep.get(), test_C.get(), unquant_mult, A_rows, width, B_cols);
- free_ptr<Integer> B_quant(AlignedArray<Integer>(width * B_cols));
+ AlignedVector<Integer> B_quant(width * B_cols);
Routine::Quantize(B.get(), B_quant.get(), quant_mult, width * B_cols);
- free_ptr<float> slowint_C(AlignedArray<float>(A_rows * B_cols));
+ AlignedVector<float> slowint_C(A_rows * B_cols);
// Assuming A is just quantization here.
SlowRefInt(A_prep.get(), B_quant.get(), slowint_C.get(), unquant_mult, A_rows, width, B_cols);
- free_ptr<float> float_C(AlignedArray<float>(A_rows * B_cols));
+ AlignedVector<float> float_C(A_rows * B_cols);
SlowRefFloat(A.get(), B.get(), float_C.get(), A_rows, width, B_cols);
Compare(float_C.get(), slowint_C.get(), test_C.get(), A_rows * B_cols);