| author | jokeren <robinho364@gmail.com> | 2016-11-30 09:36:03 +0300 |
|---|---|---|
| committer | Soumith Chintala <soumith@gmail.com> | 2017-02-23 13:40:24 +0300 |
| commit | ece65f636ddc52a1dcefbbcf0f537b6a17c341cf (patch) | |
| tree | 4a740aaaeaa07d88de3d6aa1e282ab3e0041c746 | |
| parent | 5b9e1859957992c052947a8cee174b57dc824425 (diff) | |
Add AVX single-float and double-float add
-rw-r--r-- | lib/TH/generic/THTensorMathSIMD.c | 84 |
1 file changed, 83 insertions(+), 1 deletion(-)
```diff
diff --git a/lib/TH/generic/THTensorMathSIMD.c b/lib/TH/generic/THTensorMathSIMD.c
index fc68a9f..86fae50 100644
--- a/lib/TH/generic/THTensorMathSIMD.c
+++ b/lib/TH/generic/THTensorMathSIMD.c
@@ -18,14 +18,73 @@ void THTensor_(add_AVX)(THTensor *r_, THTensor *t, real value)
     real *tp = THTensor_(data)(t);
     ptrdiff_t sz = THTensor_(nElement)(t);
     ptrdiff_t i = 0;
+    __m256d YMM15 = _mm256_set_pd(value, value, value, value);
+    __m256d YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7;
+    for (; i<=((sz)-16); i+=16) {
+      YMM0 = _mm256_loadu_pd(tp+i);
+      YMM1 = _mm256_loadu_pd(tp+i+4);
+      YMM2 = _mm256_loadu_pd(tp+i+8);
+      YMM3 = _mm256_loadu_pd(tp+i+12);
+      YMM4 = _mm256_add_pd(YMM0, YMM15);
+      YMM5 = _mm256_add_pd(YMM1, YMM15);
+      YMM6 = _mm256_add_pd(YMM2, YMM15);
+      YMM7 = _mm256_add_pd(YMM3, YMM15);
+      _mm256_storeu_pd(rp+i, YMM4);
+      _mm256_storeu_pd(rp+i+4, YMM5);
+      _mm256_storeu_pd(rp+i+8, YMM6);
+      _mm256_storeu_pd(rp+i+12, YMM7);
+    }
+    for (; i<sz; i++) {
+      rp[i] = tp[i] + value;
+    }
+  } else {
+    TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data + value;);
+  }
+}
+
+void THTensor_(mul_AVX)(THTensor *r_, THTensor *t, real value)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+    real *tp = THTensor_(data)(t);
+    real *rp = THTensor_(data)(r_);
+    ptrdiff_t sz = THTensor_(nElement)(t);
+    ptrdiff_t i = 0;
     __m256d YMM3 = _mm256_set_pd(value, value, value, value);
     __m256d YMM0, YMM2;
     for (; i<=((sz)-4); i+=4) {
       YMM0 = _mm256_loadu_pd(tp+i);
-      YMM2 = _mm256_add_pd(YMM0, YMM3);
+      YMM2 = _mm256_mul_pd(YMM0, YMM3);
       _mm256_storeu_pd(rp+i, YMM2);
     }
     for (; i<sz; i++) {
+      rp[i] = tp[i] * value;
+    }
+  } else {
+    TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;);
+  }
+}
+
+#endif
+
+#if defined(TH_REAL_IS_FLOAT)
+
+void THTensor_(add_AVX)(THTensor *r_, THTensor *t, real value)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+    real *rp = THTensor_(data)(r_);
+    real *tp = THTensor_(data)(t);
+    ptrdiff_t sz = THTensor_(nElement)(t);
+    ptrdiff_t i = 0;
+    __m256 YMM3 = _mm256_set_ps(value, value, value, value, value, value, value, value);
+    __m256 YMM0, YMM2;
+    for (; i<=((sz)-8); i+=8) {
+      YMM0 = _mm256_loadu_ps(tp+i);
+      YMM2 = _mm256_add_ps(YMM0, YMM3);
+      _mm256_storeu_ps(rp+i, YMM2);
+    }
+    for (; i<sz; i++) {
       rp[i] = tp[i] + value;
     }
   } else {
@@ -33,6 +92,29 @@ void THTensor_(add_AVX)(THTensor *r_, THTensor *t, real value)
   }
 }
 
+void THTensor_(mul_AVX)(THTensor *r_, THTensor *t, real value)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+    real *tp = THTensor_(data)(t);
+    real *rp = THTensor_(data)(r_);
+    ptrdiff_t sz = THTensor_(nElement)(t);
+    ptrdiff_t i = 0;
+    __m256 YMM3 = _mm256_set_ps(value, value, value, value, value, value, value, value);
+    __m256 YMM0, YMM2;
+    for (; i<=((sz)-8); i+=8) {
+      YMM0 = _mm256_loadu_ps(tp+i);
+      YMM2 = _mm256_mul_ps(YMM0, YMM3);
+      _mm256_storeu_ps(rp+i, YMM2);
+    }
+    for (; i<sz; i++) {
+      rp[i] = tp[i] * value;
+    }
+  } else {
+    TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;);
+  }
+}
+
 #endif
 
 #endif
```
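All four kernels in the patch follow the same pattern: broadcast the scalar `value` into a 256-bit YMM register, stream through the contiguous buffer with unaligned AVX loads and stores (4 doubles or 8 floats per register), and finish the `sz % width` tail with a scalar loop. The double-precision `add` additionally unrolls 4x, touching 16 elements per iteration across YMM0-YMM7 to expose more instruction-level parallelism. Below is a minimal standalone sketch of the non-unrolled variant of that pattern, independent of the TH macros; the file and function names and the `-mavx` compile line are illustrative, not part of this patch, and it uses `_mm256_set1_pd` where the patch spells out `_mm256_set_pd(value, value, value, value)`:

```c
/* avx_add_demo.c -- sketch of the vectorized add pattern used in this patch.
 * Build (assumes an AVX-capable CPU): gcc -mavx -O2 avx_add_demo.c -o avx_add_demo */
#include <immintrin.h>
#include <stddef.h>
#include <stdio.h>

/* r[i] = t[i] + value over n doubles: 4 lanes per __m256d register. */
static void add_avx_double(double *r, const double *t, double value, ptrdiff_t n)
{
  ptrdiff_t i = 0;
  __m256d v = _mm256_set1_pd(value);            /* broadcast the scalar */
  for (; i <= n - 4; i += 4) {                  /* vector body, 4 doubles per step */
    __m256d x = _mm256_loadu_pd(t + i);         /* unaligned load, as in the patch */
    _mm256_storeu_pd(r + i, _mm256_add_pd(x, v));
  }
  for (; i < n; i++)                            /* scalar tail: n % 4 leftovers */
    r[i] = t[i] + value;
}

int main(void)
{
  double t[7] = {0, 1, 2, 3, 4, 5, 6}, r[7];
  add_avx_double(r, t, 10.0, 7);                /* n = 7 forces the tail loop to run */
  for (int i = 0; i < 7; i++)
    printf("%g ", r[i]);                        /* prints: 10 11 12 13 14 15 16 */
  printf("\n");
  return 0;
}
```

The contiguity check in the patch matters because these vector loads assume a dense layout; non-contiguous tensors fall back to the strided, element-wise `TH_TENSOR_APPLY2` macro.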