
github.com/torch/torch7.git
author     jokeren <robinho364@gmail.com>  2016-11-30 09:36:03 +0300
committer  Soumith Chintala <soumith@gmail.com>  2017-02-23 13:40:24 +0300
commit     ece65f636ddc52a1dcefbbcf0f537b6a17c341cf
tree       4a740aaaeaa07d88de3d6aa1e282ab3e0041c746
parent     5b9e1859957992c052947a8cee174b57dc824425
Add AVX single float and double float add
-rw-r--r--  lib/TH/generic/THTensorMathSIMD.c  84
1 file changed, 83 insertions(+), 1 deletion(-)
diff --git a/lib/TH/generic/THTensorMathSIMD.c b/lib/TH/generic/THTensorMathSIMD.c
index fc68a9f..86fae50 100644
--- a/lib/TH/generic/THTensorMathSIMD.c
+++ b/lib/TH/generic/THTensorMathSIMD.c
@@ -18,14 +18,73 @@ void THTensor_(add_AVX)(THTensor *r_, THTensor *t, real value)
real *tp = THTensor_(data)(t);
ptrdiff_t sz = THTensor_(nElement)(t);
ptrdiff_t i = 0;
+ __m256d YMM15 = _mm256_set_pd(value, value, value, value);
+ __m256d YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7;
+ for (; i<=((sz)-16); i+=16) {
+ YMM0 = _mm256_loadu_pd(tp+i);
+ YMM1 = _mm256_loadu_pd(tp+i+4);
+ YMM2 = _mm256_loadu_pd(tp+i+8);
+ YMM3 = _mm256_loadu_pd(tp+i+12);
+ YMM4 = _mm256_add_pd(YMM0, YMM15);
+ YMM5 = _mm256_add_pd(YMM1, YMM15);
+ YMM6 = _mm256_add_pd(YMM2, YMM15);
+ YMM7 = _mm256_add_pd(YMM3, YMM15);
+ _mm256_storeu_pd(rp+i, YMM4);
+ _mm256_storeu_pd(rp+i+4, YMM5);
+ _mm256_storeu_pd(rp+i+8, YMM6);
+ _mm256_storeu_pd(rp+i+12, YMM7);
+ }
+ for (; i<sz; i++) {
+ rp[i] = tp[i] + value;
+ }
+ } else {
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data + value;);
+ }
+}
+
+void THTensor_(mul_AVX)(THTensor *r_, THTensor *t, real value)
+{
+ THTensor_(resizeAs)(r_, t);
+ if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+ real *tp = THTensor_(data)(t);
+ real *rp = THTensor_(data)(r_);
+ ptrdiff_t sz = THTensor_(nElement)(t);
+ ptrdiff_t i = 0;
__m256d YMM3 = _mm256_set_pd(value, value, value, value);
__m256d YMM0, YMM2;
for (; i<=((sz)-4); i+=4) {
YMM0 = _mm256_loadu_pd(tp+i);
- YMM2 = _mm256_add_pd(YMM0, YMM3);
+ YMM2 = _mm256_mul_pd(YMM0, YMM3);
_mm256_storeu_pd(rp+i, YMM2);
}
for (; i<sz; i++) {
+ rp[i] = tp[i] * value;
+ }
+ } else {
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;);
+ }
+}
+
+#endif
+
+#if defined(TH_REAL_IS_FLOAT)
+
+void THTensor_(add_AVX)(THTensor *r_, THTensor *t, real value)
+{
+ THTensor_(resizeAs)(r_, t);
+ if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+ real *rp = THTensor_(data)(r_);
+ real *tp = THTensor_(data)(t);
+ ptrdiff_t sz = THTensor_(nElement)(t);
+ ptrdiff_t i = 0;
+ __m256 YMM3 = _mm256_set_ps(value, value, value, value, value, value, value, value);
+ __m256 YMM0, YMM2;
+ for (; i<=((sz)-8); i+=8) {
+ YMM0 = _mm256_loadu_ps(tp+i);
+ YMM2 = _mm256_add_ps(YMM0, YMM3);
+ _mm256_storeu_ps(rp+i, YMM2);
+ }
+ for (; i<sz; i++) {
rp[i] = tp[i] + value;
}
} else {
@@ -33,6 +92,29 @@ void THTensor_(add_AVX)(THTensor *r_, THTensor *t, real value)
}
}
+void THTensor_(mul_AVX)(THTensor *r_, THTensor *t, real value)
+{
+ THTensor_(resizeAs)(r_, t);
+ if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+ real *tp = THTensor_(data)(t);
+ real *rp = THTensor_(data)(r_);
+ ptrdiff_t sz = THTensor_(nElement)(t);
+ ptrdiff_t i = 0;
+ __m256 YMM3 = _mm256_set_ps(value, value, value, value, value, value, value, value);
+ __m256 YMM0, YMM2;
+ for (; i<=((sz)-8); i+=8) {
+ YMM0 = _mm256_loadu_ps(tp+i);
+ YMM2 = _mm256_mul_ps(YMM0, YMM3);
+ _mm256_storeu_ps(rp+i, YMM2);
+ }
+ for (; i<sz; i++) {
+ rp[i] = tp[i] * value;
+ }
+ } else {
+ TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;);
+ }
+}
+
#endif
#endif
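
For reference, the pattern this patch adds (broadcast the scalar into a YMM register, process the contiguous data eight floats at a time with unaligned loads and stores, then finish the remainder with a scalar tail loop) can be exercised in a small standalone program. This is an editor's sketch, not part of the commit: the file name and the helper add_scalar_avx are hypothetical, and it assumes an AVX-capable CPU and compilation with -mavx (e.g. gcc -O2 -mavx avx_add_demo.c).

#include <immintrin.h>
#include <stddef.h>
#include <stdio.h>

/* Add a scalar to every element of src, writing the result to dst.
   Mirrors the single-float add_AVX path above: an AVX main loop over
   8 floats per iteration plus a scalar tail for the leftover elements. */
static void add_scalar_avx(float *dst, const float *src, ptrdiff_t n, float value)
{
  ptrdiff_t i = 0;
  __m256 v = _mm256_set1_ps(value);            /* broadcast the scalar */
  for (; i <= n - 8; i += 8) {
    __m256 x = _mm256_loadu_ps(src + i);       /* unaligned load of 8 floats */
    _mm256_storeu_ps(dst + i, _mm256_add_ps(x, v));
  }
  for (; i < n; i++)                           /* scalar tail */
    dst[i] = src[i] + value;
}

int main(void)
{
  float in[11], out[11];
  for (int i = 0; i < 11; i++) in[i] = (float)i;
  add_scalar_avx(out, in, 11, 0.5f);
  for (int i = 0; i < 11; i++) printf("%g ", out[i]);  /* 0.5 1.5 ... 10.5 */
  printf("\n");
  return 0;
}

The double-precision add_AVX loop in the patch follows the same idea but is unrolled four times, keeping four independent load/add/store chains (YMM0-YMM7) in flight per iteration so the adds can overlap with the memory operations.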