| author | jokeren <robinho364@gmail.com> | 2016-11-30 09:36:03 +0300 |
|---|---|---|
| committer | Soumith Chintala <soumith@gmail.com> | 2017-02-23 13:40:24 +0300 |
| commit | ece65f636ddc52a1dcefbbcf0f537b6a17c341cf (patch) | |
| tree | 4a740aaaeaa07d88de3d6aa1e282ab3e0041c746 | |
| parent | 5b9e1859957992c052947a8cee174b57dc824425 (diff) | |
Add AVX single-float and double-float add
-rw-r--r-- | lib/TH/generic/THTensorMathSIMD.c | 84 |
1 file changed, 83 insertions(+), 1 deletion(-)
```diff
diff --git a/lib/TH/generic/THTensorMathSIMD.c b/lib/TH/generic/THTensorMathSIMD.c
index fc68a9f..86fae50 100644
--- a/lib/TH/generic/THTensorMathSIMD.c
+++ b/lib/TH/generic/THTensorMathSIMD.c
@@ -18,14 +18,73 @@ void THTensor_(add_AVX)(THTensor *r_, THTensor *t, real value)
     real *tp = THTensor_(data)(t);
     ptrdiff_t sz = THTensor_(nElement)(t);
     ptrdiff_t i = 0;
+    __m256d YMM15 = _mm256_set_pd(value, value, value, value);
+    __m256d YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7;
+    for (; i<=((sz)-16); i+=16) {
+      YMM0 = _mm256_loadu_pd(tp+i);
+      YMM1 = _mm256_loadu_pd(tp+i+4);
+      YMM2 = _mm256_loadu_pd(tp+i+8);
+      YMM3 = _mm256_loadu_pd(tp+i+12);
+      YMM4 = _mm256_add_pd(YMM0, YMM15);
+      YMM5 = _mm256_add_pd(YMM1, YMM15);
+      YMM6 = _mm256_add_pd(YMM2, YMM15);
+      YMM7 = _mm256_add_pd(YMM3, YMM15);
+      _mm256_storeu_pd(rp+i, YMM4);
+      _mm256_storeu_pd(rp+i+4, YMM5);
+      _mm256_storeu_pd(rp+i+8, YMM6);
+      _mm256_storeu_pd(rp+i+12, YMM7);
+    }
+    for (; i<sz; i++) {
+      rp[i] = tp[i] + value;
+    }
+  } else {
+    TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data + value;);
+  }
+}
+
+void THTensor_(mul_AVX)(THTensor *r_, THTensor *t, real value)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+    real *tp = THTensor_(data)(t);
+    real *rp = THTensor_(data)(r_);
+    ptrdiff_t sz = THTensor_(nElement)(t);
+    ptrdiff_t i = 0;
     __m256d YMM3 = _mm256_set_pd(value, value, value, value);
     __m256d YMM0, YMM2;
     for (; i<=((sz)-4); i+=4) {
       YMM0 = _mm256_loadu_pd(tp+i);
-      YMM2 = _mm256_add_pd(YMM0, YMM3);
+      YMM2 = _mm256_mul_pd(YMM0, YMM3);
       _mm256_storeu_pd(rp+i, YMM2);
     }
     for (; i<sz; i++) {
+      rp[i] = tp[i] * value;
+    }
+  } else {
+    TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;);
+  }
+}
+
+#endif
+
+#if defined(TH_REAL_IS_FLOAT)
+
+void THTensor_(add_AVX)(THTensor *r_, THTensor *t, real value)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+    real *rp = THTensor_(data)(r_);
+    real *tp = THTensor_(data)(t);
+    ptrdiff_t sz = THTensor_(nElement)(t);
+    ptrdiff_t i = 0;
+    __m256 YMM3 = _mm256_set_ps(value, value, value, value, value, value, value, value);
+    __m256 YMM0, YMM2;
+    for (; i<=((sz)-8); i+=8) {
+      YMM0 = _mm256_loadu_ps(tp+i);
+      YMM2 = _mm256_add_ps(YMM0, YMM3);
+      _mm256_storeu_ps(rp+i, YMM2);
+    }
+    for (; i<sz; i++) {
       rp[i] = tp[i] + value;
     }
   } else {
@@ -33,6 +92,29 @@ void THTensor_(add_AVX)(THTensor *r_, THTensor *t, real value)
   }
 }
 
+void THTensor_(mul_AVX)(THTensor *r_, THTensor *t, real value)
+{
+  THTensor_(resizeAs)(r_, t);
+  if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) {
+    real *tp = THTensor_(data)(t);
+    real *rp = THTensor_(data)(r_);
+    ptrdiff_t sz = THTensor_(nElement)(t);
+    ptrdiff_t i = 0;
+    __m256 YMM3 = _mm256_set_ps(value, value, value, value, value, value, value, value);
+    __m256 YMM0, YMM2;
+    for (; i<=((sz)-8); i+=8) {
+      YMM0 = _mm256_loadu_ps(tp+i);
+      YMM2 = _mm256_mul_ps(YMM0, YMM3);
+      _mm256_storeu_ps(rp+i, YMM2);
+    }
+    for (; i<sz; i++) {
+      rp[i] = tp[i] * value;
+    }
+  } else {
+    TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;);
+  }
+}
+
 #endif
 
 #endif
```
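All four kernels in the patch follow the same pattern: broadcast the scalar `value` into a 256-bit YMM register, stream through the contiguous buffer with unaligned AVX loads and stores (4 doubles or 8 floats per register), and finish the `sz % width` tail with a scalar loop. The double-precision `add` additionally unrolls 4x, touching 16 elements per iteration across YMM0-YMM7 to expose more instruction-level parallelism. Below is a minimal standalone sketch of the non-unrolled variant of that pattern, independent of the TH macros; the file and function names and the `-mavx` compile line are illustrative, not part of this patch, and it uses `_mm256_set1_pd` where the patch spells out `_mm256_set_pd(value, value, value, value)`:

```c
/* avx_add_demo.c -- sketch of the vectorized add pattern used in this patch.
 * Build (assumes an AVX-capable CPU): gcc -mavx -O2 avx_add_demo.c -o avx_add_demo */
#include <immintrin.h>
#include <stddef.h>
#include <stdio.h>

/* r[i] = t[i] + value over n doubles: 4 lanes per __m256d register. */
static void add_avx_double(double *r, const double *t, double value, ptrdiff_t n)
{
  ptrdiff_t i = 0;
  __m256d v = _mm256_set1_pd(value);            /* broadcast the scalar */
  for (; i <= n - 4; i += 4) {                  /* vector body, 4 doubles per step */
    __m256d x = _mm256_loadu_pd(t + i);         /* unaligned load, as in the patch */
    _mm256_storeu_pd(r + i, _mm256_add_pd(x, v));
  }
  for (; i < n; i++)                            /* scalar tail: n % 4 leftovers */
    r[i] = t[i] + value;
}

int main(void)
{
  double t[7] = {0, 1, 2, 3, 4, 5, 6}, r[7];
  add_avx_double(r, t, 10.0, 7);                /* n = 7 forces the tail loop to run */
  for (int i = 0; i < 7; i++)
    printf("%g ", r[i]);                        /* prints: 10 11 12 13 14 15 16 */
  printf("\n");
  return 0;
}
```

The contiguity check in the patch matters because these vector loads assume a dense layout; non-contiguous tensors fall back to the strided, element-wise `TH_TENSOR_APPLY2` macro.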