diff options
| | | |
|---|---|---|
| author | jokeren <robinho364@gmail.com> | 2016-12-05 08:34:35 +0300 |
| committer | Soumith Chintala <soumith@gmail.com> | 2017-02-23 13:40:33 +0300 |
| commit | 15cccf3cad04862c8c25c710b5109b6d650d5085 (patch) | |
| tree | 65b2ca8c12c9450e47354cfde29a92c0f4e206fe | |
| parent | 1ad347c00ed2224ff13fb93ad0d45e5e709b8310 (diff) | |

Merge THVector cmul

| | | |
|---|---|---|
| -rw-r--r-- | lib/TH/generic/THTensorMath.c | 17 |
| -rw-r--r-- | lib/TH/generic/THVector.h | 2 |
| -rw-r--r-- | lib/TH/generic/THVectorDefault.c | 12 |
| -rw-r--r-- | lib/TH/generic/THVectorDispatch.c | 7 |
| -rw-r--r-- | lib/TH/vector/NEON.c | 12 |
| -rw-r--r-- | lib/TH/vector/SSE.c | 30 |

6 files changed, 43 insertions, 37 deletions
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c index 54b43c8..9f0493f 100644 --- a/lib/TH/generic/THTensorMath.c +++ b/lib/TH/generic/THTensorMath.c @@ -776,10 +776,19 @@ void THTensor_(cmul)(THTensor *r_, THTensor *t, THTensor *src) real *sp = THTensor_(data)(src); real *rp = THTensor_(data)(r_); ptrdiff_t sz = THTensor_(nElement)(t); - ptrdiff_t i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<sz; i++) - rp[i] = tp[i] * sp[i]; + #pragma omp parallel if(sz > TH_OMP_OVERHEAD_THRESHOLD) + { + #ifdef _OPENMP + size_t num_threads = omp_get_num_threads(); + size_t tid = omp_get_thread_num(); + #else + size_t num_threads = 1; + size_t tid = 0; + #endif + ptrdiff_t i = tid * (sz / num_threads); + ptrdiff_t i_end = tid == num_threads - 1 ? sz : i + sz / num_threads; + THVector_(cmul)(rp+i, tp+i, sp+i, i_end-i); + } } else { TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * *src_data;); } diff --git a/lib/TH/generic/THVector.h b/lib/TH/generic/THVector.h index 3edb83f..271868f 100644 --- a/lib/TH/generic/THVector.h +++ b/lib/TH/generic/THVector.h @@ -7,7 +7,7 @@ TH_API void THVector_(cadd)(real *z, const real *x, const real *y, const real c, TH_API void THVector_(add)(real *y, const real *x, const real c, const ptrdiff_t n); TH_API void THVector_(diff)(real *z, const real *x, const real *y, const ptrdiff_t n); TH_API void THVector_(scale)(real *y, const real c, const ptrdiff_t n); -TH_API void THVector_(cmul)(real *y, const real *x, const ptrdiff_t n); +TH_API void THVector_(cmul)(real *z, const real *x, const real *y, const ptrdiff_t n); /* Initialize the dispatch pointers */ TH_API void THVector_(vectorDispatchInit)(void); diff --git a/lib/TH/generic/THVectorDefault.c b/lib/TH/generic/THVectorDefault.c index 89e007f..2603ed0 100644 --- a/lib/TH/generic/THVectorDefault.c +++ b/lib/TH/generic/THVectorDefault.c @@ -81,20 +81,20 @@ void THVector_(scale_DEFAULT)(real *y, const real c, const 
ptrdiff_t n) y[i] *= c; } -void THVector_(cmul_DEFAULT)(real *y, const real *x, const ptrdiff_t n) +void THVector_(cmul_DEFAULT)(real *z, const real *x, const real *y, const ptrdiff_t n) { ptrdiff_t i = 0; for(; i < n-4; i += 4) { - y[i] *= x[i]; - y[i+1] *= x[i+1]; - y[i+2] *= x[i+2]; - y[i+3] *= x[i+3]; + z[i] = x[i] * y[i]; + z[i+1] = x[i+1] * y[i+1]; + z[i+2] = x[i+2] * y[i+2]; + z[i+3] = x[i+3] * y[i+3]; } for(; i < n; i++) - y[i] *= x[i]; + z[i] = x[i] * y[i]; } #endif diff --git a/lib/TH/generic/THVectorDispatch.c b/lib/TH/generic/THVectorDispatch.c index 9f70e75..bc87d78 100644 --- a/lib/TH/generic/THVectorDispatch.c +++ b/lib/TH/generic/THVectorDispatch.c @@ -107,7 +107,6 @@ void THVector_(diff)(real *z, const real *x, const real *y, const ptrdiff_t n) { THVector_(diff_DISPATCHPTR)(z, x, y, n); } - static void (*THVector_(scale_DISPATCHPTR))(real *, const real, const ptrdiff_t) = &THVector_(scale_DEFAULT); static FunctionDescription THVector_(scale_DISPATCHTABLE)[] = { #if defined(__NEON__) @@ -136,7 +135,7 @@ TH_API void THVector_(scale)(real *y, const real c, const ptrdiff_t n) { } -static void (*THVector_(cmul_DISPATCHPTR))(real *, const real *, const ptrdiff_t) = &THVector_(cmul_DEFAULT); +static void (*THVector_(cmul_DISPATCHPTR))(real *, const real *, const real *, const ptrdiff_t) = &THVector_(cmul_DEFAULT); static FunctionDescription THVector_(cmul_DISPATCHTABLE)[] = { #if defined(__NEON__) #if defined(TH_REAL_IS_FLOAT) @@ -159,8 +158,8 @@ static FunctionDescription THVector_(cmul_DISPATCHTABLE)[] = { FUNCTION_IMPL(THVector_(cmul_DEFAULT), SIMDExtension_DEFAULT) }; -void THVector_(cmul)(real *y, const real *x, const ptrdiff_t n) { - THVector_(cmul_DISPATCHPTR); +void THVector_(cmul)(real *z, const real *x, const real *y, const ptrdiff_t n) { + THVector_(cmul_DISPATCHPTR)(z, x, y, n); } /* This needs to be called in order to initialize the dispatch pointers at runtime. 
diff --git a/lib/TH/vector/NEON.c b/lib/TH/vector/NEON.c index 94a3907..3f2aaf2 100644 --- a/lib/TH/vector/NEON.c +++ b/lib/TH/vector/NEON.c @@ -47,19 +47,19 @@ static void THFloatVector_scale_NEON(float *y, const float c, const ptrdiff_t n) y[i] *= c; } -static void THFloatVector_cmul_NEON(float *y, const float *x, const ptrdiff_t n) { +static void THFloatVector_cmul_NEON(float *z, const float *x, const float* y, const ptrdiff_t n) { long i = 0; for(; i < n-4; i += 4) { - y[i] *= x[i]; - y[i+1] *= x[i+1]; - y[i+2] *= x[i+2]; - y[i+3] *= x[i+3]; + z[i] = x[i] * y[i]; + z[i+1] = x[i+1] * y[i+1]; + z[i+2] = x[i+2] * y[i+2]; + z[i+3] = x[i+3] * y[i+3]; } for(; i < n; i++) - y[i] *= x[i]; + z[i] = x[i] * y[i]; } static void THFloatVector_cadd_NEON(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { diff --git a/lib/TH/vector/SSE.c b/lib/TH/vector/SSE.c index 4735e63..f253aef 100644 --- a/lib/TH/vector/SSE.c +++ b/lib/TH/vector/SSE.c @@ -84,7 +84,7 @@ static void THDoubleVector_scale_SSE(double *y, const double c, const ptrdiff_t } -static void THDoubleVector_cmul_SSE(double *y, const double *x, const ptrdiff_t n) { +static void THDoubleVector_cmul_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) { ptrdiff_t i; for (i=0; i<=((n)-8); i+=8) { __m128d XMM0 = _mm_loadu_pd((x)+i ); @@ -99,14 +99,13 @@ static void THDoubleVector_cmul_SSE(double *y, const double *x, const ptrdiff_t XMM5 = _mm_mul_pd(XMM5, XMM1); XMM6 = _mm_mul_pd(XMM6, XMM2); XMM7 = _mm_mul_pd(XMM7, XMM3); - _mm_storeu_pd((y)+i , XMM4); - _mm_storeu_pd((y)+i+2, XMM5); - _mm_storeu_pd((y)+i+4, XMM6); - _mm_storeu_pd((y)+i+6, XMM7); + _mm_storeu_pd((z)+i , XMM4); + _mm_storeu_pd((z)+i+2, XMM5); + _mm_storeu_pd((z)+i+4, XMM6); + _mm_storeu_pd((z)+i+6, XMM7); } - ptrdiff_t off = (n) - ((n)%8); - for (i=0; i<((n)%8); i++) { - y[off+i] *= x[off+i]; + for (; i<(n); i++) { + z[i] = x[i] * y[i]; } } @@ -189,7 +188,7 @@ static void THFloatVector_scale_SSE(float *y, const 
float c, const ptrdiff_t n) } } -static void THFloatVector_cmul_SSE(float *y, const float *x, const ptrdiff_t n) { +static void THFloatVector_cmul_SSE(float *z, const float *x, const float *y, const ptrdiff_t n) { ptrdiff_t i; for (i=0; i<=((n)-16); i+=16) { __m128 XMM0 = _mm_loadu_ps((x)+i ); @@ -204,14 +203,13 @@ static void THFloatVector_cmul_SSE(float *y, const float *x, const ptrdiff_t n) XMM5 = _mm_mul_ps(XMM5, XMM1); XMM6 = _mm_mul_ps(XMM6, XMM2); XMM7 = _mm_mul_ps(XMM7, XMM3); - _mm_storeu_ps((y)+i , XMM4); - _mm_storeu_ps((y)+i+ 4, XMM5); - _mm_storeu_ps((y)+i+ 8, XMM6); - _mm_storeu_ps((y)+i+12, XMM7); + _mm_storeu_ps((z)+i , XMM4); + _mm_storeu_ps((z)+i+ 4, XMM5); + _mm_storeu_ps((z)+i+ 8, XMM6); + _mm_storeu_ps((z)+i+12, XMM7); } - ptrdiff_t off = (n) - ((n)%16); - for (i=0; i<((n)%16); i++) { - y[off+i] *= x[off+i]; + for (; i<(n); i++) { + z[i] = x[i] * y[i]; } } |