diff options
author | jokeren <robinho364@gmail.com> | 2017-01-31 17:44:12 +0300 |
---|---|---|
committer | Soumith Chintala <soumith@gmail.com> | 2017-02-23 14:01:13 +0300 |
commit | e4c263784baf5e5f7d62c4be2156d7c3f900fa26 (patch) | |
tree | 5c6f68acbf315ee53ddf085171e237e0d04d11c0 | |
parent | 8a5e7595ae4e3160891c9107d057784e2282b5cc (diff) |
TH_TENSOR_APPLY2 contiguous optimization
-rw-r--r-- | lib/TH/THTensorApply.h | 191 | ||||
-rw-r--r-- | lib/TH/generic/THTensorMath.c | 30 |
2 files changed, 215 insertions, 6 deletions
diff --git a/lib/TH/THTensorApply.h b/lib/TH/THTensorApply.h index 17c1837..b88983b 100644 --- a/lib/TH/THTensorApply.h +++ b/lib/TH/THTensorApply.h @@ -255,7 +255,7 @@ for(TENSOR1##_i = TENSOR1->nDimension-2; TENSOR1##_i >= 0; TENSOR1##_i--) \ { \ if(TENSOR1->stride[TENSOR1##_i] != TENSOR1->stride[TENSOR1##_i+1] * TENSOR1->size[TENSOR1##_i+1]) \ - TENSOR1##_dim++; \ + TENSOR1##_dim++; \ } \ TENSOR1##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR1##_dim)); \ TENSOR1##_sizes = TENSOR1##_counter + TENSOR1##_dim; \ @@ -271,8 +271,8 @@ TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i] * TENSOR1##_sizes[TH_TENSOR_dim_index]; \ } else { \ --TH_TENSOR_dim_index; \ - TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i]; \ - TENSOR1##_strides[TH_TENSOR_dim_index] = TENSOR1->stride[TENSOR1##_i]; \ + TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i]; \ + TENSOR1##_strides[TH_TENSOR_dim_index] = TENSOR1->stride[TENSOR1##_i]; \ } \ } \ TENSOR1##_size = TENSOR1##_sizes[TENSOR1##_dim-1]; \ @@ -283,7 +283,7 @@ for(TENSOR2##_i = TENSOR2->nDimension-2; TENSOR2##_i >= 0; TENSOR2##_i--) \ { \ if(TENSOR2->stride[TENSOR2##_i] != TENSOR2->stride[TENSOR2##_i+1] * TENSOR2->size[TENSOR2##_i+1]) \ - TENSOR2##_dim++; \ + TENSOR2##_dim++; \ } \ TENSOR2##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR2##_dim)); \ TENSOR2##_sizes = TENSOR2##_counter + TENSOR2##_dim; \ @@ -299,8 +299,8 @@ TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i] * TENSOR2##_sizes[TH_TENSOR_dim_index]; \ } else { \ --TH_TENSOR_dim_index; \ - TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i]; \ - TENSOR2##_strides[TH_TENSOR_dim_index] = TENSOR2->stride[TENSOR2##_i]; \ + TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i]; \ + TENSOR2##_strides[TH_TENSOR_dim_index] = TENSOR2->stride[TENSOR2##_i]; \ } \ } \ TENSOR2##_size = TENSOR2##_sizes[TENSOR2##_dim-1]; \ @@ -380,6 +380,185 @@ THFree(TENSOR2##_counter); \ } +#define TH_TENSOR_APPLY2_CONTIGUOUS(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \ +{ \ + TYPE1 *TENSOR1##_data = NULL; \ + long *TENSOR1##_counter = NULL, *TENSOR1##_sizes = NULL, *TENSOR1##_strides = NULL; \ + long TENSOR1##_stride = 0, TENSOR1##_size = 0, TENSOR1##_dim = 0, TENSOR1##_i, TENSOR1##_n; \ + TYPE2 *TENSOR2##_data = NULL; \ + long *TENSOR2##_counter = NULL, *TENSOR2##_sizes = NULL, *TENSOR2##_strides = NULL; \ + long TENSOR2##_stride = 0, TENSOR2##_size = 0, TENSOR2##_dim = 0, TENSOR2##_i, TENSOR2##_n; \ + int TH_TENSOR_APPLY_hasFinished = 0; \ + int TH_TENSOR1_contiguous = 0, TH_TENSOR2_contiguous = 0; \ + long TH_TENSOR_dim_index = 0; \ +\ + TENSOR1##_n = (TENSOR1->nDimension ? 1 : 0); \ + for(TENSOR1##_i = 0; TENSOR1##_i < TENSOR1->nDimension; TENSOR1##_i++) \ + TENSOR1##_n *= TENSOR1->size[TENSOR1##_i]; \ +\ + TENSOR2##_n = (TENSOR2->nDimension ? 1 : 0); \ + for(TENSOR2##_i = 0; TENSOR2##_i < TENSOR2->nDimension; TENSOR2##_i++) \ + TENSOR2##_n *= TENSOR2->size[TENSOR2##_i]; \ +\ + if(TENSOR1##_n != TENSOR2##_n) /* should we do the check in the function instead? i think so */ \ + THError("inconsistent tensor size"); \ +\ + if(TENSOR1->nDimension == 0) \ + TH_TENSOR_APPLY_hasFinished = 1; \ + else \ + { \ + TENSOR1##_data = TENSOR1->storage->data+TENSOR1->storageOffset; \ + if (!THTensor_(isContiguous)(TENSOR1)) { \ + TENSOR1##_dim = 1; \ + for(TENSOR1##_i = TENSOR1->nDimension-2; TENSOR1##_i >= 0; TENSOR1##_i--) \ + { \ + if(TENSOR1->stride[TENSOR1##_i] != TENSOR1->stride[TENSOR1##_i+1] * TENSOR1->size[TENSOR1##_i+1]) \ + TENSOR1##_dim++; \ + } \ + TENSOR1##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR1##_dim)); \ + TENSOR1##_sizes = TENSOR1##_counter + TENSOR1##_dim; \ + TENSOR1##_strides = TENSOR1##_counter + 2*TENSOR1##_dim; \ + TH_TENSOR_dim_index = TENSOR1##_dim-1; \ + TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1->nDimension-1]; \ + TENSOR1##_strides[TH_TENSOR_dim_index] = TENSOR1->stride[TENSOR1->nDimension-1]; \ + for(TENSOR1##_i = TENSOR1##_dim-1; TENSOR1##_i >= 0; --TENSOR1##_i) { \ + TENSOR1##_counter[TENSOR1##_i] = 0; \ + } \ + for(TENSOR1##_i = TENSOR1->nDimension-2; TENSOR1##_i >= 0; --TENSOR1##_i) { \ + if (TENSOR1->stride[TENSOR1##_i] == TENSOR1->stride[TENSOR1##_i+1] * TENSOR1->size[TENSOR1##_i+1]) { \ + TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i] * TENSOR1##_sizes[TH_TENSOR_dim_index]; \ + } else { \ + --TH_TENSOR_dim_index; \ + TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i]; \ + TENSOR1##_strides[TH_TENSOR_dim_index] = TENSOR1->stride[TENSOR1##_i]; \ + } \ + } \ + TENSOR1##_size = TENSOR1##_sizes[TENSOR1##_dim-1]; \ + TENSOR1##_stride = TENSOR1##_strides[TENSOR1##_dim-1]; \ + } else { \ + TH_TENSOR1_contiguous = 1; \ + TENSOR1##_size = THTensor_(nElement)(TENSOR1); \ + TENSOR1##_stride = 1; \ + } \ +\ + TENSOR2##_data = TENSOR2->storage->data+TENSOR2->storageOffset; \ + if (!THTensor_(isContiguous)(TENSOR2)) { \ + TENSOR2##_dim = 1; \ + for(TENSOR2##_i = TENSOR2->nDimension-2; TENSOR2##_i >= 0; TENSOR2##_i--) \ + { \ + if(TENSOR2->stride[TENSOR2##_i] != TENSOR2->stride[TENSOR2##_i+1] * TENSOR2->size[TENSOR2##_i+1]) \ + TENSOR2##_dim++; \ + } \ + TENSOR2##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR2##_dim)); \ + TENSOR2##_sizes = TENSOR2##_counter + TENSOR2##_dim; \ + TENSOR2##_strides = TENSOR2##_counter + 2*TENSOR2##_dim; \ + TH_TENSOR_dim_index = TENSOR2##_dim-1; \ + TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2->nDimension-1]; \ + TENSOR2##_strides[TH_TENSOR_dim_index] = TENSOR2->stride[TENSOR2->nDimension-1]; \ + for(TENSOR2##_i = TENSOR2##_dim-1; TENSOR2##_i >= 0; --TENSOR2##_i) { \ + TENSOR2##_counter[TENSOR2##_i] = 0; \ + } \ + for(TENSOR2##_i = TENSOR2->nDimension-2; TENSOR2##_i >= 0; --TENSOR2##_i) { \ + if (TENSOR2->stride[TENSOR2##_i] == TENSOR2->stride[TENSOR2##_i+1] * TENSOR2->size[TENSOR2##_i+1]) { \ + TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i] * TENSOR2##_sizes[TH_TENSOR_dim_index]; \ + } else { \ + --TH_TENSOR_dim_index; \ + TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i]; \ + TENSOR2##_strides[TH_TENSOR_dim_index] = TENSOR2->stride[TENSOR2##_i]; \ + } \ + } \ + TENSOR2##_size = TENSOR2##_sizes[TENSOR2##_dim-1]; \ + TENSOR2##_stride = TENSOR2##_strides[TENSOR2##_dim-1]; \ + } else { \ + TH_TENSOR2_contiguous = 1; \ + TENSOR2##_size = THTensor_(nElement)(TENSOR2); \ + TENSOR2##_stride = 1; \ + } \ + } \ +\ + TENSOR1##_i = 0; \ + TENSOR2##_i = 0; \ + while(!TH_TENSOR_APPLY_hasFinished) \ + { \ + for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride) /* 0 et pas TENSOR##_dim! */ \ + { \ + CODE \ + } \ +\ + if(TENSOR1##_i == TENSOR1##_size) \ + { \ + if(TH_TENSOR1_contiguous == 1) \ + break; \ +\ + if(TENSOR1##_dim == 1) \ + break; \ +\ + TENSOR1##_data -= TENSOR1##_size*TENSOR1##_stride; \ + for(TENSOR1##_i = TENSOR1##_dim-2; TENSOR1##_i >= 0; TENSOR1##_i--) \ + { \ + TENSOR1##_counter[TENSOR1##_i]++; \ + TENSOR1##_data += TENSOR1##_strides[TENSOR1##_i]; \ +\ + if(TENSOR1##_counter[TENSOR1##_i] == TENSOR1##_sizes[TENSOR1##_i]) \ + { \ + if(TENSOR1##_i == 0) \ + { \ + TH_TENSOR_APPLY_hasFinished = 1; \ + break; \ + } \ + else \ + { \ + TENSOR1##_data -= TENSOR1##_counter[TENSOR1##_i]*TENSOR1##_strides[TENSOR1##_i]; \ + TENSOR1##_counter[TENSOR1##_i] = 0; \ + } \ + } \ + else \ + break; \ + } \ + TENSOR1##_i = 0; \ + } \ +\ + if(TENSOR2##_i == TENSOR2##_size) \ + { \ + if(TH_TENSOR2_contiguous == 1) \ + break; \ +\ + if(TENSOR2##_dim == 1) \ + break; \ +\ + TENSOR2##_data -= TENSOR2##_size*TENSOR2##_stride; \ + for(TENSOR2##_i = TENSOR2##_dim-2; TENSOR2##_i >= 0; TENSOR2##_i--) \ + { \ + TENSOR2##_counter[TENSOR2##_i]++; \ + TENSOR2##_data += TENSOR2##_strides[TENSOR2##_i]; \ +\ + if(TENSOR2##_counter[TENSOR2##_i] == TENSOR2##_sizes[TENSOR2##_i]) \ + { \ + if(TENSOR2##_i == 0) \ + { \ + TH_TENSOR_APPLY_hasFinished = 1; \ + break; \ + } \ + else \ + { \ + TENSOR2##_data -= TENSOR2##_counter[TENSOR2##_i]*TENSOR2##_strides[TENSOR2##_i]; \ + TENSOR2##_counter[TENSOR2##_i] = 0; \ + } \ + } \ + else \ + break; \ + } \ + TENSOR2##_i = 0; \ + } \ + } \ + if (!THTensor_(isContiguous)(TENSOR1)) { \ + THFree(TENSOR1##_counter); \ + } \ + if (!THTensor_(isContiguous)(TENSOR2)) { \ + THFree(TENSOR2##_counter); \ + } \ +} + /* * The basic strategy for apply is as follows: * diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c index c27fa00..302923e 100644 --- a/lib/TH/generic/THTensorMath.c +++ b/lib/TH/generic/THTensorMath.c @@ -508,6 +508,8 @@ void THTensor_(add)(THTensor *r_, THTensor *t, real value) ptrdiff_t i_end = tid == num_threads - 1 ? sz : i + sz / num_threads; THVector_(add)(rp+i, tp+i, value, i_end-i); } + } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) { + TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = *t_data + value;); } else { TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data + value;); } @@ -538,6 +540,8 @@ void THTensor_(mul)(THTensor *r_, THTensor *t, real value) ptrdiff_t i_end = tid == num_threads - 1 ? sz : i + sz / num_threads; THVector_(mul)(rp+i, tp+i, value, i_end-i); } + } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) { + TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = *t_data * value;); } else { TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;); } @@ -563,6 +567,8 @@ void THTensor_(div)(THTensor *r_, THTensor *t, real value) ptrdiff_t i_end = tid == num_threads - 1 ? sz : i + sz / num_threads; THVector_(div)(rp+i, tp+i, value, i_end-i); } + } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) { + TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = *t_data / value;); } else { TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data / value;); } @@ -654,6 +660,12 @@ void THTensor_(fmod)(THTensor *r_, THTensor *t, real value) rp[i] = tp[i] % value; #endif } + } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) { +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) + TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = fmod(*t_data, value);); +#else + TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = (*t_data % value);); +#endif } else { #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) TH_TENSOR_APPLY2(real, r_, real, t, *r__data = fmod(*t_data, value);); @@ -679,6 +691,13 @@ void THTensor_(remainder)(THTensor *r_, THTensor *t, real value) rp[i] = tp[i] - value * (tp[i] / value); // There is no NAN for integers #endif } + } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) { +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) + TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = (value == 0)? NAN : *t_data - value * floor(*t_data / value);); +#else + // There is no NAN for integers + TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = *t_data - value * (*t_data / value);); +#endif } else { #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (value == 0)? NAN : *t_data - value * floor(*t_data / value);); @@ -770,6 +789,8 @@ void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value) #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) for (i=0; i<sz; i++) rp[i] = (tp[i] < min_value) ? min_value : (tp[i] > max_value ? max_value : tp[i]); + } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) { + TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data);); } else { TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data);); } @@ -1100,6 +1121,8 @@ void THTensor_(tpow)(THTensor *r_, real value, THTensor *t) #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) for (i=0; i<sz; i++) rp[i] = pow(value, tp[i]); + } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) { + TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = pow(value, *t_data);); } else { TH_TENSOR_APPLY2(real, r_, real, t, *r__data = pow(value, *t_data);); } @@ -2540,6 +2563,13 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb) for (i=0; i<sz; ++i){ if(tap[i] != tbp[i]) return 0; } + } else if (THTensor_(isContiguous)(ta) || THTensor_(isContiguous)(tb)) { + // Short-circuit the apply function on inequality + TH_TENSOR_APPLY2_CONTIGUOUS(real, ta, real, tb, + if (equal && *ta_data != *tb_data) { + equal = 0; + TH_TENSOR_APPLY_hasFinished = 1; break; + }) } else { // Short-circuit the apply function on inequality TH_TENSOR_APPLY2(real, ta, real, tb, |