github.com/torch/torch7.git
author    jokeren <robinho364@gmail.com>    2017-01-31 17:44:12 +0300
committer Soumith Chintala <soumith@gmail.com>    2017-02-23 14:01:13 +0300
commit    e4c263784baf5e5f7d62c4be2156d7c3f900fa26 (patch)
tree      5c6f68acbf315ee53ddf085171e237e0d04d11c0
parent    8a5e7595ae4e3160891c9107d057784e2282b5cc (diff)
TH_TENSOR_APPLY2 contiguous optimization
-rw-r--r--  lib/TH/THTensorApply.h          191
-rw-r--r--  lib/TH/generic/THTensorMath.c    30
2 files changed, 215 insertions, 6 deletions
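
This commit adds a TH_TENSOR_APPLY2_CONTIGUOUS macro: a specialization of TH_TENSOR_APPLY2 for the case where at least one of the two tensors is contiguous in memory. A contiguous operand is walked as a flat buffer with stride 1, skipping the per-dimension size/stride collapsing and counter bookkeeping that the generic macro performs for every operand. A minimal standalone sketch of the idea (hypothetical flat_apply2 helper, with a single stride standing in for the per-dimension strides of the non-contiguous side; not the TH API):

    /* When `a` is contiguous it is traversed as a flat array; only the
     * (possibly strided) `b` side needs real index arithmetic. */
    static void flat_apply2(float *a, const float *b, long n, long b_stride)
    {
      long i;
      for (i = 0; i < n; i++)
        a[i] = b[i * b_stride] + 1.0f;  /* stand-in for the CODE body */
    }
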
diff --git a/lib/TH/THTensorApply.h b/lib/TH/THTensorApply.h
index 17c1837..b88983b 100644
--- a/lib/TH/THTensorApply.h
+++ b/lib/TH/THTensorApply.h
@@ -255,7 +255,7 @@
for(TENSOR1##_i = TENSOR1->nDimension-2; TENSOR1##_i >= 0; TENSOR1##_i--) \
{ \
if(TENSOR1->stride[TENSOR1##_i] != TENSOR1->stride[TENSOR1##_i+1] * TENSOR1->size[TENSOR1##_i+1]) \
- TENSOR1##_dim++; \
+ TENSOR1##_dim++; \
} \
TENSOR1##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR1##_dim)); \
TENSOR1##_sizes = TENSOR1##_counter + TENSOR1##_dim; \
@@ -271,8 +271,8 @@
TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i] * TENSOR1##_sizes[TH_TENSOR_dim_index]; \
} else { \
--TH_TENSOR_dim_index; \
- TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i]; \
- TENSOR1##_strides[TH_TENSOR_dim_index] = TENSOR1->stride[TENSOR1##_i]; \
+ TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i]; \
+ TENSOR1##_strides[TH_TENSOR_dim_index] = TENSOR1->stride[TENSOR1##_i]; \
} \
} \
TENSOR1##_size = TENSOR1##_sizes[TENSOR1##_dim-1]; \
@@ -283,7 +283,7 @@
for(TENSOR2##_i = TENSOR2->nDimension-2; TENSOR2##_i >= 0; TENSOR2##_i--) \
{ \
if(TENSOR2->stride[TENSOR2##_i] != TENSOR2->stride[TENSOR2##_i+1] * TENSOR2->size[TENSOR2##_i+1]) \
- TENSOR2##_dim++; \
+ TENSOR2##_dim++; \
} \
TENSOR2##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR2##_dim)); \
TENSOR2##_sizes = TENSOR2##_counter + TENSOR2##_dim; \
@@ -299,8 +299,8 @@
TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i] * TENSOR2##_sizes[TH_TENSOR_dim_index]; \
} else { \
--TH_TENSOR_dim_index; \
- TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i]; \
- TENSOR2##_strides[TH_TENSOR_dim_index] = TENSOR2->stride[TENSOR2##_i]; \
+ TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i]; \
+ TENSOR2##_strides[TH_TENSOR_dim_index] = TENSOR2->stride[TENSOR2##_i]; \
} \
} \
TENSOR2##_size = TENSOR2##_sizes[TENSOR2##_dim-1]; \
@@ -380,6 +380,185 @@
THFree(TENSOR2##_counter); \
}
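
Both the generic macro above and the non-contiguous branch of the new macro below begin by collapsing adjacent dimensions: dimension i folds into dimension i+1 whenever stride[i] == stride[i+1] * size[i+1], since the elements of dimension i then follow those of dimension i+1 with no gap, and a fully contiguous tensor collapses to a single dimension. A standalone sketch of that counting step (hypothetical collapsed_dims name, mirroring the loop in the macro):

    /* Count the dimensions that survive collapsing: dim i stays separate
     * only when its stride breaks the contiguous layout of dim i+1. */
    static int collapsed_dims(const long *size, const long *stride, int ndim)
    {
      int i, dim = 1;
      for (i = ndim - 2; i >= 0; i--)
        if (stride[i] != stride[i + 1] * size[i + 1])
          dim++;
      return dim;
    }
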
+#define TH_TENSOR_APPLY2_CONTIGUOUS(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
+{ \
+ TYPE1 *TENSOR1##_data = NULL; \
+ long *TENSOR1##_counter = NULL, *TENSOR1##_sizes = NULL, *TENSOR1##_strides = NULL; \
+ long TENSOR1##_stride = 0, TENSOR1##_size = 0, TENSOR1##_dim = 0, TENSOR1##_i, TENSOR1##_n; \
+ TYPE2 *TENSOR2##_data = NULL; \
+ long *TENSOR2##_counter = NULL, *TENSOR2##_sizes = NULL, *TENSOR2##_strides = NULL; \
+ long TENSOR2##_stride = 0, TENSOR2##_size = 0, TENSOR2##_dim = 0, TENSOR2##_i, TENSOR2##_n; \
+ int TH_TENSOR_APPLY_hasFinished = 0; \
+ int TH_TENSOR1_contiguous = 0, TH_TENSOR2_contiguous = 0; \
+ long TH_TENSOR_dim_index = 0; \
+\
+ TENSOR1##_n = (TENSOR1->nDimension ? 1 : 0); \
+ for(TENSOR1##_i = 0; TENSOR1##_i < TENSOR1->nDimension; TENSOR1##_i++) \
+ TENSOR1##_n *= TENSOR1->size[TENSOR1##_i]; \
+\
+ TENSOR2##_n = (TENSOR2->nDimension ? 1 : 0); \
+ for(TENSOR2##_i = 0; TENSOR2##_i < TENSOR2->nDimension; TENSOR2##_i++) \
+ TENSOR2##_n *= TENSOR2->size[TENSOR2##_i]; \
+\
+ if(TENSOR1##_n != TENSOR2##_n) /* should we do the check in the function instead? i think so */ \
+ THError("inconsistent tensor size"); \
+\
+ if(TENSOR1->nDimension == 0) \
+ TH_TENSOR_APPLY_hasFinished = 1; \
+ else \
+ { \
+ TENSOR1##_data = TENSOR1->storage->data+TENSOR1->storageOffset; \
+ if (!THTensor_(isContiguous)(TENSOR1)) { \
+ TENSOR1##_dim = 1; \
+ for(TENSOR1##_i = TENSOR1->nDimension-2; TENSOR1##_i >= 0; TENSOR1##_i--) \
+ { \
+ if(TENSOR1->stride[TENSOR1##_i] != TENSOR1->stride[TENSOR1##_i+1] * TENSOR1->size[TENSOR1##_i+1]) \
+ TENSOR1##_dim++; \
+ } \
+ TENSOR1##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR1##_dim)); \
+ TENSOR1##_sizes = TENSOR1##_counter + TENSOR1##_dim; \
+ TENSOR1##_strides = TENSOR1##_counter + 2*TENSOR1##_dim; \
+ TH_TENSOR_dim_index = TENSOR1##_dim-1; \
+ TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1->nDimension-1]; \
+ TENSOR1##_strides[TH_TENSOR_dim_index] = TENSOR1->stride[TENSOR1->nDimension-1]; \
+ for(TENSOR1##_i = TENSOR1##_dim-1; TENSOR1##_i >= 0; --TENSOR1##_i) { \
+ TENSOR1##_counter[TENSOR1##_i] = 0; \
+ } \
+ for(TENSOR1##_i = TENSOR1->nDimension-2; TENSOR1##_i >= 0; --TENSOR1##_i) { \
+ if (TENSOR1->stride[TENSOR1##_i] == TENSOR1->stride[TENSOR1##_i+1] * TENSOR1->size[TENSOR1##_i+1]) { \
+ TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i] * TENSOR1##_sizes[TH_TENSOR_dim_index]; \
+ } else { \
+ --TH_TENSOR_dim_index; \
+ TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i]; \
+ TENSOR1##_strides[TH_TENSOR_dim_index] = TENSOR1->stride[TENSOR1##_i]; \
+ } \
+ } \
+ TENSOR1##_size = TENSOR1##_sizes[TENSOR1##_dim-1]; \
+ TENSOR1##_stride = TENSOR1##_strides[TENSOR1##_dim-1]; \
+ } else { \
+ TH_TENSOR1_contiguous = 1; \
+ TENSOR1##_size = THTensor_(nElement)(TENSOR1); \
+ TENSOR1##_stride = 1; \
+ } \
+\
+ TENSOR2##_data = TENSOR2->storage->data+TENSOR2->storageOffset; \
+ if (!THTensor_(isContiguous)(TENSOR2)) { \
+ TENSOR2##_dim = 1; \
+ for(TENSOR2##_i = TENSOR2->nDimension-2; TENSOR2##_i >= 0; TENSOR2##_i--) \
+ { \
+ if(TENSOR2->stride[TENSOR2##_i] != TENSOR2->stride[TENSOR2##_i+1] * TENSOR2->size[TENSOR2##_i+1]) \
+ TENSOR2##_dim++; \
+ } \
+ TENSOR2##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR2##_dim)); \
+ TENSOR2##_sizes = TENSOR2##_counter + TENSOR2##_dim; \
+ TENSOR2##_strides = TENSOR2##_counter + 2*TENSOR2##_dim; \
+ TH_TENSOR_dim_index = TENSOR2##_dim-1; \
+ TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2->nDimension-1]; \
+ TENSOR2##_strides[TH_TENSOR_dim_index] = TENSOR2->stride[TENSOR2->nDimension-1]; \
+ for(TENSOR2##_i = TENSOR2##_dim-1; TENSOR2##_i >= 0; --TENSOR2##_i) { \
+ TENSOR2##_counter[TENSOR2##_i] = 0; \
+ } \
+ for(TENSOR2##_i = TENSOR2->nDimension-2; TENSOR2##_i >= 0; --TENSOR2##_i) { \
+ if (TENSOR2->stride[TENSOR2##_i] == TENSOR2->stride[TENSOR2##_i+1] * TENSOR2->size[TENSOR2##_i+1]) { \
+ TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i] * TENSOR2##_sizes[TH_TENSOR_dim_index]; \
+ } else { \
+ --TH_TENSOR_dim_index; \
+ TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i]; \
+ TENSOR2##_strides[TH_TENSOR_dim_index] = TENSOR2->stride[TENSOR2##_i]; \
+ } \
+ } \
+ TENSOR2##_size = TENSOR2##_sizes[TENSOR2##_dim-1]; \
+ TENSOR2##_stride = TENSOR2##_strides[TENSOR2##_dim-1]; \
+ } else { \
+ TH_TENSOR2_contiguous = 1; \
+ TENSOR2##_size = THTensor_(nElement)(TENSOR2); \
+ TENSOR2##_stride = 1; \
+ } \
+ } \
+\
+ TENSOR1##_i = 0; \
+ TENSOR2##_i = 0; \
+ while(!TH_TENSOR_APPLY_hasFinished) \
+ { \
+ for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride) /* 0 and not TENSOR##_dim! */ \
+ { \
+ CODE \
+ } \
+\
+ if(TENSOR1##_i == TENSOR1##_size) \
+ { \
+ if(TH_TENSOR1_contiguous == 1) \
+ break; \
+\
+ if(TENSOR1##_dim == 1) \
+ break; \
+\
+ TENSOR1##_data -= TENSOR1##_size*TENSOR1##_stride; \
+ for(TENSOR1##_i = TENSOR1##_dim-2; TENSOR1##_i >= 0; TENSOR1##_i--) \
+ { \
+ TENSOR1##_counter[TENSOR1##_i]++; \
+ TENSOR1##_data += TENSOR1##_strides[TENSOR1##_i]; \
+\
+ if(TENSOR1##_counter[TENSOR1##_i] == TENSOR1##_sizes[TENSOR1##_i]) \
+ { \
+ if(TENSOR1##_i == 0) \
+ { \
+ TH_TENSOR_APPLY_hasFinished = 1; \
+ break; \
+ } \
+ else \
+ { \
+ TENSOR1##_data -= TENSOR1##_counter[TENSOR1##_i]*TENSOR1##_strides[TENSOR1##_i]; \
+ TENSOR1##_counter[TENSOR1##_i] = 0; \
+ } \
+ } \
+ else \
+ break; \
+ } \
+ TENSOR1##_i = 0; \
+ } \
+\
+ if(TENSOR2##_i == TENSOR2##_size) \
+ { \
+ if(TH_TENSOR2_contiguous == 1) \
+ break; \
+\
+ if(TENSOR2##_dim == 1) \
+ break; \
+\
+ TENSOR2##_data -= TENSOR2##_size*TENSOR2##_stride; \
+ for(TENSOR2##_i = TENSOR2##_dim-2; TENSOR2##_i >= 0; TENSOR2##_i--) \
+ { \
+ TENSOR2##_counter[TENSOR2##_i]++; \
+ TENSOR2##_data += TENSOR2##_strides[TENSOR2##_i]; \
+\
+ if(TENSOR2##_counter[TENSOR2##_i] == TENSOR2##_sizes[TENSOR2##_i]) \
+ { \
+ if(TENSOR2##_i == 0) \
+ { \
+ TH_TENSOR_APPLY_hasFinished = 1; \
+ break; \
+ } \
+ else \
+ { \
+ TENSOR2##_data -= TENSOR2##_counter[TENSOR2##_i]*TENSOR2##_strides[TENSOR2##_i]; \
+ TENSOR2##_counter[TENSOR2##_i] = 0; \
+ } \
+ } \
+ else \
+ break; \
+ } \
+ TENSOR2##_i = 0; \
+ } \
+ } \
+ if (!THTensor_(isContiguous)(TENSOR1)) { \
+ THFree(TENSOR1##_counter); \
+ } \
+ if (!THTensor_(isContiguous)(TENSOR2)) { \
+ THFree(TENSOR2##_counter); \
+ } \
+}
+
/*
* The basic strategy for apply is as follows:
*
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index c27fa00..302923e 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -508,6 +508,8 @@ void THTensor_(add)(THTensor *r_, THTensor *t, real value)
ptrdiff_t i_end = tid == num_threads - 1 ? sz : i + sz / num_threads;
THVector_(add)(rp+i, tp+i, value, i_end-i);
}
+ } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) {
+ TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = *t_data + value;);
} else {
TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data + value;);
}
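
Each scalar pointwise kernel in THTensorMath.c now dispatches in three tiers: both tensors contiguous (with matching sizes) takes the existing vectorized/OpenMP fast path; at least one contiguous takes the new TH_TENSOR_APPLY2_CONTIGUOUS branch; anything else falls back to the generic TH_TENSOR_APPLY2. A condensed standalone sketch of the pattern (stand-in Tensor struct and scalar add, not the TH types; tiers 2 and 3 are left as comments because the real source expands the apply macros there):

    typedef struct { int contiguous; long n; float *data; } Tensor;

    static void add_scalar(Tensor *r, Tensor *t, float value)
    {
      if (r->contiguous && t->contiguous && r->n == t->n) {
        long i;                       /* tier 1: flat loop, vectorizable */
        for (i = 0; i < r->n; i++)
          r->data[i] = t->data[i] + value;
      } else if (r->contiguous || t->contiguous) {
        /* tier 2: TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, ...) */
      } else {
        /* tier 3: generic TH_TENSOR_APPLY2(real, r_, real, t, ...) */
      }
    }
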
@@ -538,6 +540,8 @@ void THTensor_(mul)(THTensor *r_, THTensor *t, real value)
ptrdiff_t i_end = tid == num_threads - 1 ? sz : i + sz / num_threads;
THVector_(mul)(rp+i, tp+i, value, i_end-i);
}
+ } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) {
+ TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = *t_data * value;);
} else {
TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;);
}
@@ -563,6 +567,8 @@ void THTensor_(div)(THTensor *r_, THTensor *t, real value)
ptrdiff_t i_end = tid == num_threads - 1 ? sz : i + sz / num_threads;
THVector_(div)(rp+i, tp+i, value, i_end-i);
}
+ } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) {
+ TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = *t_data / value;);
} else {
TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data / value;);
}
@@ -654,6 +660,12 @@ void THTensor_(fmod)(THTensor *r_, THTensor *t, real value)
rp[i] = tp[i] % value;
#endif
}
+ } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) {
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
+ TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = fmod(*t_data, value););
+#else
+ TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = (*t_data % value););
+#endif
} else {
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
TH_TENSOR_APPLY2(real, r_, real, t, *r__data = fmod(*t_data, value););
@@ -679,6 +691,13 @@ void THTensor_(remainder)(THTensor *r_, THTensor *t, real value)
rp[i] = tp[i] - value * (tp[i] / value); // There is no NAN for integers
#endif
}
+ } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) {
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
+ TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = (value == 0)? NAN : *t_data - value * floor(*t_data / value););
+#else
+ // There is no NAN for integers
+ TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = *t_data - value * (*t_data / value););
+#endif
} else {
#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (value == 0)? NAN : *t_data - value * floor(*t_data / value););
@@ -770,6 +789,8 @@ void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value)
#pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
for (i=0; i<sz; i++)
rp[i] = (tp[i] < min_value) ? min_value : (tp[i] > max_value ? max_value : tp[i]);
+ } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) {
+ TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data););
} else {
TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data););
}
@@ -1100,6 +1121,8 @@ void THTensor_(tpow)(THTensor *r_, real value, THTensor *t)
#pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
for (i=0; i<sz; i++)
rp[i] = pow(value, tp[i]);
+ } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) {
+ TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = pow(value, *t_data););
} else {
TH_TENSOR_APPLY2(real, r_, real, t, *r__data = pow(value, *t_data););
}
@@ -2540,6 +2563,13 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb)
for (i=0; i<sz; ++i){
if(tap[i] != tbp[i]) return 0;
}
+ } else if (THTensor_(isContiguous)(ta) || THTensor_(isContiguous)(tb)) {
+ // Short-circuit the apply function on inequality
+ TH_TENSOR_APPLY2_CONTIGUOUS(real, ta, real, tb,
+ if (equal && *ta_data != *tb_data) {
+ equal = 0;
+ TH_TENSOR_APPLY_hasFinished = 1; break;
+ })
} else {
// Short-circuit the apply function on inequality
TH_TENSOR_APPLY2(real, ta, real, tb,
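
The equal() branch shows that the CODE body may reference the macro's internal TH_TENSOR_APPLY_hasFinished flag to stop early: setting it and breaking exits both the inner element loop and the outer while loop, so no further elements are visited after the first mismatch. A standalone analog of that short-circuit shape (hypothetical arrays_equal, contiguous single-pass case only):

    static int arrays_equal(const float *a, const float *b, long n)
    {
      int equal = 1, finished = 0;    /* mirrors TH_TENSOR_APPLY_hasFinished */
      long i = 0;
      while (!finished) {
        for (; i < n; i++) {
          if (a[i] != b[i]) { equal = 0; finished = 1; break; }
        }
        if (i == n)
          finished = 1;               /* one pass suffices when contiguous */
      }
      return equal;
    }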