TH_TENSOR_APPLY2 contiguous optimization

author: jokeren <robinho364@gmail.com> 2017-01-31 17:44:12 +0300
committer: Soumith Chintala <soumith@gmail.com> 2017-02-23 14:01:13 +0300
commit: e4c263784baf5e5f7d62c4be2156d7c3f900fa26 (patch)
tree: 5c6f68acbf315ee53ddf085171e237e0d04d11c0
parent: 8a5e7595ae4e3160891c9107d057784e2282b5cc (diff)
2 files changed, 215 insertions, 6 deletions
diff --git a/lib/TH/THTensorApply.h b/lib/TH/THTensorApply.h
index 17c1837..b88983b 100644
--- a/lib/TH/THTensorApply.h
+++ b/lib/TH/THTensorApply.h
@@ -255,7 +255,7 @@
     for(TENSOR1##_i = TENSOR1->nDimension-2; TENSOR1##_i >= 0; TENSOR1##_i--) \
     { \
       if(TENSOR1->stride[TENSOR1##_i] != TENSOR1->stride[TENSOR1##_i+1] * TENSOR1->size[TENSOR1##_i+1]) \
-	TENSOR1##_dim++; \
+        TENSOR1##_dim++; \
     } \
     TENSOR1##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR1##_dim)); \
     TENSOR1##_sizes = TENSOR1##_counter + TENSOR1##_dim; \
@@ -271,8 +271,8 @@
         TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i] * TENSOR1##_sizes[TH_TENSOR_dim_index]; \
       } else { \
         --TH_TENSOR_dim_index; \
-	TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i]; \
-	TENSOR1##_strides[TH_TENSOR_dim_index] = TENSOR1->stride[TENSOR1##_i]; \
+        TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i]; \
+        TENSOR1##_strides[TH_TENSOR_dim_index] = TENSOR1->stride[TENSOR1##_i]; \
       } \
     } \
     TENSOR1##_size = TENSOR1##_sizes[TENSOR1##_dim-1]; \
@@ -283,7 +283,7 @@
     for(TENSOR2##_i = TENSOR2->nDimension-2; TENSOR2##_i >= 0; TENSOR2##_i--) \
     { \
       if(TENSOR2->stride[TENSOR2##_i] != TENSOR2->stride[TENSOR2##_i+1] * TENSOR2->size[TENSOR2##_i+1]) \
-	TENSOR2##_dim++; \
+        TENSOR2##_dim++; \
     } \
     TENSOR2##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR2##_dim)); \
     TENSOR2##_sizes = TENSOR2##_counter + TENSOR2##_dim; \
@@ -299,8 +299,8 @@
         TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i] * TENSOR2##_sizes[TH_TENSOR_dim_index]; \
       } else { \
         --TH_TENSOR_dim_index; \
-	TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i]; \
-	TENSOR2##_strides[TH_TENSOR_dim_index] = TENSOR2->stride[TENSOR2##_i]; \
+        TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i]; \
+        TENSOR2##_strides[TH_TENSOR_dim_index] = TENSOR2->stride[TENSOR2##_i]; \
       } \
     } \
     TENSOR2##_size = TENSOR2##_sizes[TENSOR2##_dim-1]; \
@@ -380,6 +380,185 @@
   THFree(TENSOR2##_counter); \
 }
 
+#define TH_TENSOR_APPLY2_CONTIGUOUS(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \
+{ /* Runs CODE once per element pair of TENSOR1/TENSOR2; a tensor reported contiguous is walked as one flat run of stride 1. */ \
+  TYPE1 *TENSOR1##_data = NULL; \
+  long *TENSOR1##_counter = NULL, *TENSOR1##_sizes = NULL, *TENSOR1##_strides = NULL; \
+  long TENSOR1##_stride = 0, TENSOR1##_size = 0, TENSOR1##_dim = 0, TENSOR1##_i, TENSOR1##_n; \
+  TYPE2 *TENSOR2##_data = NULL; \
+  long *TENSOR2##_counter = NULL, *TENSOR2##_sizes = NULL, *TENSOR2##_strides = NULL; \
+  long TENSOR2##_stride = 0, TENSOR2##_size = 0, TENSOR2##_dim = 0, TENSOR2##_i, TENSOR2##_n; \
+  int TH_TENSOR_APPLY_hasFinished = 0; /* set to 1 to leave the outer while loop */ \
+  int TH_TENSOR1_contiguous = 0, TH_TENSOR2_contiguous = 0; /* fixed names: not pasted from the macro arguments */ \
+  long TH_TENSOR_dim_index = 0; \
+\
+  TENSOR1##_n = (TENSOR1->nDimension ? 1 : 0); /* element count: product of all sizes, 0 when there are no dimensions */ \
+  for(TENSOR1##_i = 0; TENSOR1##_i < TENSOR1->nDimension; TENSOR1##_i++) \
+    TENSOR1##_n *= TENSOR1->size[TENSOR1##_i]; \
+\
+  TENSOR2##_n = (TENSOR2->nDimension ? 1 : 0); /* same element count for the second tensor */ \
+  for(TENSOR2##_i = 0; TENSOR2##_i < TENSOR2->nDimension; TENSOR2##_i++) \
+    TENSOR2##_n *= TENSOR2->size[TENSOR2##_i]; \
+\
+  if(TENSOR1##_n != TENSOR2##_n) /* should we do the check in the function instead? i think so */ \
+    THError("inconsistent tensor size"); \
+\
+  if(TENSOR1->nDimension == 0) /* no dimensions: the while loop below is skipped entirely */ \
+    TH_TENSOR_APPLY_hasFinished = 1; \
+  else \
+  { \
+    TENSOR1##_data = TENSOR1->storage->data+TENSOR1->storageOffset; \
+    if (!THTensor_(isContiguous)(TENSOR1)) { /* strided path: merge dimensions that are laid out back-to-back */ \
+      TENSOR1##_dim = 1; /* number of dimensions left after merging */ \
+      for(TENSOR1##_i = TENSOR1->nDimension-2; TENSOR1##_i >= 0; TENSOR1##_i--) \
+      { \
+        if(TENSOR1->stride[TENSOR1##_i] != TENSOR1->stride[TENSOR1##_i+1] * TENSOR1->size[TENSOR1##_i+1]) \
+          TENSOR1##_dim++; \
+      } \
+      TENSOR1##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR1##_dim)); /* one buffer holds counter, sizes and strides */ \
+      TENSOR1##_sizes = TENSOR1##_counter + TENSOR1##_dim; \
+      TENSOR1##_strides = TENSOR1##_counter + 2*TENSOR1##_dim; \
+      TH_TENSOR_dim_index = TENSOR1##_dim-1; \
+      TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1->nDimension-1]; \
+      TENSOR1##_strides[TH_TENSOR_dim_index] = TENSOR1->stride[TENSOR1->nDimension-1]; \
+      for(TENSOR1##_i = TENSOR1##_dim-1; TENSOR1##_i >= 0; --TENSOR1##_i) { /* zero every counter slot */ \
+        TENSOR1##_counter[TENSOR1##_i] = 0; \
+      } \
+      for(TENSOR1##_i = TENSOR1->nDimension-2; TENSOR1##_i >= 0; --TENSOR1##_i) { \
+        if (TENSOR1->stride[TENSOR1##_i] == TENSOR1->stride[TENSOR1##_i+1] * TENSOR1->size[TENSOR1##_i+1]) { /* adjacent in memory: fold into the current merged dimension */ \
+          TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i] * TENSOR1##_sizes[TH_TENSOR_dim_index]; \
+        } else { \
+          --TH_TENSOR_dim_index; /* gap in memory: start a new merged dimension */ \
+          TENSOR1##_sizes[TH_TENSOR_dim_index] = TENSOR1->size[TENSOR1##_i]; \
+          TENSOR1##_strides[TH_TENSOR_dim_index] = TENSOR1->stride[TENSOR1##_i]; \
+        } \
+      } \
+      TENSOR1##_size = TENSOR1##_sizes[TENSOR1##_dim-1]; /* extent and step of the innermost merged dimension */ \
+      TENSOR1##_stride = TENSOR1##_strides[TENSOR1##_dim-1]; \
+    } else { \
+      TH_TENSOR1_contiguous = 1; /* no counter buffer is allocated on this path */ \
+      TENSOR1##_size = THTensor_(nElement)(TENSOR1); \
+      TENSOR1##_stride = 1; \
+    } \
+\
+    TENSOR2##_data = TENSOR2->storage->data+TENSOR2->storageOffset; \
+    if (!THTensor_(isContiguous)(TENSOR2)) { /* same setup as above, for the second tensor */ \
+      TENSOR2##_dim = 1; \
+      for(TENSOR2##_i = TENSOR2->nDimension-2; TENSOR2##_i >= 0; TENSOR2##_i--) \
+      { \
+        if(TENSOR2->stride[TENSOR2##_i] != TENSOR2->stride[TENSOR2##_i+1] * TENSOR2->size[TENSOR2##_i+1]) \
+          TENSOR2##_dim++; \
+      } \
+      TENSOR2##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR2##_dim)); \
+      TENSOR2##_sizes = TENSOR2##_counter + TENSOR2##_dim; \
+      TENSOR2##_strides = TENSOR2##_counter + 2*TENSOR2##_dim; \
+      TH_TENSOR_dim_index = TENSOR2##_dim-1; \
+      TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2->nDimension-1]; \
+      TENSOR2##_strides[TH_TENSOR_dim_index] = TENSOR2->stride[TENSOR2->nDimension-1]; \
+      for(TENSOR2##_i = TENSOR2##_dim-1; TENSOR2##_i >= 0; --TENSOR2##_i) { \
+        TENSOR2##_counter[TENSOR2##_i] = 0; \
+      } \
+      for(TENSOR2##_i = TENSOR2->nDimension-2; TENSOR2##_i >= 0; --TENSOR2##_i) { \
+        if (TENSOR2->stride[TENSOR2##_i] == TENSOR2->stride[TENSOR2##_i+1] * TENSOR2->size[TENSOR2##_i+1]) { \
+          TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i] * TENSOR2##_sizes[TH_TENSOR_dim_index]; \
+        } else { \
+          --TH_TENSOR_dim_index; \
+          TENSOR2##_sizes[TH_TENSOR_dim_index] = TENSOR2->size[TENSOR2##_i]; \
+          TENSOR2##_strides[TH_TENSOR_dim_index] = TENSOR2->stride[TENSOR2##_i]; \
+        } \
+      } \
+      TENSOR2##_size = TENSOR2##_sizes[TENSOR2##_dim-1]; \
+      TENSOR2##_stride = TENSOR2##_strides[TENSOR2##_dim-1]; \
+    } else { \
+      TH_TENSOR2_contiguous = 1; \
+      TENSOR2##_size = THTensor_(nElement)(TENSOR2); \
+      TENSOR2##_stride = 1; \
+    } \
+  } \
+\
+  TENSOR1##_i = 0; /* from here on the _i variables index into the current inner run */ \
+  TENSOR2##_i = 0; \
+  while(!TH_TENSOR_APPLY_hasFinished) \
+  { \
+    for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride) /* 0 and not TENSOR##_dim! */ \
+    { \
+      CODE \
+    } \
+\
+    if(TENSOR1##_i == TENSOR1##_size) /* inner run of TENSOR1 is exhausted */ \
+    { \
+      if(TH_TENSOR1_contiguous == 1) \
+	break; /* the whole tensor was a single run: leave the while loop */ \
+\
+      if(TENSOR1##_dim == 1) \
+         break; /* only one merged dimension: nothing further to advance */ \
+\
+      TENSOR1##_data -= TENSOR1##_size*TENSOR1##_stride; /* rewind to the start of the inner run */ \
+      for(TENSOR1##_i = TENSOR1##_dim-2; TENSOR1##_i >= 0; TENSOR1##_i--) /* bump outer counters, carrying toward index 0 */ \
+      { \
+        TENSOR1##_counter[TENSOR1##_i]++; \
+        TENSOR1##_data += TENSOR1##_strides[TENSOR1##_i]; \
+\
+        if(TENSOR1##_counter[TENSOR1##_i]  == TENSOR1##_sizes[TENSOR1##_i]) \
+        { \
+          if(TENSOR1##_i == 0) \
+          { \
+            TH_TENSOR_APPLY_hasFinished = 1; /* outermost counter wrapped: iteration is complete */ \
+            break; \
+          } \
+            else \
+          { \
+            TENSOR1##_data -= TENSOR1##_counter[TENSOR1##_i]*TENSOR1##_strides[TENSOR1##_i]; /* undo this dimension's offset before carrying */ \
+            TENSOR1##_counter[TENSOR1##_i] = 0; \
+          } \
+        } \
+        else \
+          break; \
+      } \
+      TENSOR1##_i = 0; /* restart the inner index for the next run */ \
+    } \
+\
+    if(TENSOR2##_i == TENSOR2##_size) /* same advance logic for TENSOR2 */ \
+    { \
+      if(TH_TENSOR2_contiguous == 1) \
+	break; \
+\
+      if(TENSOR2##_dim == 1) \
+         break; \
+\
+      TENSOR2##_data -= TENSOR2##_size*TENSOR2##_stride; \
+      for(TENSOR2##_i = TENSOR2##_dim-2; TENSOR2##_i >= 0; TENSOR2##_i--) \
+      { \
+        TENSOR2##_counter[TENSOR2##_i]++; \
+        TENSOR2##_data += TENSOR2##_strides[TENSOR2##_i]; \
+\
+        if(TENSOR2##_counter[TENSOR2##_i]  == TENSOR2##_sizes[TENSOR2##_i]) \
+        { \
+          if(TENSOR2##_i == 0) \
+          { \
+            TH_TENSOR_APPLY_hasFinished = 1; \
+            break; \
+          } \
+            else \
+          { \
+            TENSOR2##_data -= TENSOR2##_counter[TENSOR2##_i]*TENSOR2##_strides[TENSOR2##_i]; \
+            TENSOR2##_counter[TENSOR2##_i] = 0; \
+          } \
+        } \
+        else \
+          break; \
+      } \
+      TENSOR2##_i = 0; \
+    } \
+  } \
+  if (!THTensor_(isContiguous)(TENSOR1)) { /* the counter buffer is only allocated on the non-contiguous path */ \
+    THFree(TENSOR1##_counter); \
+  } \
+  if (!THTensor_(isContiguous)(TENSOR2)) { \
+    THFree(TENSOR2##_counter); \
+  } \
+}
+
 /*
  * The basic strategy for apply is as follows:
  *
diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c
index c27fa00..302923e 100644
--- a/lib/TH/generic/THTensorMath.c
+++ b/lib/TH/generic/THTensorMath.c
@@ -508,6 +508,8 @@ void THTensor_(add)(THTensor *r_, THTensor *t, real value)
       ptrdiff_t i_end = tid == num_threads - 1 ? sz : i + sz / num_threads;
       THVector_(add)(rp+i, tp+i, value, i_end-i); 
     }
+  } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) {
+    TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = *t_data + value;);
   } else {
     TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data + value;);
   }
@@ -538,6 +540,8 @@ void THTensor_(mul)(THTensor *r_, THTensor *t, real value)
       ptrdiff_t i_end = tid == num_threads - 1 ? sz : i + sz / num_threads;
       THVector_(mul)(rp+i, tp+i, value, i_end-i); 
     }
+  } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) {
+    TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = *t_data * value;);
   } else {
     TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;);
   }
@@ -563,6 +567,8 @@ void THTensor_(div)(THTensor *r_, THTensor *t, real value)
       ptrdiff_t i_end = tid == num_threads - 1 ? sz : i + sz / num_threads;
       THVector_(div)(rp+i, tp+i, value, i_end-i); 
     }
+  } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) {
+    TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = *t_data / value;);
   } else {
     TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data / value;);
   }
@@ -654,6 +660,12 @@ void THTensor_(fmod)(THTensor *r_, THTensor *t, real value)
           rp[i] = tp[i] % value;
 #endif
       }
+  } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) {
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
+      TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = fmod(*t_data, value););
+#else
+      TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = (*t_data % value););
+#endif
   } else {
 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
       TH_TENSOR_APPLY2(real, r_, real, t, *r__data = fmod(*t_data, value););
@@ -679,6 +691,13 @@ void THTensor_(remainder)(THTensor *r_, THTensor *t, real value)
           rp[i] = tp[i] - value * (tp[i] / value); // There is no NAN for integers
 #endif
       }
+  } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) {
+#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
+      TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = (value == 0)? NAN : *t_data - value * floor(*t_data / value););
+#else
+       // There is no NAN for integers
+      TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = *t_data - value * (*t_data / value););
+#endif
   } else {
 #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE)
       TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (value == 0)? NAN : *t_data - value * floor(*t_data / value););
@@ -770,6 +789,8 @@ void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value)
     #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
     for (i=0; i<sz; i++)
       rp[i] = (tp[i] < min_value) ? min_value : (tp[i] > max_value ? max_value : tp[i]);
+  } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) {
+    TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data););
   } else {
     TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? max_value : *t_data););
   }
@@ -1100,6 +1121,8 @@ void THTensor_(tpow)(THTensor *r_, real value, THTensor *t)
     #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
     for (i=0; i<sz; i++)
       rp[i] = pow(value, tp[i]);
+  } else if (THTensor_(isContiguous)(r_) || THTensor_(isContiguous)(t)) {
+    TH_TENSOR_APPLY2_CONTIGUOUS(real, r_, real, t, *r__data = pow(value, *t_data););
   } else {
     TH_TENSOR_APPLY2(real, r_, real, t, *r__data = pow(value, *t_data););
   }
@@ -2540,6 +2563,13 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb)
     for (i=0; i<sz; ++i){
       if(tap[i] != tbp[i]) return 0;
     }
+  } else if (THTensor_(isContiguous)(ta) || THTensor_(isContiguous)(tb)) {
+    // Short-circuit the apply function on inequality
+    TH_TENSOR_APPLY2_CONTIGUOUS(real, ta, real, tb,
+                                if (equal && *ta_data != *tb_data) {
+                                   equal = 0;
+                                   TH_TENSOR_APPLY_hasFinished = 1; break;
+                                })
   } else {
     // Short-circuit the apply function on inequality
     TH_TENSOR_APPLY2(real, ta, real, tb,
author	jokeren <robinho364@gmail.com>	2017-01-31 17:44:12 +0300
committer	Soumith Chintala <soumith@gmail.com>	2017-02-23 14:01:13 +0300
commit	e4c263784baf5e5f7d62c4be2156d7c3f900fa26 (patch)
tree	5c6f68acbf315ee53ddf085171e237e0d04d11c0
parent	8a5e7595ae4e3160891c9107d057784e2282b5cc (diff)