Diffstat (limited to 'lib/THC')
45 files changed, 2479 insertions, 1735 deletions
diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt index 3a035de..dfc7ec3 100644 --- a/lib/THC/CMakeLists.txt +++ b/lib/THC/CMakeLists.txt @@ -124,16 +124,21 @@ SET(src-cuda THCTensorConv.cu THCTensorRandom.cu THCTensorScatterGather.cu - THCApply.cu THCTensorSort.cu THCTensorTopK.cu + THCTensorTypeUtils.cu ) +MESSAGE(STATUS "got cuda version " ${CUDA_VERSION}) + IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) + MESSAGE(STATUS "Found CUDA with FP16 support, compiling with torch.CudaHalfTensor") LIST(APPEND src-cuda THCHalf.cu) LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1") SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}") -ENDIF() +ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) + MESSAGE(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor") +ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) CUDA_ADD_LIBRARY(THC SHARED ${src} ${src-cuda}) CUDA_ADD_CUBLAS_TO_TARGET(THC) @@ -176,6 +181,8 @@ INSTALL(FILES THCDeviceTensorUtils-inl.cuh THCGenerateAllTypes.h THCHalf.h + THCTensorInfo.cuh + THCTensorTypeUtils.cuh DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC") INSTALL(FILES @@ -191,4 +198,10 @@ INSTALL(FILES generic/THCTensorCopy.c generic/THCTensorCopy.cu generic/THCTensorCopy.h + generic/THCTensorMath.h + generic/THCTensorMath.cu + generic/THCTensorMathPairwise.h + generic/THCTensorMathPairwise.cu + generic/THCTensorMathPointwise.h + generic/THCTensorMathPointwise.cu DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC/generic") diff --git a/lib/THC/THCApply.cu b/lib/THC/THCApply.cu deleted file mode 100644 index 3ee9d51..0000000 --- a/lib/THC/THCApply.cu +++ /dev/null @@ -1,10 +0,0 @@ -#include "THCApply.cuh" - -// Implementation of copyIgnoringOverlaps, defined after pointwiseApply2. -void THCudaTensor_copyIgnoringOverlaps(THCState* state, - THCudaTensor* dst, - THCudaTensor* src) { - THCudaTensor_pointwiseApply2(state, dst, src, CopyOp<float>(), - ReadOnly, // ignore overwrites - ReadOnly); -} diff --git a/lib/THC/THCApply.cuh b/lib/THC/THCApply.cuh index 707e22f..dd6d32a 100644 --- a/lib/THC/THCApply.cuh +++ b/lib/THC/THCApply.cuh @@ -3,6 +3,7 @@ #include "THCTensorCopy.h" #include "THCReduceApplyUtils.cuh" +#include "THCTensorTypeUtils.cuh" // // This file contains pointwise operation functions and kernels that @@ -12,81 +13,85 @@ // // Threads per block for our apply kernel +// FIXME: use occupancy calculator instead #define THC_APPLY_THREADS_PER_BLOCK 32 * 16 -// Called when we are copying into an overlapping index `dst`, but -// we don't care which writer wins. Hacky but it works. 
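The copy-ignoring-overlaps helper deleted above (THCApply.cu) and whose declaration is removed just below loses its THCudaTensor-only form in this patch; the behaviour presumably survives through the templated apply path plus the CopyOp functor that reappears in THCTensorCopy.cu later in this diff. A per-type sketch of what that replacement looks like, under those assumptions rather than as a literal hunk from the patch:

    // Sketch: copy into a dst with overlapping indices, letting any writer win.
    // THC_pointwiseApply2 and CopyOp<TypeDst, TypeSrc> are the templated
    // versions introduced by this patch.
    void THCudaTensor_copyIgnoringOverlaps(THCState* state,
                                           THCudaTensor* dst,
                                           THCudaTensor* src) {
      // Passing dst as ReadOnly skips the overlap check (and the contiguous
      // temporary it would otherwise force), matching the old behaviour.
      THC_pointwiseApply2(state, dst, src, CopyOp<float, float>(),
                          ReadOnly, ReadOnly);
    }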
-THC_API void THCudaTensor_copyIgnoringOverlaps(THCState* state, - THCudaTensor* dst, - THCudaTensor* src); - -template <typename Op, typename IndexType, int ADims> +template <typename Op, + typename Ta, + typename IndexType, + int ADims> #if __CUDA_ARCH__ >= 350 __launch_bounds__(32 * 16, 4) #endif __global__ void -THCudaTensor_pointwiseApply1(TensorInfo<IndexType> a, - IndexType totalElements, - Op op) { +kernelPointwiseApply1(TensorInfo<Ta, IndexType> a, + IndexType totalElements, + Op op) { for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < totalElements; linearIndex += gridDim.x * blockDim.x) { // Convert `linearIndex` into an offset of `a` const IndexType aOffset = - IndexToOffset<IndexType, ADims>::get(linearIndex, a); + IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a); op(&a.data[aOffset]); } } -template <typename Op, typename IndexType, int ADims, int BDims> +template <typename Op, + typename Ta, typename Tb, + typename IndexType, + int ADims, int BDims> #if __CUDA_ARCH__ >= 350 __launch_bounds__(32 * 16, 4) #endif __global__ void -THCudaTensor_pointwiseApply2(TensorInfo<IndexType> a, - TensorInfo<IndexType> b, - IndexType totalElements, - Op op) { +kernelPointwiseApply2(TensorInfo<Ta, IndexType> a, + TensorInfo<Tb, IndexType> b, + IndexType totalElements, + Op op) { for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < totalElements; linearIndex += gridDim.x * blockDim.x) { // Convert `linearIndex` into an offset of `a` const IndexType aOffset = - IndexToOffset<IndexType, ADims>::get(linearIndex, a); + IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a); // Convert `linearIndex` into an offset of `b` const IndexType bOffset = - IndexToOffset<IndexType, BDims>::get(linearIndex, b); + IndexToOffset<Tb, IndexType, BDims>::get(linearIndex, b); op(&a.data[aOffset], &b.data[bOffset]); } } -template <typename Op, typename IndexType, int ADims, int BDims, int CDims> +template <typename Op, + typename Ta, typename Tb, typename Tc, + typename IndexType, + int ADims, int BDims, int CDims> #if __CUDA_ARCH__ >= 350 __launch_bounds__(32 * 16, 4) #endif __global__ void -THCudaTensor_pointwiseApply3(TensorInfo<IndexType> a, - TensorInfo<IndexType> b, - TensorInfo<IndexType> c, - IndexType totalElements, - Op op) { +kernelPointwiseApply3(TensorInfo<Ta, IndexType> a, + TensorInfo<Tb, IndexType> b, + TensorInfo<Tc, IndexType> c, + IndexType totalElements, + Op op) { for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < totalElements; linearIndex += gridDim.x * blockDim.x) { // Convert `linearIndex` into an offset of `a` const IndexType aOffset = - IndexToOffset<IndexType, ADims>::get(linearIndex, a); + IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a); // Convert `linearIndex` into an offset of `b` const IndexType bOffset = - IndexToOffset<IndexType, BDims>::get(linearIndex, b); + IndexToOffset<Tb, IndexType, BDims>::get(linearIndex, b); // Convert `linearIndex` into an offset of `c` const IndexType cOffset = - IndexToOffset<IndexType, CDims>::get(linearIndex, c); + IndexToOffset<Tc, IndexType, CDims>::get(linearIndex, c); op(&a.data[aOffset], &b.data[bOffset], &c.data[cOffset]); } @@ -116,18 +121,17 @@ inline bool getApplyGrid(THCState* state, long totalElements, dim3& grid) { return true; } -template <typename Op> -bool THCudaTensor_pointwiseApply1(THCState* state, - THCudaTensor* a, - const Op& op, - TensorArgType aType = ReadWrite) { - long totalElements = THCudaTensor_nElement(state, a); 
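A worked example of the linearIndex-to-offset translation these kernels perform through IndexToOffset (sizes/strides here are illustrative; the IndexToOffset definition is deleted from THCReduceApplyUtils.cuh further down in this diff, and its new home is presumably the added THCTensorInfo.cuh, which is not shown in this section):

    // Non-contiguous float tensor: sizes = {3, 4}, strides = {8, 2}
    // linearIndex = 7:
    //   dim 1: 7 % 4 = 3  ->  offset += 3 * 2 = 6,  linearIndex /= 4 -> 1
    //   dim 0: 1 % 3 = 1  ->  offset += 1 * 8 = 8,  total offset = 14
    // For a contiguous tensor (strides = {4, 1}) the walk returns 7 == linearIndex,
    // which is why the fully contiguous (-2) specialization skips it entirely.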
- - if (THCudaTensor_nDimension(state, a) > MAX_CUTORCH_DIMS) { +template <typename TensorTypeA, + typename Op> +bool THC_pointwiseApply1(THCState* state, + TensorTypeA* a, + const Op& op, + TensorArgType aType = ReadWrite) { + if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS) { return false; } - if (THCudaTensor_nDimension(state, a) == 0) { + if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) { // Zero-dim tensor; do nothing return true; } @@ -135,6 +139,8 @@ bool THCudaTensor_pointwiseApply1(THCState* state, const dim3 block = getApplyBlock(); dim3 grid; + long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a); + if (!getApplyGrid(state, totalElements, grid)) { return false; } @@ -148,12 +154,13 @@ bool THCudaTensor_pointwiseApply1(THCState* state, // indices of a tensor with overlapping indices should probably be // an error, since it is unclear which one should win), but we will // preserve this last-writer-wins (in arbitrary copy order) behavior. - THCudaTensor* oldA = NULL; + TensorTypeA* oldA = NULL; - if (aType == ReadWrite && THC_overlappingIndices(state, a)) { + if (aType == ReadWrite && + TensorUtils<TensorTypeA>::overlappingIndices(state, a)) { // Must perform in contiguous space oldA = a; - a = THCudaTensor_newContiguous(state, a); + a = TensorUtils<TensorTypeA>::newContiguous(state, a); } // It is possible that the tensor dimensions are able to be collapsed, @@ -164,55 +171,60 @@ bool THCudaTensor_pointwiseApply1(THCState* state, // (or vice versa), the contiguous tensor can be collapsed to one // dimension, and the loop to translate the linear index to the array // index can be similarly collapsed. That is what this unrolling is for. -#define HANDLE_CASE(TYPE, A) \ - THCudaTensor_pointwiseApply1<Op, TYPE, A> \ - <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ +#define HANDLE_CASE(TYPE, A) \ + kernelPointwiseApply1<Op, \ + typename TensorUtils<TensorTypeA>::DataType, \ + TYPE, A> \ + <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ aInfo, (TYPE) totalElements, op); -#define HANDLE_A_CASE(TYPE, A) \ - { \ - if (aInfo.isContiguous()) { \ - HANDLE_CASE(TYPE, -2); \ - } else { \ - switch (A) { \ - case 1: \ - HANDLE_CASE(TYPE, 1); \ - break; \ - case 2: \ - HANDLE_CASE(TYPE, 2); \ - break; \ - case 3: \ - HANDLE_CASE(TYPE, 3); \ - break; \ - default: \ - HANDLE_CASE(TYPE, -1); \ - break; \ - } \ - } \ +#define HANDLE_A_CASE(TYPE, A) \ + { \ + if (aInfo.isContiguous()) { \ + HANDLE_CASE(TYPE, -2); \ + } else { \ + switch (A) { \ + case 1: \ + HANDLE_CASE(TYPE, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, -1); \ + break; \ + } \ + } \ } // Can we use 32-bit integer math in the kernel (the linear ID for the copy // and the resulting non-linear offset is all computable using 32-bit math?) // We also use unsigned index math in the kernel, as signed div/mod has // additional overhead. 
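Concretely, "can use 32-bit index math" means the element count and the largest reachable storage offset both fit in an unsigned int; this mirrors the THC_canUse32BitIndexMath body that this patch deletes from THCReduceApplyUtils.cu further down (sketch only, not part of the patch):

    // Requires <climits> for UINT_MAX.
    bool canUse32Bit(long elements, const long* sizes, const long* strides, int dims) {
      if (elements >= UINT_MAX) return false;
      long offset = 0, linearId = elements - 1;          // worst-case element
      for (int i = dims - 1; i >= 0; --i) {
        offset += (linearId % sizes[i]) * strides[i];
        linearId /= sizes[i];
      }
      return offset < UINT_MAX;                          // fits in unsigned int
    }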
- if (THC_canUse32BitIndexMath(state, a)) { - TensorInfo<unsigned int> aInfo(state, a); + if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a)) { + TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo = + getTensorInfo<TensorTypeA, unsigned int>(state, a); aInfo.collapseDims(); HANDLE_A_CASE(unsigned int, aInfo.dims); } else { - TensorInfo<unsigned long> aInfo(state, a); + TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned long> aInfo = + getTensorInfo<TensorTypeA, unsigned long>(state, a); aInfo.collapseDims(); // For large tensors, we only compile the completely contiguous // version and the completely generic version, to reduce // compilation time. if (aInfo.isContiguous()) { - THCudaTensor_pointwiseApply1<Op, unsigned long, -2> + kernelPointwiseApply1<Op, + typename TensorUtils<TensorTypeA>::DataType, + unsigned long, -2> <<<grid, block, 0, THCState_getCurrentStream(state)>>>( aInfo, (unsigned long) totalElements, op); } else { - THCudaTensor_pointwiseApply1<Op, unsigned long, -1> + kernelPointwiseApply1<Op, + typename TensorUtils<TensorTypeA>::DataType, + unsigned long, -1> <<<grid, block, 0, THCState_getCurrentStream(state)>>>( aInfo, (unsigned long) totalElements, op); } @@ -221,36 +233,38 @@ bool THCudaTensor_pointwiseApply1(THCState* state, #undef HANDLE_A_CASE if (oldA) { - // Ignore overlaps when copying back; if we use THCudaTensor_copy + // Ignore overlaps when copying back; if we use THCTensor_copy // instead, it will recursively try and invoke ourselves to make // oldA contiguous. - THCudaTensor_copyIgnoringOverlaps(state, oldA, a); - THCudaTensor_free(state, a); + TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a); + TensorUtils<TensorTypeA>::free(state, a); a = oldA; } return true; } -template <typename Op> -bool THCudaTensor_pointwiseApply2(THCState* state, - THCudaTensor* a, - THCudaTensor* b, - const Op& op, - TensorArgType aType = ReadWrite, - TensorArgType bType = ReadOnly) { - long totalElements = THCudaTensor_nElement(state, a); - - if (totalElements != THCudaTensor_nElement(state, b)) { +template <typename TensorTypeA, + typename TensorTypeB, + typename Op> +bool THC_pointwiseApply2(THCState* state, + TensorTypeA* a, + TensorTypeB* b, + const Op& op, + TensorArgType aType = ReadWrite, + TensorArgType bType = ReadOnly) { + long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a); + + if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b)) { return false; } - if (THCudaTensor_nDimension(state, a) > MAX_CUTORCH_DIMS || - THCudaTensor_nDimension(state, b) > MAX_CUTORCH_DIMS) { + if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS || + TensorUtils<TensorTypeB>::getDims(state, b) > MAX_CUTORCH_DIMS) { return false; } - if (THCudaTensor_nDimension(state, a) == 0) { + if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) { // Zero-dim tensor; do nothing return true; } @@ -271,18 +285,20 @@ bool THCudaTensor_pointwiseApply2(THCState* state, // indices of a tensor with overlapping indices should probably be // an error, since it is unclear which one should win), but we will // preserve this last-writer-wins (in arbitrary copy order) behavior. 
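For concreteness, the kind of tensor the ReadWrite overlap check guards against (values are illustrative; the detection rule comes from the THC_overlappingIndices implementation removed later in this diff):

    // sizes = {2, 2}, strides = {1, 1}
    // element (i, j) maps to storage offset i + j, so (0, 1) and (1, 0) both
    // address offset 1. Sorted-by-stride test: stride[outer] < size[inner] *
    // stride[inner], i.e. 1 < 2 * 1, hence overlapping. Such a ReadWrite
    // argument is snapshotted into a contiguous temporary before the kernel
    // runs, then copied back with copyIgnoringOverlaps.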
- THCudaTensor* oldA = NULL; - THCudaTensor* oldB = NULL; + TensorTypeA* oldA = NULL; + TensorTypeB* oldB = NULL; - if (aType == ReadWrite && THC_overlappingIndices(state, a)) { + if (aType == ReadWrite && + TensorUtils<TensorTypeA>::overlappingIndices(state, a)) { // Must perform in contiguous space oldA = a; - a = THCudaTensor_newContiguous(state, a); + a = TensorUtils<TensorTypeA>::newContiguous(state, a); } - if (bType == ReadWrite && THC_overlappingIndices(state, b)) { + if (bType == ReadWrite && + TensorUtils<TensorTypeB>::overlappingIndices(state, b)) { // Must perform in contiguous space oldB = b; - b = THCudaTensor_newContiguous(state, b); + b = TensorUtils<TensorTypeB>::newContiguous(state, b); } // It is possible that the tensor dimensions are able to be collapsed, @@ -293,80 +309,87 @@ bool THCudaTensor_pointwiseApply2(THCState* state, // (or vice versa), the contiguous tensor can be collapsed to one // dimension, and the loop to translate the linear index to the array // index can be similarly collapsed. That is what this unrolling is for. -#define HANDLE_CASE(TYPE, A, B) \ - THCudaTensor_pointwiseApply2<Op, TYPE, A, B> \ - <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ +#define HANDLE_CASE(TYPE, A, B) \ + kernelPointwiseApply2<Op, \ + typename TensorUtils<TensorTypeA>::DataType, \ + typename TensorUtils<TensorTypeB>::DataType, \ + TYPE, A, B> \ + <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ aInfo, bInfo, (TYPE) totalElements, op); -#define HANDLE_B_CASE(TYPE, A, B) \ - { \ - if (bInfo.isContiguous()) { \ - HANDLE_CASE(TYPE, A, -2); \ - } else { \ - switch (B) { \ - case 1: \ - HANDLE_CASE(TYPE, A, 1); \ - break; \ - case 2: \ - HANDLE_CASE(TYPE, A, 2); \ - break; \ - case 3: \ - HANDLE_CASE(TYPE, A, 3); \ - break; \ - default: \ - HANDLE_CASE(TYPE, A, -1); \ - break; \ - } \ - } \ - } - -#define HANDLE_A_CASE(TYPE, A, B) \ - { \ - if (aInfo.isContiguous()) { \ - HANDLE_B_CASE(TYPE, -2, B); \ - } else { \ - switch (A) { \ - case 1: \ - HANDLE_B_CASE(TYPE, 1, B); \ - break; \ - case 2: \ - HANDLE_B_CASE(TYPE, 2, B); \ - break; \ - case 3: \ - HANDLE_B_CASE(TYPE, 3, B); \ - break; \ - default: \ - HANDLE_B_CASE(TYPE, -1, B); \ - break; \ - } \ - } \ - } - - if (THC_canUse32BitIndexMath(state, a) && - THC_canUse32BitIndexMath(state, b)) { - TensorInfo<unsigned int> aInfo(state, a); +#define HANDLE_B_CASE(TYPE, A, B) \ + { \ + if (bInfo.isContiguous()) { \ + HANDLE_CASE(TYPE, A, -2); \ + } else { \ + switch (B) { \ + case 1: \ + HANDLE_CASE(TYPE, A, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, -1); \ + break; \ + } \ + } \ + } + +#define HANDLE_A_CASE(TYPE, A, B) \ + { \ + if (aInfo.isContiguous()) { \ + HANDLE_B_CASE(TYPE, -2, B); \ + } else { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B); \ + break; \ + } \ + } \ + } + + if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a) && + TensorUtils<TensorTypeB>::canUse32BitIndexMath(state, b)) { + TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo = + getTensorInfo<TensorTypeA, unsigned int>(state, a); aInfo.collapseDims(); - TensorInfo<unsigned int> bInfo(state, b); + TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned int> bInfo = + getTensorInfo<TensorTypeB, unsigned int>(state, b); bInfo.collapseDims(); HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims); } else { - TensorInfo<unsigned 
long> aInfo(state, a); + TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned long> aInfo = + getTensorInfo<TensorTypeA, unsigned long>(state, a); aInfo.collapseDims(); - TensorInfo<unsigned long> bInfo(state, b); + TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned long> bInfo = + getTensorInfo<TensorTypeB, unsigned long>(state, b); bInfo.collapseDims(); // For large tensors, we only compile the completely contiguous // version and the completely generic version, to reduce // compilation time. if (aInfo.isContiguous() && bInfo.isContiguous()) { - THCudaTensor_pointwiseApply2<Op, unsigned long, -2, -2> + kernelPointwiseApply2<Op, + typename TensorUtils<TensorTypeA>::DataType, + typename TensorUtils<TensorTypeB>::DataType, + unsigned long, -2, -2> <<<grid, block, 0, THCState_getCurrentStream(state)>>>( aInfo, bInfo, (unsigned long) totalElements, op); } else { - THCudaTensor_pointwiseApply2<Op, unsigned long, -1, -1> + kernelPointwiseApply2<Op, + typename TensorUtils<TensorTypeA>::DataType, + typename TensorUtils<TensorTypeB>::DataType, + unsigned long, -1, -1> <<<grid, block, 0, THCState_getCurrentStream(state)>>>( aInfo, bInfo, (unsigned long) totalElements, op); } @@ -376,49 +399,52 @@ bool THCudaTensor_pointwiseApply2(THCState* state, #undef HANDLE_A_CASE if (oldA) { - // Ignore overlaps when copying back; if we use THCudaTensor_copy + // Ignore overlaps when copying back; if we use THCTensor_copy // instead, it will recursively try and invoke ourselves to make // oldA contiguous. - THCudaTensor_copyIgnoringOverlaps(state, oldA, a); - THCudaTensor_free(state, a); + TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a); + TensorUtils<TensorTypeA>::free(state, a); a = oldA; } if (oldB) { - // Ignore overlaps when copying back; if we use THCudaTensor_copy + // Ignore overlaps when copying back; if we use THCTensor_copy // instead, it will recursively try and invoke ourselves to make // oldB contiguous. 
- THCudaTensor_copyIgnoringOverlaps(state, oldB, b); - THCudaTensor_free(state, b); + TensorUtils<TensorTypeB>::copyIgnoringOverlaps(state, oldB, b); + TensorUtils<TensorTypeB>::free(state, b); b = oldB; } return true; } -template <typename Op> -bool THCudaTensor_pointwiseApply3(THCState* state, - THCudaTensor* a, - THCudaTensor* b, - THCudaTensor* c, - const Op& op, - TensorArgType aType = ReadWrite, - TensorArgType bType = ReadOnly, - TensorArgType cType = ReadOnly) { - long totalElements = THCudaTensor_nElement(state, a); - - if (totalElements != THCudaTensor_nElement(state, b) || - totalElements != THCudaTensor_nElement(state, c)) { +template <typename TensorTypeA, + typename TensorTypeB, + typename TensorTypeC, + typename Op> +bool THC_pointwiseApply3(THCState* state, + TensorTypeA* a, + TensorTypeB* b, + TensorTypeC* c, + const Op& op, + TensorArgType aType = ReadWrite, + TensorArgType bType = ReadOnly, + TensorArgType cType = ReadOnly) { + long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a); + + if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b) || + totalElements != TensorUtils<TensorTypeC>::getNumElements(state, c)) { return false; } - if (THCudaTensor_nDimension(state, a) > MAX_CUTORCH_DIMS || - THCudaTensor_nDimension(state, b) > MAX_CUTORCH_DIMS || - THCudaTensor_nDimension(state, c) > MAX_CUTORCH_DIMS) { + if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS || + TensorUtils<TensorTypeB>::getDims(state, b) > MAX_CUTORCH_DIMS || + TensorUtils<TensorTypeC>::getDims(state, c) > MAX_CUTORCH_DIMS) { return false; } - if (THCudaTensor_nDimension(state, a) == 0) { + if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) { // Zero-dim tensor; do nothing return true; } @@ -439,131 +465,141 @@ bool THCudaTensor_pointwiseApply3(THCState* state, // indices of a tensor with overlapping indices should probably be // an error, since it is unclear which one should win), but we will // preserve this last-writer-wins (in arbitrary copy order) behavior. 
- THCudaTensor* oldA = NULL; - THCudaTensor* oldB = NULL; - THCudaTensor* oldC = NULL; + TensorTypeA* oldA = NULL; + TensorTypeB* oldB = NULL; + TensorTypeC* oldC = NULL; - if (aType == ReadWrite && THC_overlappingIndices(state, a)) { + if (aType == ReadWrite && + TensorUtils<TensorTypeA>::overlappingIndices(state, a)) { // Must perform in contiguous space oldA = a; - a = THCudaTensor_newContiguous(state, a); + a = TensorUtils<TensorTypeA>::newContiguous(state, a); } - - if (bType == ReadWrite && THC_overlappingIndices(state, b)) { + if (bType == ReadWrite && + TensorUtils<TensorTypeB>::overlappingIndices(state, b)) { // Must perform in contiguous space oldB = b; - b = THCudaTensor_newContiguous(state, b); + b = TensorUtils<TensorTypeB>::newContiguous(state, b); } - - if (cType == ReadWrite && THC_overlappingIndices(state, c)) { + if (cType == ReadWrite && + TensorUtils<TensorTypeC>::overlappingIndices(state, c)) { // Must perform in contiguous space oldC = c; - c = THCudaTensor_newContiguous(state, c); + c = TensorUtils<TensorTypeC>::newContiguous(state, c); } #define HANDLE_CASE(TYPE, A, B, C) \ - THCudaTensor_pointwiseApply3<Op, TYPE, A, B, C> \ + kernelPointwiseApply3<Op, \ + typename TensorUtils<TensorTypeA>::DataType, \ + typename TensorUtils<TensorTypeB>::DataType, \ + typename TensorUtils<TensorTypeC>::DataType, \ + TYPE, A, B, C> \ <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ aInfo, bInfo, cInfo, (TYPE) totalElements, op); -#define HANDLE_C_CASE(TYPE, A, B, C) \ - { \ - if (cInfo.isContiguous()) { \ - HANDLE_CASE(TYPE, A, B, -2); \ - } else { \ - switch (C) { \ - case 1: \ - HANDLE_CASE(TYPE, A, B, 1); \ - break; \ - case 2: \ - HANDLE_CASE(TYPE, A, B, 2); \ - break; \ - case 3: \ - HANDLE_CASE(TYPE, A, B, 3); \ - break; \ - default: \ - HANDLE_CASE(TYPE, A, B, -1); \ - break; \ - } \ - } \ - } - -#define HANDLE_B_CASE(TYPE, A, B, C) \ - { \ - if (bInfo.isContiguous()) { \ - HANDLE_C_CASE(TYPE, A, -2, C); \ - } else { \ - switch (B) { \ - case 1: \ - HANDLE_C_CASE(TYPE, A, 1, C); \ - break; \ - case 2: \ - HANDLE_C_CASE(TYPE, A, 2, C); \ - break; \ - case 3: \ - HANDLE_C_CASE(TYPE, A, 3, C); \ - break; \ - default: \ - HANDLE_C_CASE(TYPE, A, -1, C); \ - break; \ - } \ - } \ - } - -#define HANDLE_A_CASE(TYPE, A, B, C) \ - { \ - if (aInfo.isContiguous()) { \ - HANDLE_B_CASE(TYPE, -2, B, C); \ - } else { \ - switch (A) { \ - case 1: \ - HANDLE_B_CASE(TYPE, 1, B, C); \ - break; \ - case 2: \ - HANDLE_B_CASE(TYPE, 2, B, C); \ - break; \ - case 3: \ - HANDLE_B_CASE(TYPE, 3, B, C); \ - break; \ - default: \ - HANDLE_B_CASE(TYPE, -1, B, C); \ - break; \ - } \ - } \ - } - - if (THC_canUse32BitIndexMath(state, a) && - THC_canUse32BitIndexMath(state, b) && - THC_canUse32BitIndexMath(state, c)) { - TensorInfo<unsigned int> aInfo(state, a); +#define HANDLE_C_CASE(TYPE, A, B, C) \ + { \ + if (cInfo.isContiguous()) { \ + HANDLE_CASE(TYPE, A, B, -2); \ + } else { \ + switch (C) { \ + case 1: \ + HANDLE_CASE(TYPE, A, B, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, B, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, B, -1); \ + break; \ + } \ + } \ + } + +#define HANDLE_B_CASE(TYPE, A, B, C) \ + { \ + if (bInfo.isContiguous()) { \ + HANDLE_C_CASE(TYPE, A, -2, C); \ + } else { \ + switch (B) { \ + case 1: \ + HANDLE_C_CASE(TYPE, A, 1, C); \ + break; \ + case 2: \ + HANDLE_C_CASE(TYPE, A, 2, C); \ + break; \ + default: \ + HANDLE_C_CASE(TYPE, A, -1, C); \ + break; \ + } \ + } \ + } + +#define HANDLE_A_CASE(TYPE, A, B, C) \ + { \ + if (aInfo.isContiguous()) { \ + 
HANDLE_B_CASE(TYPE, -2, B, C); \ + } else { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B, C); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B, C); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B, C); \ + break; \ + } \ + } \ + } + + if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a) && + TensorUtils<TensorTypeB>::canUse32BitIndexMath(state, b) && + TensorUtils<TensorTypeC>::canUse32BitIndexMath(state, c)) { + TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo = + getTensorInfo<TensorTypeA, unsigned int>(state, a); aInfo.collapseDims(); - TensorInfo<unsigned int> bInfo(state, b); + TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned int> bInfo = + getTensorInfo<TensorTypeB, unsigned int>(state, b); bInfo.collapseDims(); - TensorInfo<unsigned int> cInfo(state, c); + TensorInfo<typename TensorUtils<TensorTypeC>::DataType, unsigned int> cInfo = + getTensorInfo<TensorTypeC, unsigned int>(state, c); cInfo.collapseDims(); HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims); } else { - TensorInfo<unsigned long> aInfo(state, a); + TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned long> aInfo = + getTensorInfo<TensorTypeA, unsigned long>(state, a); aInfo.collapseDims(); - TensorInfo<unsigned long> bInfo(state, b); + TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned long> bInfo = + getTensorInfo<TensorTypeB, unsigned long>(state, b); bInfo.collapseDims(); - TensorInfo<unsigned long> cInfo(state, c); + TensorInfo<typename TensorUtils<TensorTypeC>::DataType, unsigned long> cInfo = + getTensorInfo<TensorTypeC, unsigned long>(state, c); cInfo.collapseDims(); // For large tensors, we only compile the completely contiguous // version and the completely generic version, to reduce // compilation time. if (aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous()) { - THCudaTensor_pointwiseApply3<Op, unsigned long, -2, -2, -2> + kernelPointwiseApply3<Op, + typename TensorUtils<TensorTypeA>::DataType, + typename TensorUtils<TensorTypeB>::DataType, + typename TensorUtils<TensorTypeC>::DataType, + unsigned long, -2, -2, -2> <<<grid, block, 0, THCState_getCurrentStream(state)>>>( aInfo, bInfo, cInfo, (unsigned long) totalElements, op); } else { - THCudaTensor_pointwiseApply3<Op, unsigned long, -1, -1, -1> + kernelPointwiseApply3<Op, + typename TensorUtils<TensorTypeA>::DataType, + typename TensorUtils<TensorTypeB>::DataType, + typename TensorUtils<TensorTypeC>::DataType, + unsigned long, -1, -1, -1> <<<grid, block, 0, THCState_getCurrentStream(state)>>>( aInfo, bInfo, cInfo, (unsigned long) totalElements, op); } @@ -574,29 +610,29 @@ bool THCudaTensor_pointwiseApply3(THCState* state, #undef HANDLE_A_CASE if (oldA) { - // Ignore overlaps when copying back; if we use THCudaTensor_copy + // Ignore overlaps when copying back; if we use THCTensor_copy // instead, it will recursively try and invoke ourselves to make // oldA contiguous. - THCudaTensor_copyIgnoringOverlaps(state, oldA, a); - THCudaTensor_free(state, a); + TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a); + TensorUtils<TensorTypeA>::free(state, a); a = oldA; } if (oldB) { - // Ignore overlaps when copying back; if we use THCudaTensor_copy + // Ignore overlaps when copying back; if we use THCTensor_copy // instead, it will recursively try and invoke ourselves to make // oldB contiguous. 
- THCudaTensor_copyIgnoringOverlaps(state, oldB, b); - THCudaTensor_free(state, b); + TensorUtils<TensorTypeB>::copyIgnoringOverlaps(state, oldB, b); + TensorUtils<TensorTypeB>::free(state, b); b = oldB; } if (oldC) { - // Ignore overlaps when copying back; if we use THCudaTensor_copy + // Ignore overlaps when copying back; if we use THCTensor_copy // instead, it will recursively try and invoke ourselves to make // oldC contiguous. - THCudaTensor_copyIgnoringOverlaps(state, oldC, c); - THCudaTensor_free(state, c); + TensorUtils<TensorTypeC>::copyIgnoringOverlaps(state, oldC, c); + TensorUtils<TensorTypeC>::free(state, c); c = oldC; } diff --git a/lib/THC/THCGenerateAllTypes.h b/lib/THC/THCGenerateAllTypes.h index 044cbdf..28037bd 100644 --- a/lib/THC/THCGenerateAllTypes.h +++ b/lib/THC/THCGenerateAllTypes.h @@ -2,6 +2,8 @@ #error "You must define THC_GENERIC_FILE before including THGenerateAllTypes.h" #endif +#include "THCHalf.h" + #define THCTypeIdxByte 1 #define THCTypeIdxChar 2 #define THCTypeIdxShort 3 @@ -12,10 +14,6 @@ #define THCTypeIdxHalf 8 #define THCTypeIdx_(T) TH_CONCAT_2(THCTypeIdx,T) -#define hostreal real -#define hostrealToReal(x) (x) -#define realToHostreal(x) (x) - #define real unsigned char #define accreal long #define Real Byte @@ -108,14 +106,7 @@ #undef CReal #undef THC_REAL_IS_DOUBLE -#if CUDA_VERSION >= 7050 - -#undef hostreal -#undef hostrealToReal -#undef realToHostreal -#define hostreal float -#define hostrealToReal(x) THC_float2half(x); -#define realToHostreal(x) THC_half2float(x); +#ifdef CUDA_HALF_TENSOR #define real half #define accreal half @@ -130,11 +121,7 @@ #undef CReal #undef THC_REAL_IS_HALF -#endif // CUDA_VERSION >= 7050 - -#undef hostreal -#undef hostrealToReal -#undef realToHostreal +#endif // CUDA_HALF_TENSOR #undef THCTypeIdxByte #undef THCTypeIdxChar diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu index e368d2e..7847ef6 100644 --- a/lib/THC/THCHalf.cu +++ b/lib/THC/THCHalf.cu @@ -32,8 +32,25 @@ void THCHalf2Float(THCState *state, float *out, half *in, long len) { float THC_half2float(half a) { - THError("half2float not implemented yet"); - return 0; + unsigned int bits = a.x & 0x7fff; + unsigned int sign = a.x & 0x8000; + unsigned int exp = a.x & 0x7c00; + + bits <<= 13; + sign <<= 16; + + bits += 0x38000000U; + + // flush denormals to 0 + bits = (exp == 0 ? 
0 : bits) | sign; + + union { + float f; + unsigned int v; + } conv; + conv.v = bits; + + return conv.f; } /* diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index b20b05c..d87647b 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -3,7 +3,17 @@ #include "THCGeneral.h" +// We compile with CudaHalfTensor support if we have this: #if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 +#define CUDA_HALF_TENSOR 1 +#endif + +// Native fp16 ALU instructions are available if we have this: +#if defined(CUDA_HALF_TENSOR) && (__CUDA_ARCH__ >= 530) +#define CUDA_HALF_INSTRUCTIONS 1 +#endif + +#ifdef CUDA_HALF_TENSOR #include <cuda_fp16.h> #include <stdint.h> @@ -13,6 +23,6 @@ THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len); THC_EXTERNC half THC_float2half(float a); THC_EXTERNC float THC_half2float(half a); -#endif +#endif // CUDA_HALF_TENSOR #endif diff --git a/lib/THC/THCReduce.cuh b/lib/THC/THCReduce.cuh index 1060e89..89f599f 100644 --- a/lib/THC/THCReduce.cuh +++ b/lib/THC/THCReduce.cuh @@ -20,17 +20,21 @@ __device__ __forceinline__ IndexType getReduceNoncontigDimSliceIndex() { } // Kernel that handles an entire reduction of a slice of a tensor per each thread -template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims, int BDims> +template <typename ModifyOp, + typename ReduceOp, + typename T, + typename IndexType, + int ADims, int BDims> #if __CUDA_ARCH__ >= 350 __launch_bounds__(32 * 16, 4) #endif __global__ void -THCudaTensor_reduceNoncontigDim(TensorInfo<IndexType> out, - TensorInfo<IndexType> in, +THCudaTensor_reduceNoncontigDim(TensorInfo<T, IndexType> out, + TensorInfo<T, IndexType> in, IndexType reductionStride, IndexType reductionSize, IndexType totalSlices, - float init, + T init, ModifyOp modifyOp, ReduceOp reduceOp) { const IndexType sliceIndex = getReduceNoncontigDimSliceIndex<IndexType>(); @@ -42,13 +46,13 @@ THCudaTensor_reduceNoncontigDim(TensorInfo<IndexType> out, // Each thread picks a point in `out` and `in` for which it is // producing the reduction const IndexType outOffset = - IndexToOffset<IndexType, ADims>::get(sliceIndex, out); + IndexToOffset<T, IndexType, ADims>::get(sliceIndex, out); const IndexType inBaseOffset = - IndexToOffset<IndexType, BDims>::get(sliceIndex, in); + IndexToOffset<T, IndexType, BDims>::get(sliceIndex, in); // For each point in reductionSize, reduce into `r` IndexType inOffset = inBaseOffset; - float r = init; + T r = init; for (IndexType i = 0; i < reductionSize; ++i) { r = reduceOp(r, modifyOp(in.data[inOffset])); @@ -67,13 +71,17 @@ __device__ __forceinline__ IndexType getReduceContigDimSliceIndex() { // Kernel that handles an entire reduction of a slice of a tensor per // each block -template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims, int BDims> +template <typename ModifyOp, + typename ReduceOp, + typename T, + typename IndexType, + int ADims, int BDims> __global__ void -THCudaTensor_reduceContigDim(TensorInfo<IndexType> out, - TensorInfo<IndexType> in, +THCudaTensor_reduceContigDim(TensorInfo<T, IndexType> out, + TensorInfo<T, IndexType> in, IndexType reductionSize, IndexType totalSlices, - float init, + T init, ModifyOp modifyOp, ReduceOp reduceOp) { const IndexType sliceIndex = getReduceContigDimSliceIndex<IndexType>(); @@ -84,23 +92,23 @@ THCudaTensor_reduceContigDim(TensorInfo<IndexType> out, // Get the offset in `out` for the reduction const IndexType outOffset = - IndexToOffset<IndexType, ADims>::get(sliceIndex, out); + IndexToOffset<T, IndexType, 
ADims>::get(sliceIndex, out); // Get the base offset in `in` for this block's reduction const IndexType inBaseOffset = - IndexToOffset<IndexType, BDims>::get(sliceIndex, in); + IndexToOffset<T, IndexType, BDims>::get(sliceIndex, in); // Each thread in the block will reduce some subset of elements in // the slice. The elements are guaranteed contiguous starting at // `inBaseOffset`. - float r = init; + T r = init; for (IndexType i = threadIdx.x; i < reductionSize; i += blockDim.x) { r = reduceOp(r, modifyOp(in.data[inBaseOffset + i])); } // Reduce within the block - extern __shared__ float smem[]; - r = reduceBlock<float, ReduceOp>(smem, blockDim.x, r, reduceOp, init); + extern __shared__ T smem[]; + r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init); if (threadIdx.x == 0) { // Write out reduced value @@ -208,86 +216,92 @@ bool THCudaTensor_reduceDim(THCState* state, // (or vice versa), the contiguous tensor can be collapsed to one // dimension, and the loop to translate the linear index to the array // index can be similarly collapsed. That is what this unrolling is for. -#define HANDLE_CASE(TYPE, OUT, IN) \ +#define HANDLE_CASE(T, TYPE, OUT, IN) \ if (contigReduction) { \ - THCudaTensor_reduceContigDim<ModifyOp, ReduceOp, TYPE, OUT, IN> \ + THCudaTensor_reduceContigDim<ModifyOp, ReduceOp, T, TYPE, OUT, IN> \ <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>( \ outInfo, inInfo, reductionSize, \ (TYPE) outElements, init, modifyOp, reduceOp); \ } else { \ - THCudaTensor_reduceNoncontigDim<ModifyOp, ReduceOp, TYPE, OUT, IN> \ + THCudaTensor_reduceNoncontigDim<ModifyOp, ReduceOp, T, TYPE, OUT, IN> \ <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ outInfo, inInfo, reductionStride, reductionSize, \ (TYPE) outElements, init, modifyOp, reduceOp); \ } \ -#define HANDLE_IN_CASE(TYPE, OUT, IN) \ - { \ - if (inInfo.isContiguous()) { \ - HANDLE_CASE(TYPE, OUT, -2); \ - } else { \ - switch (IN) { \ - case 1: \ - HANDLE_CASE(TYPE, OUT, 1); \ - break; \ - case 2: \ - HANDLE_CASE(TYPE, OUT, 2); \ - break; \ - case 3: \ - HANDLE_CASE(TYPE, OUT, 3); \ - break; \ - default: \ - HANDLE_CASE(TYPE, OUT, -1); \ - break; \ - } \ - } \ +#define HANDLE_IN_CASE(T, TYPE, OUT, IN) \ + { \ + if (inInfo.isContiguous()) { \ + HANDLE_CASE(T, TYPE, OUT, -2); \ + } else { \ + switch (IN) { \ + case 1: \ + HANDLE_CASE(T, TYPE, OUT, 1); \ + break; \ + case 2: \ + HANDLE_CASE(T, TYPE, OUT, 2); \ + break; \ + default: \ + HANDLE_CASE(T, TYPE, OUT, -1); \ + break; \ + } \ + } \ } -#define HANDLE_OUT_CASE(TYPE, OUT, IN) \ - { \ - if (outInfo.isContiguous()) { \ - HANDLE_IN_CASE(TYPE, -2, IN); \ - } else { \ - switch (OUT) { \ - case 1: \ - HANDLE_IN_CASE(TYPE, 1, IN); \ - break; \ - case 2: \ - HANDLE_IN_CASE(TYPE, 2, IN); \ - break; \ - case 3: \ - HANDLE_IN_CASE(TYPE, 3, IN); \ - break; \ - default: \ - HANDLE_IN_CASE(TYPE, -1, IN); \ - break; \ - } \ - } \ +#define HANDLE_OUT_CASE(T, TYPE, OUT, IN) \ + { \ + if (outInfo.isContiguous()) { \ + HANDLE_IN_CASE(T, TYPE, -2, IN); \ + } else { \ + switch (OUT) { \ + case 1: \ + HANDLE_IN_CASE(T, TYPE, 1, IN); \ + break; \ + case 2: \ + HANDLE_IN_CASE(T, TYPE, 2, IN); \ + break; \ + case 3: \ + HANDLE_IN_CASE(T, TYPE, 3, IN); \ + break; \ + default: \ + HANDLE_IN_CASE(T, TYPE, -1, IN); \ + break; \ + } \ + } \ } - if (THC_canUse32BitIndexMath(state, out) && - THC_canUse32BitIndexMath(state, in)) { - TensorInfo<unsigned int> outInfo(state, out); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, out) && + 
TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, in)) { + TensorInfo<float, unsigned int> outInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, out); outInfo.collapseDims(); - TensorInfo<unsigned int> inInfo(state, in, dim); + TensorInfo<float, unsigned int> inInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, in); + inInfo.reduceDim(dim); inInfo.collapseDims(); - HANDLE_OUT_CASE(unsigned int, outInfo.dims, inInfo.dims); + HANDLE_OUT_CASE(typename TensorUtils<THCudaTensor>::DataType, + unsigned int, outInfo.dims, inInfo.dims); } else { - TensorInfo<unsigned long> outInfo(state, out); + TensorInfo<float, unsigned long> outInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, out); outInfo.collapseDims(); - TensorInfo<unsigned long> inInfo(state, in, dim); + TensorInfo<float, unsigned long> inInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, in); + inInfo.reduceDim(dim); inInfo.collapseDims(); // For large tensors, we only compile the completely contiguous // version and the completely generic version, to reduce // compilation time. if (outInfo.isContiguous() && inInfo.isContiguous()) { - HANDLE_CASE(unsigned long, -2, -2); + HANDLE_CASE(typename TensorUtils<THCudaTensor>::DataType, + unsigned long, -2, -2); } else { - HANDLE_CASE(unsigned long, -1, -1); + HANDLE_CASE(typename TensorUtils<THCudaTensor>::DataType, + unsigned long, -1, -1); } } #undef HANDLE_CASE diff --git a/lib/THC/THCReduceAll.cuh b/lib/THC/THCReduceAll.cuh index cfe40fd..3fe81a9 100644 --- a/lib/THC/THCReduceAll.cuh +++ b/lib/THC/THCReduceAll.cuh @@ -18,24 +18,28 @@ #define THC_TWO_PASS_REDUCTION_SIZE 2048L // Kernel that handles an entire reduction of a tensor in one pass -template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims> +template <typename ModifyOp, + typename ReduceOp, + typename T, + typename IndexType, + int ADims> __global__ void -THCudaTensor_reduceAll(TensorInfo<IndexType> in, +THCudaTensor_reduceAll(TensorInfo<T, IndexType> in, IndexType totalElements, - float init, + T init, ModifyOp modifyOp, ReduceOp reduceOp, - float* out) { + T* out) { // With a block-wide stride, have each thread perform its own reduction. 
- float r = init; + T r = init; for (IndexType i = threadIdx.x; i < totalElements; i += blockDim.x) { - const IndexType inOffset = IndexToOffset<IndexType, ADims>::get(i, in); + const IndexType inOffset = IndexToOffset<T, IndexType, ADims>::get(i, in); r = reduceOp(r, modifyOp(in.data[inOffset])); } // Reduce within the block - extern __shared__ float smem[]; - r = reduceBlock<float, ReduceOp>(smem, blockDim.x, r, reduceOp, init); + extern __shared__ T smem[]; + r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init); if (threadIdx.x == 0) { // Write out reduced value @@ -56,27 +60,31 @@ __device__ __forceinline__ IndexType getEndIndex(IndexType totalSize) { } // Kernel that handles an entire reduction of a tensor in two passes -template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims> +template <typename ModifyOp, + typename ReduceOp, + typename T, + typename IndexType, + int ADims> __global__ void -THCudaTensor_reduceAllPass1(TensorInfo<IndexType> in, +THCudaTensor_reduceAllPass1(TensorInfo<T, IndexType> in, IndexType totalElements, - float init, + T init, ModifyOp modifyOp, ReduceOp reduceOp, - float* scratchSpace) { + T* scratchSpace) { const IndexType startIndex = getStartIndex<IndexType>(totalElements); const IndexType endIndex = getEndIndex<IndexType>(totalElements); // With a block-wide stride, have each thread perform its own reduction. - float r = init; + T r = init; for (IndexType i = startIndex + threadIdx.x; i < endIndex; i += blockDim.x) { - const IndexType inOffset = IndexToOffset<IndexType, ADims>::get(i, in); + const IndexType inOffset = IndexToOffset<T, IndexType, ADims>::get(i, in); r = reduceOp(r, modifyOp(in.data[inOffset])); } // Reduce within the block - extern __shared__ float smem[]; - r = reduceBlock<float, ReduceOp>(smem, blockDim.x, r, reduceOp, init); + extern __shared__ T smem[]; + r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init); if (threadIdx.x == 0) { // Write out block-wide reduced value @@ -84,21 +92,21 @@ THCudaTensor_reduceAllPass1(TensorInfo<IndexType> in, } } -template <typename ReduceOp, typename IndexType> +template <typename ReduceOp, typename T, typename IndexType> __global__ void THCudaTensor_reduceAllPass2(int numPass1Blocks, - float init, + T init, ReduceOp reduceOp, - float* scratchSpace, - float* out) { - float r = init; + T* scratchSpace, + T* out) { + T r = init; if (threadIdx.x < numPass1Blocks) { r = scratchSpace[threadIdx.x]; } // Reduce within the block - extern __shared__ float smem[]; - r = reduceBlock<float, ReduceOp>(smem, numPass1Blocks, r, reduceOp, init); + extern __shared__ T smem[]; + r = reduceBlock<T, ReduceOp>(smem, numPass1Blocks, r, reduceOp, init); if (threadIdx.x == 0) { *out = r; @@ -111,12 +119,13 @@ inline bool isTwoPassReductionSize(long elements) { return (elements > THC_TWO_PASS_REDUCTION_SIZE); } +template <typename T> inline long getTwoPassBlocks(THCState* state, long elements) { long numBlocks = THCCeilDiv(elements, THC_REDUCE_ALL_BLOCK_SIZE); // We can only have as many blocks as there is scratch space long scratchSpace = - THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(float); + THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(T); THAssert(scratchSpace > 0); if (numBlocks > scratchSpace) { @@ -127,60 +136,67 @@ inline long getTwoPassBlocks(THCState* state, long elements) { } // Get the block/grid size that we want +template <typename T> inline void getPass1ReduceBlockGrid(THCState* state, long elements, dim3& grid, dim3& block) { - grid = 
dim3(getTwoPassBlocks(state, elements)); + grid = dim3(getTwoPassBlocks<T>(state, elements)); block = dim3(THC_REDUCE_ALL_BLOCK_SIZE); } +template <typename T> inline void getPass2ReduceBlockGrid(THCState* state, long elements, dim3& grid, dim3& block) { grid = dim3(1); // We only need as many threads as there were blocks originally - block = dim3(getTwoPassBlocks(state, elements)); + block = dim3(getTwoPassBlocks<T>(state, elements)); } +template <typename T> inline void getSinglePassReduceBlockGrid(long elements, dim3& grid, dim3& block) { grid = dim3(1); block = dim3(THC_REDUCE_ALL_BLOCK_SIZE); } -template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims> +template <typename ModifyOp, + typename ReduceOp, + typename T, + typename IndexType, + int ADims> void callReduceAll(THCState* state, - const TensorInfo<IndexType>& in, + const TensorInfo<T, IndexType>& in, long totalElements, - float init, + T init, const ModifyOp& modifyOp, const ReduceOp& reduceOp, - float* devOut) { + T* devOut) { dim3 grid; dim3 block; if (isTwoPassReductionSize(totalElements)) { - getPass1ReduceBlockGrid(state, totalElements, grid, block); - size_t smemSize = block.x * sizeof(float); + getPass1ReduceBlockGrid<T>(state, totalElements, grid, block); + size_t smemSize = block.x * sizeof(T); - THCudaTensor_reduceAllPass1<ModifyOp, ReduceOp, IndexType, ADims> + THCudaTensor_reduceAllPass1<ModifyOp, ReduceOp, T, IndexType, ADims> <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>( in, (IndexType) totalElements, init, modifyOp, reduceOp, - (float*) THCState_getCurrentDeviceScratchSpace(state)); + (T*) THCState_getCurrentDeviceScratchSpace(state)); int numPass1Blocks = grid.x; - getPass2ReduceBlockGrid(state, totalElements, grid, block); - smemSize = block.x * sizeof(float); + getPass2ReduceBlockGrid<T>(state, totalElements, grid, block); + smemSize = block.x * sizeof(T); - THCudaTensor_reduceAllPass2<ReduceOp, IndexType> + THCudaTensor_reduceAllPass2<ReduceOp, T, IndexType> <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>( numPass1Blocks, init, reduceOp, - (float*) THCState_getCurrentDeviceScratchSpace(state), + (T*) THCState_getCurrentDeviceScratchSpace(state), devOut); } else { - getSinglePassReduceBlockGrid(totalElements, grid, block); - size_t smemSize = block.x * sizeof(float); + getSinglePassReduceBlockGrid<T>(totalElements, grid, block); + size_t smemSize = block.x * sizeof(T); - THCudaTensor_reduceAll<ModifyOp, ReduceOp, IndexType, ADims> + THCudaTensor_reduceAll<ModifyOp, ReduceOp, T, IndexType, ADims> <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>( in, (IndexType) totalElements, init, modifyOp, reduceOp, devOut); } @@ -224,8 +240,10 @@ bool THCudaTensor_reduceAll(THCState* state, // dimension, and the loop to translate the linear index to the array // index can be similarly collapsed. That is what this unrolling is for. 
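The two-pass structure used by callReduceAll above, summarized as a sketch (the exact block-size constant and scratch-buffer size come from the build and are not quoted here):

    // Pass 1: numBlocks blocks each reduce their slice of the input, writing
    //         one partial of type T into the per-device scratch buffer.
    // Pass 2: a single block of numBlocks threads reduces those partials into *devOut.
    long numBlocks = THCCeilDiv(elements, THC_REDUCE_ALL_BLOCK_SIZE);
    long maxBlocks = THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(T);
    if (numBlocks > maxBlocks) numBlocks = maxBlocks;    // one partial per block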
#define HANDLE_CASE(TYPE, IN) \ - callReduceAll<ModifyOp, ReduceOp, TYPE, IN>( \ - state, inInfo, inElements, init, modifyOp, reduceOp, devOut); + callReduceAll<ModifyOp, ReduceOp, \ + typename TensorUtils<THCudaTensor>::DataType, \ + TYPE, IN>( \ + state, inInfo, inElements, init, modifyOp, reduceOp, devOut); #define HANDLE_IN_CASE(TYPE, IN) \ { \ @@ -249,13 +267,15 @@ bool THCudaTensor_reduceAll(THCState* state, } \ } - if (THC_canUse32BitIndexMath(state, in)) { - TensorInfo<unsigned int> inInfo(state, in); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, in)) { + TensorInfo<float, unsigned int> inInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, in); inInfo.collapseDims(); HANDLE_IN_CASE(unsigned int, inInfo.dims); } else { - TensorInfo<unsigned long long> inInfo(state, in); + TensorInfo<float, unsigned long long> inInfo = + getTensorInfo<THCudaTensor, unsigned long long>(state, in); inInfo.collapseDims(); // For large tensors, we only compile the completely contiguous diff --git a/lib/THC/THCReduceApplyUtils.cu b/lib/THC/THCReduceApplyUtils.cu index 10d5d89..f92bb99 100644 --- a/lib/THC/THCReduceApplyUtils.cu +++ b/lib/THC/THCReduceApplyUtils.cu @@ -11,29 +11,6 @@ void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg) { THArgCheck(dims <= MAX_CUTORCH_DIMS, arg, CUTORCH_DIM_WARNING); } -bool THC_canUse32BitIndexMath(THCState* state, THCudaTensor* t) { - long elements = THCudaTensor_nElement(state, t); - if (elements >= UINT_MAX) { - return false; - } - - long offset = 0; - long linearId = elements - 1; - - for (int i = THCudaTensor_nDimension(state, t) - 1; i >= 0; --i) { - long curDimIndex = linearId % THCudaTensor_size(state, t, i); - long curDimOffset = curDimIndex * THCudaTensor_stride(state, t, i); - offset += curDimOffset; - linearId /= THCudaTensor_size(state, t, i); - } - - if (offset >= UINT_MAX) { - return false; - } - - return true; -} - bool THC_getGridFromTiles(long gridTiles, dim3& grid) { if (gridTiles > MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE) { return false; @@ -56,73 +33,3 @@ bool THC_getGridFromTiles(long gridTiles, dim3& grid) { grid = dim3(gridX, gridY, gridZ); return true; } - -namespace { - -struct SizeAndStride { - long size; - long stride; -}; - -int compareSizeAndStride(const void* a, const void* b) { - const SizeAndStride* aS = (const SizeAndStride*) a; - const SizeAndStride* bS = (const SizeAndStride*) b; - - return aS->stride < bS->stride; -} - -} - -bool THC_overlappingIndices(THCState* state, THCudaTensor* t) { - // In this function, we don't care about permutations of the - // size/stride arrays (transpositions). - // We order the size/stride arrays by stride, skipping dimensions of - // size 1. Strides of dimensions of size 1 don't matter, since there - // is only one addressing point in them. - // In this reordered view, the tensor is contiguous if - // stride[dim] == size[dim + 1] * stride[dim + 1] for all `dim`. - // The tensor has holes if - // stride[dim] > size[dim + 1] * stride[dim + 1] for one or more - // `dim`. - // The tensor has overlaps if - // stride[dim] < size[dim + 1] * stride[dim + 1] for one or more - // `dim`, or the innermost stride is 0. - - // Extract size/stride arrays; only consider size >1 dims. 
- SizeAndStride info[MAX_CUTORCH_DIMS]; - - int dims = THCudaTensor_nDimension(state, t); - int nonSize1Dims = 0; - for (int i = 0; i < dims; ++i) { - long size = THCudaTensor_size(state, t, i); - if (size > 1) { - info[nonSize1Dims].size = size; - info[nonSize1Dims].stride = THCudaTensor_stride(state, t, i); - ++nonSize1Dims; - } - } - - if (nonSize1Dims == 0) { - // no overlap - return false; - } - - // Ascending order (innermost dimension in sorted view is at [0]) - qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); - - // Base case: innermost dimension must have stride >= 1 - if (info[nonSize1Dims - 1].stride < 1) { - return true; - } - - // Subsequent dimensions, if any - for (int i = nonSize1Dims - 2; i >= 0; --i) { - if (info[i].stride < info[i + 1].size * info[i + 1].stride) { - // There are overlaps - return true; - } - } - - // Tensor has holes or is contiguous - return false; -} diff --git a/lib/THC/THCReduceApplyUtils.cuh b/lib/THC/THCReduceApplyUtils.cuh index d5ac2b5..06f969f 100644 --- a/lib/THC/THCReduceApplyUtils.cuh +++ b/lib/THC/THCReduceApplyUtils.cuh @@ -6,284 +6,12 @@ #include "THCGeneral.h" #include "THCTensor.h" #include "THCDeviceUtils.cuh" - -// Maximum number of dimensions allowed for cutorch -#define MAX_CUTORCH_DIMS 25 - -// Warning string for tensor arguments that are too large or have too -// many dimensions -#define CUTORCH_STR(X) #X -#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \ - CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions" +#include "THCTensorInfo.cuh" // Enum that indicates whether tensor arguments are read/write or // read-only enum TensorArgType { ReadWrite, ReadOnly }; -// Copy operator for the pointwise apply kernel -template <typename T> -struct CopyOp { - __device__ __forceinline__ void operator()(T* dst, T* src) { -#if __CUDA_ARCH__ >= 350 - *dst = __ldg(src); -#else - *dst = *src; -#endif - } -}; - -// CUDA kernel argument that defines tensor layout -template <typename IndexType> -struct TensorInfo { - // Extracts size/stride information for the kernel. - // The optional `reduceDim` indicates a reduction dimension for the - // given tensor, so that the resulting size for this dimension will be 1. - TensorInfo(THCState* state, THCudaTensor* t, int reduceDim = -1); - - // Collapses all runs of successive dimensions if the size/strides - // match up within the run and there are no holes between the - // dimensions. - // If excludeDim is set (not -1), then excludeDim will not be - // collapsed with any other dimension. - // Function returns the new dimension index that excludeDim maps to, - // since the collapsed dimensions are <= the input dimensions. 
- int collapseDims(int excludeDim = -1); - - // Contiguous tensors of more than one dimension are collapsed down - // to one tensor - __host__ __device__ inline bool isContiguous() const { - return (dims == 1 && strides[0] == 1); - } - - float* data; - IndexType sizes[MAX_CUTORCH_DIMS]; - IndexType strides[MAX_CUTORCH_DIMS]; - int dims; -}; - -template <typename IndexType> -TensorInfo<IndexType>::TensorInfo(THCState* state, - THCudaTensor* t, - int reduceDim) { - data = THCudaTensor_data(state, t); - dims = THCudaTensor_nDimension(state, t); - assert(dims <= MAX_CUTORCH_DIMS); - - for (int i = 0; i < dims; ++i) { - sizes[i] = THCudaTensor_size(state, t, i); - strides[i] = THCudaTensor_stride(state, t, i); - } - - assert(reduceDim == -1 || (reduceDim < dims && reduceDim >= 0)); - - if (reduceDim != -1) { - sizes[reduceDim] = 1; - } -} - -template <typename IndexType> -int -TensorInfo<IndexType>::collapseDims(int excludeDim) { - // Find the innermost dimension not of size 1, since dimensions of size 1 are - // collapsible. - int firstNonOneDim = -1; - - for (int i = dims - 1; i >= 0; --i) { - if (i == excludeDim) { - // We cannot collapse this dimension, even if it is size 1 - firstNonOneDim = i; - break; - } - - if (sizes[i] != 1) { - firstNonOneDim = i; - break; - } - } - - // Special case: if all dimensions are of size 1, then this is a - // single-point tensor that we still have to operate on. Reduce to a - // single point. - if (firstNonOneDim == -1) { - assert(excludeDim == -1); - - dims = 1; - sizes[0] = 1; - strides[0] = 1; - - // Everything effectively got collapsed into this dimension - return 0; - } - - // Count the number of successive dimensions that can be collapsed, from - // innermost to outermost. - int numCollapsed = 0; - - // Skip the leading size 1 dims - numCollapsed += dims - 1 - firstNonOneDim; - - // We perform one pass through to determine how many dimensions we - // can collapse, before calculating the actual size of the collapsed - // dimensions. - // size/strideInner are the size/strides of the previous inner - // non-collapsible dim we encounter. - long sizeInner = sizes[firstNonOneDim]; - long strideInner = strides[firstNonOneDim]; - - for (int i = firstNonOneDim - 1; i >= 0; --i) { - long sizeOuter = sizes[i]; - long strideOuter = strides[i]; - - // Don't collapse this dimension if we want to exclude it from - // collapsing. - // Since this code is attempting to collapse a subsequent - // dimension (i) with the preceding dimension (i + 1), we can only - // perform collapsing if the preceding dimension can be collapsed - // (i.e., not excludeDim) - if ((excludeDim != i) && (excludeDim != i + 1)) { - // The next outermost dimension can be skipped if size 1 - if (sizeOuter == 1) { - ++numCollapsed; - continue; - } - - // If the next outermost dimension is contiguous with the - // previous non-collapsed one, collapse it - if (strideOuter == strideInner * sizeInner) { - ++numCollapsed; - - // This is the run of collapsed dimensions' size - sizeInner = sizeInner * sizeOuter; - continue; - } - } - - // Otherwise, this new outer dimension at `i` cannot be collapsed - // because it is excluded from collapsing, or it is not contiguous - // with the previous inner dimension. - sizeInner = sizeOuter; - strideInner = strideOuter; - } - - // This will be our new size/stride and dimension. 
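A worked example of the dimension collapsing performed by collapseDims (the counting pass is above, the rebuilding pass continues below; sizes/strides here are illustrative):

    // sizes = {4, 1, 3, 2}, strides = {6, 6, 2, 1}
    // dim 1 has size 1 and is skipped; dim 2 is contiguous with dim 3
    // (stride 2 == 2 * 1) and dim 0 is contiguous with that {3, 2} run
    // (stride 6 == 3 * 2), so the whole tensor collapses to
    // sizes = {24}, strides = {1}, dims = 1  ->  isContiguous() is true,
    // and the apply/reduce kernels take the cheap -2 code path.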
- IndexType newSizes[MAX_CUTORCH_DIMS]; - IndexType newStrides[MAX_CUTORCH_DIMS]; - - assert(numCollapsed < dims); - int newDims = dims - numCollapsed; - - // We return the index of the excluded dimension that is excluded - // from being collapsed here. - int returnDim = -1; - - // We perform a second pass through the dimensions to actually - // calculate the size of the collapsed dimensions. - int collapsedIndex = dims - numCollapsed - 1; - newSizes[collapsedIndex] = sizes[firstNonOneDim]; - newStrides[collapsedIndex] = strides[firstNonOneDim]; - - if (firstNonOneDim == excludeDim) { - returnDim = collapsedIndex; - } - - for (int i = firstNonOneDim - 1; i >= 0; --i) { - IndexType sizeOuter = sizes[i]; - IndexType strideOuter = strides[i]; - - if ((excludeDim != i) && (excludeDim != i + 1)) { - if (sizeOuter == 1) { - // skip - continue; - } - - if (strideOuter == newSizes[collapsedIndex] * newStrides[collapsedIndex]) { - // collapse - newSizes[collapsedIndex] *= sizeOuter; - continue; - } - } - - // Otherwise, strides don't match, or dim `i` is excluded from - // collapsing. - --collapsedIndex; - assert(collapsedIndex >= 0); - assert(collapsedIndex < newDims); - newSizes[collapsedIndex] = sizeOuter; - newStrides[collapsedIndex] = strideOuter; - - if (excludeDim == i) { - returnDim = collapsedIndex; - } - } - - // We must have filled all the dimensions we're looking for - assert(collapsedIndex == 0); - assert((excludeDim == -1) || (returnDim != -1)); - - dims = newDims; - - for (int i = 0; i < dims; ++i) { - sizes[i] = newSizes[i]; - strides[i] = newStrides[i]; - } - - // After collapsing, the original `excludeDim` may have been - // renumbered to this new `returnDim`, since some dimensions could - // have been collapsed. - return returnDim; -} - -// Translate a linear index for the apply to a float* offset; -// specialized on `Dims` to reduce nvcc compilation time -template <typename IndexType, int Dims> -struct IndexToOffset { - static __host__ __device__ IndexType get( - IndexType linearId, - const TensorInfo<IndexType>& info) { - IndexType offset = 0; - - // Use static dims - for (int i = Dims - 1; i >= 0; --i) { - IndexType curDimIndex = linearId % info.sizes[i]; - IndexType curDimOffset = curDimIndex * info.strides[i]; - offset += curDimOffset; - - if (i > 0) { - linearId /= info.sizes[i]; - } - } - - return offset; - } -}; - -template <typename IndexType> -struct IndexToOffset<IndexType, -2> { - static __forceinline__ __host__ __device__ IndexType - get(IndexType linearId, const TensorInfo<IndexType>& info) { - return linearId; - } -}; - -template <typename IndexType> -struct IndexToOffset<IndexType, -1> { - static __forceinline__ __host__ __device__ IndexType - get(IndexType linearId, const TensorInfo<IndexType>& info) { - IndexType offset = 0; - - // Use dynamic dims - for (int i = info.dims - 1; i >= 0; --i) { - IndexType curDimIndex = linearId % info.sizes[i]; - IndexType curDimOffset = curDimIndex * info.strides[i]; - offset += curDimOffset; - - linearId /= info.sizes[i]; - } - - return offset; - } -}; - template <typename IndexType> __device__ __forceinline__ IndexType getLinearBlockId() { return blockIdx.z * gridDim.y * gridDim.x + @@ -347,16 +75,7 @@ __device__ T reduceBlock(T* smem, // Make sure the given tensor doesn't have too many dimensions void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg); -// Returns true if all linear ID -> offset math can be performed using 32 bit -// unsigned math, which is faster than 64 bit math -THC_API bool 
THC_canUse32BitIndexMath(THCState* state, THCudaTensor* t); - // Produces a grid with at least one point per tile THC_API bool THC_getGridFromTiles(long gridTiles, dim3& grid); -// Determines if the given tensor has overlapping data points (i.e., -// is there more than one index into the tensor that references the -// same piece of data)? -THC_API bool THC_overlappingIndices(THCState* state, THCudaTensor* t); - #endif // THC_REDUCE_APPLY_UTILS_INC diff --git a/lib/THC/THCSortUtils.cuh b/lib/THC/THCSortUtils.cuh index 42a8018..baf4a8e 100644 --- a/lib/THC/THCSortUtils.cuh +++ b/lib/THC/THCSortUtils.cuh @@ -2,6 +2,7 @@ #define THC_SORT_UTILS_INC #include "THCReduceApplyUtils.cuh" +#include "THCTensorTypeUtils.cuh" // Collection of kernel sort routines template <typename T> @@ -91,11 +92,11 @@ template <typename K, typename V, int KeyDims, int ValueDims, typename Comparator, typename IndexType, int Power2SortSize> __global__ void -bitonicSortKVInPlace(TensorInfo<IndexType> keys, +bitonicSortKVInPlace(TensorInfo<K, IndexType> keys, IndexType keySlices, IndexType keySliceSize, IndexType keySliceStride, - TensorInfo<IndexType> values, + TensorInfo<V, IndexType> values, IndexType valueSliceStride, const Comparator& comp) { // Find the slice of the tensor that we are sorting @@ -111,9 +112,9 @@ bitonicSortKVInPlace(TensorInfo<IndexType> keys, __shared__ bool sharedValid[Power2SortSize]; const IndexType keyStartOffset = - IndexToOffset<IndexType, KeyDims>::get(linearIndex, keys); + IndexToOffset<K, IndexType, KeyDims>::get(linearIndex, keys); const IndexType valueStartOffset = - IndexToOffset<IndexType, ValueDims>::get(linearIndex, values); + IndexToOffset<V, IndexType, ValueDims>::get(linearIndex, values); // If the sort size is 1, the data is already sorted if (Power2SortSize == 1) { diff --git a/lib/THC/THCStorageCopy.h b/lib/THC/THCStorageCopy.h index ec8011d..837056f 100644 --- a/lib/THC/THCStorageCopy.h +++ b/lib/THC/THCStorageCopy.h @@ -3,6 +3,7 @@ #include "THCStorage.h" #include "THCGeneral.h" +#include "THCHalf.h" #include "generic/THCStorageCopy.h" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorCopy.cu b/lib/THC/THCTensorCopy.cu index f6545a6..ea38d2e 100644 --- a/lib/THC/THCTensorCopy.cu +++ b/lib/THC/THCTensorCopy.cu @@ -7,5 +7,226 @@ inline int curGPU() { return curDev; } +// Copy operator for the pointwise apply kernel +template <typename TypeDst, typename TypeSrc> +struct CopyOp { + __device__ __forceinline__ void operator()(TypeDst* dst, TypeSrc* src) { +#if __CUDA_ARCH__ >= 350 + *dst = ScalarConvert<TypeSrc, TypeDst>::to(__ldg(src)); +#else + *dst = ScalarConvert<TypeSrc, TypeDst>::to(*src); +#endif + } +}; + +// Copy for the same type to the same type +template <typename TensorTypeDst, typename TensorTypeSrc> +void +THC_copyTensor(THCState* state, TensorTypeDst* dst, TensorTypeSrc* src) { + long totalElements = TensorUtils<TensorTypeDst>::getNumElements(state, dst); + + THArgCheck(totalElements == + TensorUtils<TensorTypeSrc>::getNumElements(state, src), + 2, "sizes do not match"); + + if (TensorUtils<TensorTypeDst>::getDims(state, dst) == 0) { + // Zero-dim tensor; copy nothing + return; + } + + // We can memcpy the memory if: + // -both tensors are contiguous; or, + // -there is only one element to copy; or, + // -FIXME: if both tensors have matching size and stride arrays, and no + // holes within (in other words, there is some permutation that can be applied + // to the size/strides such that the resulting tensor is + // contiguous). 
+ // -AND: both tensors have the same type. + bool sameType = isSameType<TensorTypeSrc, TensorTypeDst>(); + bool srcContig = TensorUtils<TensorTypeSrc>::isContiguous(state, src); + bool dstContig = TensorUtils<TensorTypeDst>::isContiguous(state, dst); + bool memcpyEligible = + ((srcContig && dstContig) || (totalElements == 1)) && sameType; + + + int srcDev = TensorUtils<TensorTypeSrc>::getDevice(state, src); + int dstDev = TensorUtils<TensorTypeDst>::getDevice(state, dst); + int oldDev = curGPU(); + + // We always perform the copy on the source device, using the + // current stream on the source device. + // If the copy is on the default stream, then we fully synchronize + // both src and dst's default streams for completion of the + // copy. We have to explicitly do this for non-contig copies. + // This mimics the behavior of cross-device cudaMemcpyAsync on + // the default stream. + // If the copy is not on the default stream, then it is up to the + // user to add needed synchronization on the dst device, since the + // stream on the dst device that wishes to synchronize may not be + // the same index as the one on the src device. + int copyStreamIndex = + THCState_getCurrentStreamIndex(state); + cudaStream_t copyStream = + THCState_getDeviceStream(state, srcDev, copyStreamIndex); + + if (srcDev != dstDev && copyStreamIndex == 0) { + // This is a cross-device copy on the default stream. We perform a + // two-way barrier between both devices' default streams before + // the copy. This ensures that any write-after-write and + // write-after-read dependencies on the destination side are + // handled, so that no one is operating on the dst memory when + // we perform the copy. + // src waits on dst barrier (src already waits on src) + cudaEvent_t dstReady; + THCudaCheck(cudaSetDevice(dstDev)); + THCudaCheck(cudaEventCreateWithFlags(&dstReady, cudaEventDisableTiming)); + THCudaCheck(cudaEventRecord(dstReady, NULL)); + + THCudaCheck(cudaSetDevice(srcDev)); + THCudaCheck(cudaStreamWaitEvent(NULL, dstReady, 0)); + THCudaCheck(cudaEventDestroy(dstReady)); + } else if (srcDev != oldDev) { + THCudaCheck(cudaSetDevice(srcDev)); + } + + // We are now on srcDev + if (memcpyEligible) { + // Perform the copy + THCudaCheck(cudaMemcpyAsync( + TensorUtils<TensorTypeDst>::getData(state, dst), + TensorUtils<TensorTypeSrc>::getData(state, src), + totalElements * + sizeof(typename TensorUtils<TensorTypeDst>::DataType), + cudaMemcpyDeviceToDevice, + copyStream)); + } else { + // Non-contiguous copy or a type-conversion copy + + // We avoid creating temporary memory copies if possible. + // If both src and dst are on the same device, or if they are on + // different devices and p2p access is enabled, perform the copy + // by a pointwise copy kernel. + // Otherwise, we'll have to make contiguous (which will in fact + // invoke copy() again), and then perform the copy. + // FIXME: might want to consider only running the pointwise kernel + // if both src and dst innermost dimensions are contiguous. If + // they are not, then taking the hit of the memory allocation/free + // might be worth it to avoid non-coalesced reads or writes. 
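The default-stream handshake described in the comments above amounts to recording an event on one device's NULL stream and making the other device's NULL stream wait on it. A minimal standalone sketch of that pattern (the helper name and argument order are placeholders, not part of the library):

// Make `waiterDev`'s default stream wait until all work previously
// queued on `signalDev`'s default stream has completed.
static void defaultStreamBarrier(int signalDev, int waiterDev) {
  cudaEvent_t ready;
  THCudaCheck(cudaSetDevice(signalDev));
  THCudaCheck(cudaEventCreateWithFlags(&ready, cudaEventDisableTiming));
  THCudaCheck(cudaEventRecord(ready, NULL));    // NULL == default stream
  THCudaCheck(cudaSetDevice(waiterDev));
  THCudaCheck(cudaStreamWaitEvent(NULL, ready, 0));
  THCudaCheck(cudaEventDestroy(ready));
}

THC_copyTensor applies this twice: dst signals and src waits before the copy, then src signals and dst waits once the copy has been enqueued.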
+ + // A device always has access to itself, so this also handles the + // case srcDev == dstDev + if (THCState_getPeerToPeerAccess(state, srcDev, dstDev)) { + // Make sure we have the current stream set in THCState, since + // pointwise uses that + if (srcDev != oldDev) { + THCState_setStream(state, srcDev, copyStreamIndex); + } + + bool succ = + THC_pointwiseApply2( + state, dst, src, + CopyOp<typename TensorUtils<TensorTypeDst>::DataType, + typename TensorUtils<TensorTypeSrc>::DataType>()); + + // Restore prior THCState stream + if (srcDev != oldDev) { + THCState_setStream(state, oldDev, copyStreamIndex); + } + + THArgCheck(succ, 2, CUTORCH_DIM_WARNING); + } else { + // GPUs can't access each other directly, but the tensors + // involved are non-contiguous and/or are different types. + + // Make sure the src is contiguous and in the same type as dst + THCudaCheck(cudaSetDevice(srcDev)); + TensorTypeDst* srcContig = NULL; + + if (sameType) { + srcContig = + (TensorTypeDst*) // this is actually the same type as src + TensorUtils<TensorTypeSrc>::newContiguous(state, src); + + } else { + // Types are different + // Copy into the new format, contiguous, on the source device + srcContig = TensorUtils<TensorTypeDst>::newTensor(state); + TensorUtils<TensorTypeDst>::resizeAs(state, srcContig, dst); + + if (srcDev != oldDev) { + THCState_setStream(state, srcDev, copyStreamIndex); + } + + bool succ = + THC_pointwiseApply2( + state, srcContig, src, + CopyOp<typename TensorUtils<TensorTypeDst>::DataType, + typename TensorUtils<TensorTypeSrc>::DataType>()); + + // Restore prior THCState stream + if (srcDev != oldDev) { + THCState_setStream(state, oldDev, copyStreamIndex); + } + + THArgCheck(succ, 2, CUTORCH_DIM_WARNING); + } + + // Make sure the dst is contiguous + THCudaCheck(cudaSetDevice(dstDev)); + TensorTypeDst* dstContig = + TensorUtils<TensorTypeDst>::newContiguous(state, dst); + + // Now, we are ready for a cross-device memcpy of contiguous + // data, of the same layout and type + THCudaCheck(cudaSetDevice(srcDev)); + + THCudaCheck(cudaMemcpyAsync( + TensorUtils<TensorTypeDst>::getData(state, dstContig), + TensorUtils<TensorTypeDst>::getData(state, srcContig), + totalElements * + sizeof(typename TensorUtils<TensorTypeDst>::DataType), + cudaMemcpyDeviceToDevice, + copyStream)); + + // We are done with the src + TensorUtils<TensorTypeDst>::free(state, srcContig); + + if (dst != dstContig) { + TensorUtils<TensorTypeDst>::freeCopyTo(state, dstContig, dst); + } else { + TensorUtils<TensorTypeDst>::free(state, dstContig); + } + + // We're still on srcDev at this point + } + } + + if (srcDev != dstDev && copyStreamIndex == 0) { + // dst waits on src barrier (dst already waits on dst). We cannot + // operate on dst's copy until the copy is complete. + + // Still on srcDev, record default stream event + cudaEvent_t srcReady; + THCudaCheck(cudaEventCreateWithFlags(&srcReady, cudaEventDisableTiming)); + THCudaCheck(cudaEventRecord(srcReady, NULL)); + + THCudaCheck(cudaSetDevice(dstDev)); + THCudaCheck(cudaStreamWaitEvent(NULL, srcReady, 0)); + THCudaCheck(cudaEventDestroy(srcReady)); + + // We are now on dstDev (right above). Restore prior device from dst + if (dstDev != oldDev) { + THCudaCheck(cudaSetDevice(oldDev)); + } + } else { + // We are still on srcDev. 
Restore prior device from src + if (srcDev != oldDev) { + THCudaCheck(cudaSetDevice(oldDev)); + } + } + + THCudaCheck(cudaGetLastError()); +} + #include "generic/THCTensorCopy.cu" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorCopy.h b/lib/THC/THCTensorCopy.h index fc206cb..e8bc4f4 100644 --- a/lib/THC/THCTensorCopy.h +++ b/lib/THC/THCTensorCopy.h @@ -3,6 +3,7 @@ #include "THCTensor.h" #include "THCGeneral.h" +#include "THCHalf.h" #include "generic/THCTensorCopy.h" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorIndex.cu b/lib/THC/THCTensorIndex.cu index a23daa9..e0d721f 100644 --- a/lib/THC/THCTensorIndex.cu +++ b/lib/THC/THCTensorIndex.cu @@ -15,13 +15,13 @@ // indexCopyLargeIndex kernel is a better choice to increase // parallelism. template <typename IndexType, int DstDim, int SrcDim, int IdxDim> -__global__ void indexCopySmallIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> src, - TensorInfo<IndexType> indices, - int dstCopyDim, - int srcCopyDim, - IndexType innerSize, - long dstCopyDimSize) { +__global__ void indexCopySmallIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> indices, + int dstCopyDim, + int srcCopyDim, + IndexType innerSize, + long dstCopyDimSize) { // In order to avoid reloading the index that we are copying, load // it once to handle all of the points that are being selected, so // it can be reused as much as possible. This kernel is chosen when @@ -30,7 +30,7 @@ __global__ void indexCopySmallIndex(TensorInfo<IndexType> dst, for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) { // Lua indices begin at 1 IndexType dstIndex = - indices.data[IndexToOffset<IndexType, IdxDim>::get(srcIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(srcIndex, indices)] - 1; if (dstIndex < dstCopyDimSize) { // We stride over the output ignoring the indexed dimension @@ -39,12 +39,12 @@ __global__ void indexCopySmallIndex(TensorInfo<IndexType> dst, linearIndex < innerSize; linearIndex += gridDim.x * blockDim.x) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(linearIndex, dst); + IndexToOffset<float, IndexType, DstDim>::get(linearIndex, dst); dstOffset += dstIndex * dst.strides[dstCopyDim]; IndexType srcOffset = - IndexToOffset<IndexType, SrcDim>::get(linearIndex, src); + IndexToOffset<float, IndexType, SrcDim>::get(linearIndex, src); srcOffset += srcIndex * src.strides[srcCopyDim]; dst.data[dstOffset] = src.data[srcOffset]; @@ -60,13 +60,13 @@ __global__ void indexCopySmallIndex(TensorInfo<IndexType> dst, // indexCopySmallIndex kernel is a better choice to reduce memory // accesses. 
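The large-index kernels that follow flatten the (index, element-within-slice) pair into a single grid-stride loop and recover the pair with one divide and one modulo. A small worked example with made-up numbers:

// innerSize = 5 (elements per slice), indices.sizes[0] = 3
// The loop covers linearIndex = 0 .. 14.
//   linearIndex = 7  ->  srcIndex       = 7 / 5 = 1
//                        elementInSlice = 7 % 5 = 2
// i.e. this iteration copies element 2 of the slice addressed by the
// second index.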
template <typename IndexType, int DstDim, int SrcDim, int IdxDim> -__global__ void indexCopyLargeIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> src, - TensorInfo<IndexType> indices, - int dstCopyDim, - int srcCopyDim, - IndexType innerSize, - long dstCopyDimSize) { +__global__ void indexCopyLargeIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> indices, + int dstCopyDim, + int srcCopyDim, + IndexType innerSize, + long dstCopyDimSize) { // We stride over the output including the indexed dimension // (totalSize), and calculate the destination index point based on that for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; @@ -77,15 +77,15 @@ __global__ void indexCopyLargeIndex(TensorInfo<IndexType> dst, // Lua indices begin at 1 IndexType dstIndex = - indices.data[IndexToOffset<IndexType, IdxDim>::get(srcIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(srcIndex, indices)] - 1; if (dstIndex < dstCopyDimSize) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(elementInSlice, dst); + IndexToOffset<float, IndexType, DstDim>::get(elementInSlice, dst); dstOffset += dstIndex * dst.strides[dstCopyDim]; IndexType srcOffset = - IndexToOffset<IndexType, SrcDim>::get(elementInSlice, src); + IndexToOffset<float, IndexType, SrcDim>::get(elementInSlice, src); srcOffset += srcIndex * src.strides[srcCopyDim]; dst.data[dstOffset] = src.data[srcOffset]; @@ -138,13 +138,13 @@ void THCudaTensor_indexCopy(THCState *state, THCudaTensor *dst, int dim, THCudaT int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; #define SMALL_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ - indexCopySmallIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ + indexCopySmallIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ <<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \ dstInfo, srcInfo, indicesInfo, \ dstCopyDim, srcCopyDim, sliceSize, dstCopyDimSize); #define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ - indexCopyLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ + indexCopyLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \ dstInfo, srcInfo, indicesInfo, \ dstCopyDim, srcCopyDim, sliceSize, dstCopyDimSize); @@ -155,18 +155,21 @@ void THCudaTensor_indexCopy(THCState *state, THCudaTensor *dst, int dim, THCudaT dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, 128L), (long)(mpc * 8))); dim3 largeIndexBlock(std::min(srcTotalSize, 128L)); - if (THC_canUse32BitIndexMath(state, dst) && - THC_canUse32BitIndexMath(state, src) && - THC_canUse32BitIndexMath(state, indices)) { - TensorInfo<unsigned int> dstInfo(state, dst); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, dst) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) { + TensorInfo<float, unsigned int> dstInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, dst); int dstCopyDim = dstInfo.collapseDims(dim); - dstInfo.sizes[dstCopyDim] = 1; + dstInfo.reduceDim(dstCopyDim); - TensorInfo<unsigned int> srcInfo(state, src); + TensorInfo<float, unsigned int> srcInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, src); int srcCopyDim = srcInfo.collapseDims(dim); - srcInfo.sizes[srcCopyDim] = 1; + srcInfo.reduceDim(srcCopyDim); - TensorInfo<unsigned int> indicesInfo(state, indices); + TensorInfo<float, unsigned int> indicesInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, indices); indicesInfo.collapseDims(); 
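The canUse32BitIndexMath checks gating this branch are, in essence, a test that no offset the kernels can compute overflows an unsigned int. A simplified sketch of that idea for THCudaTensor (an approximation for illustration, not the TensorUtils implementation; UINT_MAX comes from <climits>):

// Approximate test: the largest element offset reachable through the
// tensor's sizes and strides must fit in 32 bits.
static bool roughlyFits32BitIndexing(THCState* state, THCudaTensor* t) {
  long maxOffset = 0;
  for (int d = 0; d < THCudaTensor_nDimension(state, t); ++d) {
    maxOffset += (THCudaTensor_size(state, t, d) - 1) *
                 THCudaTensor_stride(state, t, d);
  }
  return maxOffset <= (long) UINT_MAX;
}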
// A reasonable choice for when to have each thread iterate over @@ -193,15 +196,18 @@ void THCudaTensor_indexCopy(THCState *state, THCudaTensor *dst, int dim, THCudaT } } } else { - TensorInfo<unsigned long> dstInfo(state, dst); + TensorInfo<float, unsigned long> dstInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, dst); int dstCopyDim = dstInfo.collapseDims(dim); - dstInfo.sizes[dstCopyDim] = 1; + dstInfo.reduceDim(dstCopyDim); - TensorInfo<unsigned long> srcInfo(state, src); + TensorInfo<float, unsigned long> srcInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, src); int srcCopyDim = srcInfo.collapseDims(dim); - srcInfo.sizes[srcCopyDim] = 1; + srcInfo.reduceDim(srcCopyDim); - TensorInfo<unsigned long> indicesInfo(state, indices); + TensorInfo<float, unsigned long> indicesInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, indices); indicesInfo.collapseDims(); LARGE_INDEX(unsigned long, -1, -1, -1); @@ -218,13 +224,13 @@ void THCudaTensor_indexCopy(THCState *state, THCudaTensor *dst, int dim, THCudaT // indexAddLargeIndex kernel is a better choice to increase // parallelism. template <typename IndexType, int DstDim, int SrcDim, int IdxDim> -__global__ void indexAddSmallIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> src, - TensorInfo<IndexType> indices, - int dstAddDim, - int srcAddDim, - IndexType innerSize, - long dstAddDimSize) { +__global__ void indexAddSmallIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> indices, + int dstAddDim, + int srcAddDim, + IndexType innerSize, + long dstAddDimSize) { // In order to avoid reloading the index that we are copying, load // it once to handle all of the points that are being selected, so // it can be reused as much as possible. This kernel is chosen when @@ -233,7 +239,7 @@ __global__ void indexAddSmallIndex(TensorInfo<IndexType> dst, for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) { // Lua indices begin at 1 IndexType dstIndex = - indices.data[IndexToOffset<IndexType, IdxDim>::get(srcIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(srcIndex, indices)] - 1; if (dstIndex < dstAddDimSize) { // We stride over the output ignoring the indexed dimension @@ -242,11 +248,11 @@ __global__ void indexAddSmallIndex(TensorInfo<IndexType> dst, linearIndex < innerSize; linearIndex += gridDim.x * blockDim.x) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(linearIndex, dst); + IndexToOffset<float, IndexType, DstDim>::get(linearIndex, dst); dstOffset += dstIndex * dst.strides[dstAddDim]; IndexType srcOffset = - IndexToOffset<IndexType, SrcDim>::get(linearIndex, src); + IndexToOffset<float, IndexType, SrcDim>::get(linearIndex, src); srcOffset += srcIndex * src.strides[srcAddDim]; atomicAdd(&dst.data[dstOffset], src.data[srcOffset]); @@ -262,32 +268,32 @@ __global__ void indexAddSmallIndex(TensorInfo<IndexType> dst, // indexAddSmallIndex kernel is a better choice to reduce memory // accesses. 
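Unlike the copy kernels, the add kernels that follow must tolerate the same destination index appearing more than once in the index tensor, which is why they accumulate with atomicAdd instead of a plain store:

// indexCopy-style kernels:  dst.data[dstOffset] = src.data[srcOffset];
// indexAdd-style kernels:   atomicAdd(&dst.data[dstOffset],
//                                     src.data[srcOffset]);
// With duplicate indices the atomic version sums every contribution,
// whereas a plain store would keep only the last writer's value.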
template <typename IndexType, int DstDim, int SrcDim, int IdxDim> -__global__ void indexAddLargeIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> src, - TensorInfo<IndexType> indices, - int dstAddDim, - int srcAddDim, - IndexType innerSize, - long dstAddDimSize) { +__global__ void indexAddLargeIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> indices, + int dstAddDim, + int srcAddDim, + IndexType innerSize, + long dstAddDimSize) { // We stride over the output including the indexed dimension // (totalSize), and calculate the destination index point based on that for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; - linearIndex < innerSize * indices.sizes[0]; + linearIndex < innerSize * indices.sizes[0]; linearIndex += gridDim.x * blockDim.x) { IndexType srcIndex = linearIndex / innerSize; IndexType elementInSlice = linearIndex % innerSize; // Lua indices begin at 1 IndexType dstIndex = - indices.data[IndexToOffset<IndexType, IdxDim>::get(srcIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(srcIndex, indices)] - 1; if (dstIndex < dstAddDimSize) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(elementInSlice, dst); + IndexToOffset<float, IndexType, DstDim>::get(elementInSlice, dst); dstOffset += dstIndex * dst.strides[dstAddDim]; IndexType srcOffset = - IndexToOffset<IndexType, SrcDim>::get(elementInSlice, src); + IndexToOffset<float, IndexType, SrcDim>::get(elementInSlice, src); srcOffset += srcIndex * src.strides[srcAddDim]; atomicAdd(&dst.data[dstOffset], src.data[srcOffset]); @@ -339,16 +345,16 @@ void THCudaTensor_indexAdd(THCState *state, THCudaTensor *dst, int dim, THCudaTe int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; -#define SMALL_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ - indexAddSmallIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ - <<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \ - dstInfo, srcInfo, indicesInfo, \ +#define SMALL_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ + indexAddSmallIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ + <<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \ + dstInfo, srcInfo, indicesInfo, \ dstAddDim, srcAddDim, sliceSize, dstAddDimSize); -#define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ - indexAddLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ - <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \ - dstInfo, srcInfo, indicesInfo, \ +#define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ + indexAddLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ + <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \ + dstInfo, srcInfo, indicesInfo, \ dstAddDim, srcAddDim, sliceSize, dstAddDimSize); dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8))); @@ -357,18 +363,21 @@ void THCudaTensor_indexAdd(THCState *state, THCudaTensor *dst, int dim, THCudaTe dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, 128L), (long)(mpc * 8))); dim3 largeIndexBlock(std::min(srcTotalSize, 128L)); - if (THC_canUse32BitIndexMath(state, dst) && - THC_canUse32BitIndexMath(state, src) && - THC_canUse32BitIndexMath(state, indices)) { - TensorInfo<unsigned int> dstInfo(state, dst); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, dst) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) { + TensorInfo<float, unsigned int> dstInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, dst); int dstAddDim = dstInfo.collapseDims(dim); - 
dstInfo.sizes[dstAddDim] = 1; + dstInfo.reduceDim(dstAddDim); - TensorInfo<unsigned int> srcInfo(state, src); + TensorInfo<float, unsigned int> srcInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, src); int srcAddDim = srcInfo.collapseDims(dim); - srcInfo.sizes[srcAddDim] = 1; + srcInfo.reduceDim(srcAddDim); - TensorInfo<unsigned int> indicesInfo(state, indices); + TensorInfo<float, unsigned int> indicesInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, indices); indicesInfo.collapseDims(); // A reasonable choice for when to have each thread iterate over @@ -395,15 +404,18 @@ void THCudaTensor_indexAdd(THCState *state, THCudaTensor *dst, int dim, THCudaTe } } } else { - TensorInfo<unsigned long> dstInfo(state, dst); + TensorInfo<float, unsigned long> dstInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, dst); int dstAddDim = dstInfo.collapseDims(dim); - dstInfo.sizes[dstAddDim] = 1; + dstInfo.reduceDim(dstAddDim); - TensorInfo<unsigned long> srcInfo(state, src); + TensorInfo<float, unsigned long> srcInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, src); int srcAddDim = srcInfo.collapseDims(dim); - srcInfo.sizes[srcAddDim] = 1; + srcInfo.reduceDim(srcAddDim); - TensorInfo<unsigned long> indicesInfo(state, indices); + TensorInfo<float, unsigned long> indicesInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, indices); indicesInfo.collapseDims(); LARGE_INDEX(unsigned long, -1, -1, -1); @@ -420,12 +432,12 @@ void THCudaTensor_indexAdd(THCState *state, THCudaTensor *dst, int dim, THCudaTe // indexFillLargeIndex kernel is a better choice to increase // parallelism. template <typename IndexType, int DstDim, int IdxDim> -__global__ void indexFillSmallIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> indices, - int dstFillDim, - IndexType innerSize, - long dstFillDimSize, - float val) { +__global__ void indexFillSmallIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> indices, + int dstFillDim, + IndexType innerSize, + long dstFillDimSize, + float val) { // In order to avoid reloading the index that we are copying, load // it once to handle all of the points that are being selected, so // it can be reused as much as possible. This kernel is chosen when @@ -434,7 +446,7 @@ __global__ void indexFillSmallIndex(TensorInfo<IndexType> dst, for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) { // Lua indices begin at 1 IndexType dstIndex_ = - indices.data[IndexToOffset<IndexType, IdxDim>::get(dstIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(dstIndex, indices)] - 1; if (dstIndex < dstFillDimSize) { // We stride over the output ignoring the indexed dimension @@ -443,7 +455,7 @@ __global__ void indexFillSmallIndex(TensorInfo<IndexType> dst, linearIndex < innerSize; linearIndex += gridDim.x * blockDim.x) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(linearIndex, dst); + IndexToOffset<float, IndexType, DstDim>::get(linearIndex, dst); dstOffset += dstIndex_ * dst.strides[dstFillDim]; dst.data[dstOffset] = val; @@ -459,27 +471,27 @@ __global__ void indexFillSmallIndex(TensorInfo<IndexType> dst, // indexFillSmallIndex kernel is a better choice to reduce memory // accesses. 
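As the `- 1` adjustments above show, the index tensors arrive with Lua's 1-based convention and are shifted to 0-based offsets on the device, and the `< dstFillDimSize` guard silently skips anything out of range. For example (values made up):

// indices = {3, 1}  -- as passed from Lua (1-based)
// the kernel computes 3 - 1 = 2 and 1 - 1 = 0, so slices 2 and 0 of
// `dst` along the fill dimension are written; an index larger than the
// dimension is simply skipped.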
template <typename IndexType, int DstDim, int IdxDim> -__global__ void indexFillLargeIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> indices, - int dstFillDim, - IndexType innerSize, - long dstFillDimSize, - float val) { +__global__ void indexFillLargeIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> indices, + int dstFillDim, + IndexType innerSize, + long dstFillDimSize, + float val) { // We stride over the output including the indexed dimension // (totalSize), and calculate the destination index point based on that for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; - linearIndex < innerSize * indices.sizes[0]; + linearIndex < innerSize * indices.sizes[0]; linearIndex += gridDim.x * blockDim.x) { IndexType dstIndex = linearIndex / innerSize; IndexType elementInSlice = linearIndex % innerSize; // Lua indices begin at 1 IndexType dstIndex_ = - indices.data[IndexToOffset<IndexType, IdxDim>::get(dstIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(dstIndex, indices)] - 1; if (dstIndex_ < dstFillDimSize) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(elementInSlice, dst); + IndexToOffset<float, IndexType, DstDim>::get(elementInSlice, dst); dstOffset += dstIndex_ * dst.strides[dstFillDim]; dst.data[dstOffset] = val; @@ -528,16 +540,16 @@ void THCudaTensor_indexFill(THCState *state, THCudaTensor *dst, int dim, THCudaT int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; -#define SMALL_INDEX(TYPE, DST_DIM, IDX_DIM) \ - indexFillSmallIndex<TYPE, DST_DIM, IDX_DIM> \ - <<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \ - dstInfo, indicesInfo, \ +#define SMALL_INDEX(TYPE, DST_DIM, IDX_DIM) \ + indexFillSmallIndex<TYPE, DST_DIM, IDX_DIM> \ + <<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \ + dstInfo, indicesInfo, \ dstFillDim, sliceSize, dstFillDimSize, val); -#define LARGE_INDEX(TYPE, DST_DIM, IDX_DIM) \ - indexFillLargeIndex<TYPE, DST_DIM, IDX_DIM> \ - <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \ - dstInfo, indicesInfo, \ +#define LARGE_INDEX(TYPE, DST_DIM, IDX_DIM) \ + indexFillLargeIndex<TYPE, DST_DIM, IDX_DIM> \ + <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \ + dstInfo, indicesInfo, \ dstFillDim, sliceSize, dstFillDimSize, val); dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8))); @@ -546,13 +558,15 @@ void THCudaTensor_indexFill(THCState *state, THCudaTensor *dst, int dim, THCudaT dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, 128L), (long)(mpc * 8))); dim3 largeIndexBlock(std::min(dstTotalSize, 128L)); - if (THC_canUse32BitIndexMath(state, dst) && - THC_canUse32BitIndexMath(state, indices)) { - TensorInfo<unsigned int> dstInfo(state, dst); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, dst) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) { + TensorInfo<float, unsigned int> dstInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, dst); int dstFillDim = dstInfo.collapseDims(dim); - dstInfo.sizes[dstFillDim] = 1; + dstInfo.reduceDim(dstFillDim); - TensorInfo<unsigned int> indicesInfo(state, indices); + TensorInfo<float, unsigned int> indicesInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, indices); indicesInfo.collapseDims(); // A reasonable choice for when to have each thread iterate over @@ -579,11 +593,13 @@ void THCudaTensor_indexFill(THCState *state, THCudaTensor *dst, int dim, THCudaT } } } else { - TensorInfo<unsigned long> dstInfo(state, dst); + TensorInfo<float, unsigned long> 
dstInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, dst); int dstFillDim = dstInfo.collapseDims(dim); - dstInfo.sizes[dstFillDim] = 1; + dstInfo.reduceDim(dstFillDim); - TensorInfo<unsigned long> indicesInfo(state, indices); + TensorInfo<float, unsigned long> indicesInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, indices); indicesInfo.collapseDims(); LARGE_INDEX(unsigned long, -1, -1); @@ -600,9 +616,9 @@ void THCudaTensor_indexFill(THCState *state, THCudaTensor *dst, int dim, THCudaT // indexSelectLargeIndex kernel is a better choice to increase // parallelism. template <typename IndexType, int DstDim, int SrcDim, int IdxDim> -__global__ void indexSelectSmallIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> src, - TensorInfo<IndexType> indices, +__global__ void indexSelectSmallIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> indices, int dstSelectDim, int srcSelectDim, IndexType innerSize, @@ -615,7 +631,7 @@ __global__ void indexSelectSmallIndex(TensorInfo<IndexType> dst, for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) { // Lua indices begin at 1 IndexType srcIndex = - indices.data[IndexToOffset<IndexType, IdxDim>::get(dstIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(dstIndex, indices)] - 1; if (srcIndex < srcSelectDimSize) { // We stride over the output ignoring the indexed dimension @@ -624,11 +640,11 @@ __global__ void indexSelectSmallIndex(TensorInfo<IndexType> dst, linearIndex < innerSize; linearIndex += gridDim.x * blockDim.x) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(linearIndex, dst); + IndexToOffset<float, IndexType, DstDim>::get(linearIndex, dst); dstOffset += dstIndex * dst.strides[dstSelectDim]; IndexType srcOffset = - IndexToOffset<IndexType, SrcDim>::get(linearIndex, src); + IndexToOffset<float, IndexType, SrcDim>::get(linearIndex, src); srcOffset += srcIndex * src.strides[srcSelectDim]; dst.data[dstOffset] = src.data[srcOffset]; @@ -644,9 +660,9 @@ __global__ void indexSelectSmallIndex(TensorInfo<IndexType> dst, // indexSelectSmallIndex kernel is a better choice to reduce memory // accesses. 
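All of these kernels lean on IndexToOffset, which walks sizes and strides from the innermost dimension outward. A worked example with made-up values shows the arithmetic for the dynamic-dimension (-1) specialization:

// TensorInfo: dims = 2, sizes = {3, 4}, strides = {8, 2}
// (a non-contiguous 3x4 view)
// IndexToOffset<float, IndexType, -1>::get(7, info):
//   i = 1: 7 % 4 = 3  ->  offset += 3 * 2 = 6;  linearId = 7 / 4 = 1
//   i = 0: 1 % 3 = 1  ->  offset += 1 * 8 = 8
//   result: offset = 14   (row 1, column 3 of the view)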
template <typename IndexType, int DstDim, int SrcDim, int IdxDim> -__global__ void indexSelectLargeIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> src, - TensorInfo<IndexType> indices, +__global__ void indexSelectLargeIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> indices, int dstSelectDim, int srcSelectDim, IndexType totalSize, @@ -662,15 +678,15 @@ __global__ void indexSelectLargeIndex(TensorInfo<IndexType> dst, // Lua indices begin at 1 IndexType srcIndex = - indices.data[IndexToOffset<IndexType, IdxDim>::get(dstIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(dstIndex, indices)] - 1; if (srcIndex < srcSelectDimSize) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(elementInSlice, dst); + IndexToOffset<float, IndexType, DstDim>::get(elementInSlice, dst); dstOffset += dstIndex * dst.strides[dstSelectDim]; IndexType srcOffset = - IndexToOffset<IndexType, SrcDim>::get(elementInSlice, src); + IndexToOffset<float, IndexType, SrcDim>::get(elementInSlice, src); srcOffset += srcIndex * src.strides[srcSelectDim]; dst.data[dstOffset] = src.data[srcOffset]; @@ -734,10 +750,10 @@ void THCudaTensor_indexSelect(THCState *state, THCudaTensor *dst, THCudaTensor * dstInfo, srcInfo, indicesInfo, \ dstSelectDim, srcSelectDim, sliceSize, srcSelectDimSize); -#define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ - indexSelectLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ - <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \ - dstInfo, srcInfo, indicesInfo, \ +#define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ + indexSelectLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ + <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \ + dstInfo, srcInfo, indicesInfo, \ dstSelectDim, srcSelectDim, dstTotalSize, sliceSize, srcSelectDimSize); dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8))); @@ -746,18 +762,21 @@ void THCudaTensor_indexSelect(THCState *state, THCudaTensor *dst, THCudaTensor * dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, 128L), (long)(mpc * 8))); dim3 largeIndexBlock(std::min(dstTotalSize, 128L)); - if (THC_canUse32BitIndexMath(state, dst) && - THC_canUse32BitIndexMath(state, src) && - THC_canUse32BitIndexMath(state, indices)) { - TensorInfo<unsigned int> dstInfo(state, dst); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, dst) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) { + TensorInfo<float, unsigned int> dstInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, dst); int dstSelectDim = dstInfo.collapseDims(dim); - dstInfo.sizes[dstSelectDim] = 1; + dstInfo.reduceDim(dstSelectDim); - TensorInfo<unsigned int> srcInfo(state, src); + TensorInfo<float, unsigned int> srcInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, src); int srcSelectDim = srcInfo.collapseDims(dim); - srcInfo.sizes[srcSelectDim] = 1; + srcInfo.reduceDim(srcSelectDim); - TensorInfo<unsigned int> indicesInfo(state, indices); + TensorInfo<float, unsigned int> indicesInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, indices); indicesInfo.collapseDims(); // A reasonable choice for when to have each thread iterate over @@ -784,15 +803,18 @@ void THCudaTensor_indexSelect(THCState *state, THCudaTensor *dst, THCudaTensor * } } } else { - TensorInfo<unsigned long> dstInfo(state, dst); + TensorInfo<float, unsigned long> dstInfo = + getTensorInfo<THCudaTensor, unsigned 
long>(state, dst); int dstSelectDim = dstInfo.collapseDims(dim); - dstInfo.sizes[dstSelectDim] = 1; + dstInfo.reduceDim(dstSelectDim); - TensorInfo<unsigned long> srcInfo(state, src); + TensorInfo<float, unsigned long> srcInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, src); int srcSelectDim = srcInfo.collapseDims(dim); - srcInfo.sizes[srcSelectDim] = 1; + srcInfo.reduceDim(srcSelectDim); - TensorInfo<unsigned long> indicesInfo(state, indices); + TensorInfo<float, unsigned long> indicesInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, indices); indicesInfo.collapseDims(); LARGE_INDEX(unsigned long, -1, -1, -1); diff --git a/lib/THC/THCTensorInfo.cuh b/lib/THC/THCTensorInfo.cuh new file mode 100644 index 0000000..5347116 --- /dev/null +++ b/lib/THC/THCTensorInfo.cuh @@ -0,0 +1,279 @@ +#ifndef THC_TENSOR_INFO_INC +#define THC_TENSOR_INFO_INC + +#include <cuda.h> +#include <assert.h> +#include "THCGeneral.h" +#include "THCTensor.h" + +// Maximum number of dimensions allowed for cutorch +#define MAX_CUTORCH_DIMS 25 + +// Warning string for tensor arguments that are too large or have too +// many dimensions +#define CUTORCH_STR(X) #X +#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \ + CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions" + +// CUDA kernel argument that defines tensor layout +template <typename T, typename IndexType> +struct TensorInfo { + TensorInfo(T* p, + int dim, + IndexType sz[MAX_CUTORCH_DIMS], + IndexType st[MAX_CUTORCH_DIMS]); + + // Set the size of the given dimension to 1, as if it were a + // reduction dim (allows you to calculate offsets of the reduction + // slice) + void reduceDim(int dim); + + // Collapses all runs of successive dimensions if the size/strides + // match up within the run and there are no holes between the + // dimensions. + // If excludeDim is set (not -1), then excludeDim will not be + // collapsed with any other dimension. + // Function returns the new dimension index that excludeDim maps to, + // since the collapsed dimensions are <= the input dimensions. + int collapseDims(int excludeDim = -1); + + // Contiguous tensors of more than one dimension are collapsed down + // to one tensor + __host__ __device__ inline bool isContiguous() const { + return (dims == 1 && strides[0] == 1); + } + + T* data; + IndexType sizes[MAX_CUTORCH_DIMS]; + IndexType strides[MAX_CUTORCH_DIMS]; + int dims; +}; + +template <typename T, typename IndexType> +TensorInfo<T, IndexType>::TensorInfo(T* p, + int dim, + IndexType sz[MAX_CUTORCH_DIMS], + IndexType st[MAX_CUTORCH_DIMS]) { + data = p; + dims = dim; + assert(dims > 0 && dims < MAX_CUTORCH_DIMS); + + for (int i = 0; i < dim; ++i) { + sizes[i] = sz[i]; + strides[i] = st[i]; + } +} + +template <typename T, typename IndexType> +void +TensorInfo<T, IndexType>::reduceDim(int dim) { + assert(dim < dims && dim >= 0); + sizes[dim] = 1; +} + +template <typename T, typename IndexType> +int +TensorInfo<T, IndexType>::collapseDims(int excludeDim) { + // Find the innermost dimension not of size 1, since dimensions of size 1 are + // collapsible. + int firstNonOneDim = -1; + + for (int i = dims - 1; i >= 0; --i) { + if (i == excludeDim) { + // We cannot collapse this dimension, even if it is size 1 + firstNonOneDim = i; + break; + } + + if (sizes[i] != 1) { + firstNonOneDim = i; + break; + } + } + + // Special case: if all dimensions are of size 1, then this is a + // single-point tensor that we still have to operate on. Reduce to a + // single point. 
+ if (firstNonOneDim == -1) { + assert(excludeDim == -1); + + dims = 1; + sizes[0] = 1; + strides[0] = 1; + + // Everything effectively got collapsed into this dimension + return 0; + } + + // Count the number of successive dimensions that can be collapsed, from + // innermost to outermost. + int numCollapsed = 0; + + // Skip the leading size 1 dims + numCollapsed += dims - 1 - firstNonOneDim; + + // We perform one pass through to determine how many dimensions we + // can collapse, before calculating the actual size of the collapsed + // dimensions. + // size/strideInner are the size/strides of the previous inner + // non-collapsible dim we encounter. + long sizeInner = sizes[firstNonOneDim]; + long strideInner = strides[firstNonOneDim]; + + for (int i = firstNonOneDim - 1; i >= 0; --i) { + long sizeOuter = sizes[i]; + long strideOuter = strides[i]; + + // Don't collapse this dimension if we want to exclude it from + // collapsing. + // Since this code is attempting to collapse a subsequent + // dimension (i) with the preceding dimension (i + 1), we can only + // perform collapsing if the preceding dimension can be collapsed + // (i.e., not excludeDim) + if ((excludeDim != i) && (excludeDim != i + 1)) { + // The next outermost dimension can be skipped if size 1 + if (sizeOuter == 1) { + ++numCollapsed; + continue; + } + + // If the next outermost dimension is contiguous with the + // previous non-collapsed one, collapse it + if (strideOuter == strideInner * sizeInner) { + ++numCollapsed; + + // This is the run of collapsed dimensions' size + sizeInner = sizeInner * sizeOuter; + continue; + } + } + + // Otherwise, this new outer dimension at `i` cannot be collapsed + // because it is excluded from collapsing, or it is not contiguous + // with the previous inner dimension. + sizeInner = sizeOuter; + strideInner = strideOuter; + } + + // This will be our new size/stride and dimension. + IndexType newSizes[MAX_CUTORCH_DIMS]; + IndexType newStrides[MAX_CUTORCH_DIMS]; + + assert(numCollapsed < dims); + int newDims = dims - numCollapsed; + + // We return the index of the excluded dimension that is excluded + // from being collapsed here. + int returnDim = -1; + + // We perform a second pass through the dimensions to actually + // calculate the size of the collapsed dimensions. + int collapsedIndex = dims - numCollapsed - 1; + newSizes[collapsedIndex] = sizes[firstNonOneDim]; + newStrides[collapsedIndex] = strides[firstNonOneDim]; + + if (firstNonOneDim == excludeDim) { + returnDim = collapsedIndex; + } + + for (int i = firstNonOneDim - 1; i >= 0; --i) { + IndexType sizeOuter = sizes[i]; + IndexType strideOuter = strides[i]; + + if ((excludeDim != i) && (excludeDim != i + 1)) { + if (sizeOuter == 1) { + // skip + continue; + } + + if (strideOuter == newSizes[collapsedIndex] * newStrides[collapsedIndex]) { + // collapse + newSizes[collapsedIndex] *= sizeOuter; + continue; + } + } + + // Otherwise, strides don't match, or dim `i` is excluded from + // collapsing. 
+ --collapsedIndex; + assert(collapsedIndex >= 0); + assert(collapsedIndex < newDims); + newSizes[collapsedIndex] = sizeOuter; + newStrides[collapsedIndex] = strideOuter; + + if (excludeDim == i) { + returnDim = collapsedIndex; + } + } + + // We must have filled all the dimensions we're looking for + assert(collapsedIndex == 0); + assert((excludeDim == -1) || (returnDim != -1)); + + dims = newDims; + + for (int i = 0; i < dims; ++i) { + sizes[i] = newSizes[i]; + strides[i] = newStrides[i]; + } + + // After collapsing, the original `excludeDim` may have been + // renumbered to this new `returnDim`, since some dimensions could + // have been collapsed. + return returnDim; +} + +// Translate a linear index for the apply to a T* offset; +// specialized on `Dims` to reduce nvcc compilation time +template <typename T, typename IndexType, int Dims> +struct IndexToOffset { + static __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo<T, IndexType>& info) { + IndexType offset = 0; + + // Use static dims + for (int i = Dims - 1; i >= 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + + if (i > 0) { + linearId /= info.sizes[i]; + } + } + + return offset; + } +}; + +template <typename T, typename IndexType> +struct IndexToOffset<T, IndexType, -2> { + static inline __host__ __device__ IndexType + get(IndexType linearId, const TensorInfo<T, IndexType>& info) { + return linearId; + } +}; + +template <typename T, typename IndexType> +struct IndexToOffset<T, IndexType, -1> { + static inline __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo<T, IndexType>& info) { + + IndexType offset = 0; + + // Use dynamic dims + for (int i = info.dims - 1; i >= 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + + linearId /= info.sizes[i]; + } + + return offset; + } +}; + +#endif // THC_TENSOR_INFO_INC diff --git a/lib/THC/THCTensorMasked.cu b/lib/THC/THCTensorMasked.cu index ba69425..6ee5bad 100644 --- a/lib/THC/THCTensorMasked.cu +++ b/lib/THC/THCTensorMasked.cu @@ -33,7 +33,7 @@ void THCudaTensor_maskedFill(THCState* state, THCudaTensor_nElement(state, mask), 2, "sizes do not match"); - if (!THCudaTensor_pointwiseApply2(state, tensor, mask, TensorMaskedFillOp(value))) { + if (!THC_pointwiseApply2(state, tensor, mask, TensorMaskedFillOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } @@ -106,7 +106,7 @@ void THCudaTensor_maskedCopy(THCState* state, // update `tensor` where `mask` == 1 but pull from `src` at // maskPrefixSum - bool status = THCudaTensor_pointwiseApply3( + bool status = THC_pointwiseApply3( state, tensor, contigMask, maskPrefixSum, TensorMaskedCopyOp(THCudaTensor_data(state, contigSrc))); @@ -169,7 +169,7 @@ void THCudaTensor_maskedSelect(THCState* state, maskPrefixSumData); // Then copy over the masked elements at their desired output index - bool status = THCudaTensor_pointwiseApply3( + bool status = THC_pointwiseApply3( state, contigMask, maskPrefixSum, src, TensorMaskedSelectOp(THCudaTensor_data(state, tensor))); diff --git a/lib/THC/THCTensorMath.cu b/lib/THC/THCTensorMath.cu index 0690edf..5e140ef 100644 --- a/lib/THC/THCTensorMath.cu +++ b/lib/THC/THCTensorMath.cu @@ -9,66 +9,6 @@ #include <thrust/functional.h> #include <cfloat> -struct TensorFillOp { - TensorFillOp(float v) : val(v) {} - __device__ __forceinline__ void operator()(float* v) { *v = val; 
} - - const float val; -}; - -void THCudaTensor_fill(THCState* state, THCudaTensor *self_, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 1, self_)); - if (!THCudaTensor_pointwiseApply1(state, self_, TensorFillOp(value))) { - THArgCheck(false, 1, CUTORCH_DIM_WARNING); - } - - THCudaCheck(cudaGetLastError()); -} - -void THCudaTensor_zero(THCState *state, THCudaTensor *self_) -{ - THAssert(THCudaTensor_checkGPU(state, 1, self_)); - if (THCudaTensor_isContiguous(state, self_)) { - THCudaCheck(cudaMemsetAsync(THCudaTensor_data(state, self_), - 0, - sizeof(float) * THCudaTensor_nElement(state, self_), - THCState_getCurrentStream(state))); - } else { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorFillOp(0))) { - THArgCheck(false, 1, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -} - -void THCudaTensor_zeros(THCState *state, THCudaTensor *r_, THLongStorage *size) -{ - THAssert(THCudaTensor_checkGPU(state, 1, r_)); - THCudaTensor_resize(state, r_, size, NULL); - THCudaTensor_zero(state, r_); -} - -void THCudaTensor_ones(THCState *state, THCudaTensor *r_, THLongStorage *size) -{ - THAssert(THCudaTensor_checkGPU(state, 1, r_)); - THCudaTensor_resize(state, r_, size, NULL); - THCudaTensor_fill(state, r_, 1); -} - -void THCudaTensor_reshape(THCState *state, THCudaTensor *r_, THCudaTensor *t, THLongStorage *size) -{ - THAssert(THCudaTensor_checkGPU(state, 2, r_, t)); - THCudaTensor_resize(state, r_, size, NULL); - THCudaTensor_copy(state, r_, t); -} - -long THCudaTensor_numel(THCState *state, THCudaTensor *t) -{ - return THCudaTensor_nElement(state, t); -} - void THCudaTensor_cat(THCState *state, THCudaTensor *result, THCudaTensor *ta, THCudaTensor *tb, int dimension) { THCudaTensor* inputs[2]; @@ -138,74 +78,6 @@ void THCudaTensor_catArray(THCState *state, THCudaTensor *result, THCudaTensor * } } -struct TensorCPowOp { - __device__ __forceinline__ void operator()(float* out, float* in) { - *out = powf(*out, *in); - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = powf(*in1, *in2); - } -}; - -void THCudaTensor_cpow(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THArgCheck(THCudaTensor_nElement(state, src1) == - THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - // self = pow(self, src2) - if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorCPowOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCudaTensor_resizeAs(state, self_, src1); - - // self = pow(src1, src2) - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorCPowOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -} - -struct TensorDivOp { - __device__ __forceinline__ void - operator()(float* out, float* in) { - *out /= *in; - } - - __device__ __forceinline__ void - operator()(float* out, float* in1, float* in2) { - *out = *in1 / *in2; - } -}; - -void THCudaTensor_cdiv(THCState* state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THArgCheck(THCudaTensor_nElement(state, src1) == - THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - // self *= src2 - if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorDivOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCudaTensor_resizeAs(state, self_, 
src1); - - // self = src1 * src2 - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorDivOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -} - struct TensorAddCMulOp { TensorAddCMulOp(float v) : val(v) {} @@ -234,7 +106,7 @@ void THCudaTensor_addcmul(THCState *state, THCudaTensor *self_, THCudaTensor *t, THArgCheck(THCudaTensor_nElement(state, src1) == THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorAddCMulOp(value))) { + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddCMulOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } @@ -268,7 +140,7 @@ void THCudaTensor_addcdiv(THCState *state, THCudaTensor *self_, THCudaTensor *t, THArgCheck(THCudaTensor_nElement(state, src1) == THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorAddCDivOp(value))) { + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddCDivOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } @@ -400,3 +272,14 @@ int THCudaTensor_logicalany(THCState *state, THCudaTensor *self) { return (int) result; } + +template <typename T> +struct TensorFillOp { + TensorFillOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* v) { *v = val; } + + const T val; +}; + +#include "generic/THCTensorMath.cu" +#include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h index e31cb2a..6593da2 100644 --- a/lib/THC/THCTensorMath.h +++ b/lib/THC/THCTensorMath.h @@ -4,28 +4,19 @@ #include "THCTensor.h" #include "THCGeneral.h" -THC_API void THCudaTensor_fill(THCState *state, THCudaTensor *self, float value); -THC_API void THCudaTensor_zero(THCState *state, THCudaTensor *self); +#include "generic/THCTensorMath.h" +#include "THCGenerateAllTypes.h" -THC_API void THCudaTensor_zeros(THCState *state, THCudaTensor *r_, THLongStorage *size); -THC_API void THCudaTensor_ones(THCState *state, THCudaTensor *r_, THLongStorage *size); -THC_API void THCudaTensor_reshape(THCState *state, THCudaTensor *r_, THCudaTensor *t, THLongStorage *size); -THC_API long THCudaTensor_numel(THCState *state, THCudaTensor *t); +#include "generic/THCTensorMathPairwise.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathPointwise.h" +#include "THCGenerateAllTypes.h" -THC_API void THCudaTensor_add(THCState *state, THCudaTensor *self, THCudaTensor *src, float value); -THC_API void THCudaTensor_sub(THCState *state, THCudaTensor *self, THCudaTensor *src, float value); -THC_API void THCudaTensor_mul(THCState *state, THCudaTensor *self, THCudaTensor *src, float value); -THC_API void THCudaTensor_div(THCState *state, THCudaTensor *self, THCudaTensor *src, float value); THC_API void THCudaTensor_tril(THCState *state, THCudaTensor *self, THCudaTensor *src, long k); THC_API void THCudaTensor_triu(THCState *state, THCudaTensor *self, THCudaTensor *src, long k); -THC_API void THCudaTensor_cadd(THCState *state, THCudaTensor *self, THCudaTensor *src1, float value, THCudaTensor *src2); -THC_API void THCudaTensor_csub(THCState *state, THCudaTensor *self, THCudaTensor *src1, float value, THCudaTensor *src2); -THC_API void THCudaTensor_cmul(THCState *state, THCudaTensor *self, THCudaTensor *src1, THCudaTensor *src2); -THC_API void THCudaTensor_cpow(THCState *state, THCudaTensor *self, THCudaTensor *src1, THCudaTensor *src2); -THC_API void THCudaTensor_cdiv(THCState *state, THCudaTensor 
*self, THCudaTensor *src1, THCudaTensor *src2); - THC_API void THCudaTensor_addcmul(THCState *state, THCudaTensor *self, THCudaTensor* t, float value, THCudaTensor *src1, THCudaTensor *src2); THC_API void THCudaTensor_addcdiv(THCState *state, THCudaTensor *self, THCudaTensor* t, float value, THCudaTensor *src1, THCudaTensor *src2); diff --git a/lib/THC/THCTensorMath2.cu b/lib/THC/THCTensorMath2.cu index 2cc2adb..0d6bb52 100644 --- a/lib/THC/THCTensorMath2.cu +++ b/lib/THC/THCTensorMath2.cu @@ -31,13 +31,13 @@ void THCudaTensor_pow(THCState *state, THCudaTensor *self_, THCudaTensor *src, f { THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); if (self_ == src) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorPowOp(value))) { + if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorPowOp(value))) { + if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -63,13 +63,13 @@ void THCudaTensor_tpow(THCState *state, THCudaTensor *self_, float value, THCuda { THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); if (self_ == src) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorTPowOp(value))) { + if (!THC_pointwiseApply1(state, self_, TensorTPowOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorTPowOp(value))) { + if (!THC_pointwiseApply2(state, self_, src, TensorTPowOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -90,7 +90,7 @@ void THCudaTensor_atan2(THCState *state, THCudaTensor *self_, THCudaTensor *tx, THCudaTensor_nElement(state, ty), 3, "sizes do not match"); THCudaTensor_resizeAs(state, self_, tx); - if (!THCudaTensor_pointwiseApply3(state, self_, tx, ty, TensorATan2Op())) { + if (!THC_pointwiseApply3(state, self_, tx, ty, TensorATan2Op())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } @@ -116,13 +116,13 @@ void THCudaTensor_clamp(THCState *state, THCudaTensor *self_, THCudaTensor *src, { THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); if (self_ == src) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorClampOp(min_value, max_value))) { + if (!THC_pointwiseApply1(state, self_, TensorClampOp(min_value, max_value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorClampOp(min_value, max_value))) { + if (!THC_pointwiseApply2(state, self_, src, TensorClampOp(min_value, max_value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -146,13 +146,13 @@ void THCudaTensor_sign(THCState *state, THCudaTensor *self_, THCudaTensor *src) { THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); if (self_ == src) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorSignOp())) { + if (!THC_pointwiseApply1(state, self_, TensorSignOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorSignOp())) { + if (!THC_pointwiseApply2(state, self_, src, TensorSignOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -192,7 +192,7 @@ void THCudaTensor_lerp(THCState *state, THCudaTensor *result, THCudaTensor *a, T THCudaTensor_nElement(state, b), 3, "sizes do not match"); THCudaTensor_resizeAs(state, 
result, a); - if (!THCudaTensor_pointwiseApply3(state, result, a, b, TensorLerpOp(w))) { + if (!THC_pointwiseApply3(state, result, a, b, TensorLerpOp(w))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } @@ -699,7 +699,7 @@ THC_API void THCudaTensor_cross(THCState *state, THCudaTensor *self, THCudaTenso THCudaTensor *nx = THCudaTensor_newNarrow(state, x, dimension, 0, 1); THCudaTensor *ny = THCudaTensor_newNarrow(state, y, dimension, 0, 1); THCudaTensor *nself = THCudaTensor_newNarrow(state, self, dimension, 0, 1); - if (!THCudaTensor_pointwiseApply3(state, nself, nx, ny, TensorCrossOp(sx, sy, so))) { + if (!THC_pointwiseApply3(state, nself, nx, ny, TensorCrossOp(sx, sy, so))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } THCudaTensor_free(state, nx); diff --git a/lib/THC/THCTensorMathCompare.cu b/lib/THC/THCTensorMathCompare.cu index 0f7d979..71cfdd5 100644 --- a/lib/THC/THCTensorMathCompare.cu +++ b/lib/THC/THCTensorMathCompare.cu @@ -11,7 +11,7 @@ void THCudaTensor_logicalValue(THCState *state, THCudaTensor *self_, THCudaTenso { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, op)) { + if (!THC_pointwiseApply2(state, self_, src, op)) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } diff --git a/lib/THC/THCTensorMathCompareT.cu b/lib/THC/THCTensorMathCompareT.cu index b88f153..0567a89 100644 --- a/lib/THC/THCTensorMathCompareT.cu +++ b/lib/THC/THCTensorMathCompareT.cu @@ -11,7 +11,7 @@ void THCudaTensor_logicalTensor(THCState *state, THCudaTensor *self_, THCudaTens THCudaTensor_resizeAs(state, self_, src1); THArgCheck(THCudaTensor_nElement(state, src1) == THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, op)) { + if (!THC_pointwiseApply3(state, self_, src1, src2, op)) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu index 383344b..2c081d1 100644 --- a/lib/THC/THCTensorMathPairwise.cu +++ b/lib/THC/THCTensorMathPairwise.cu @@ -1,96 +1,98 @@ #include "THCTensorMath.h" #include "THCGeneral.h" #include "THCBlas.h" +#include "THCHalf.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCReduce.cuh" +template <typename T> struct TensorAddConstantOp { - TensorAddConstantOp(float v) : val(v) {} - __device__ __forceinline__ void operator()(float* out, float* in) { + TensorAddConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { *out = *in + val; } - __device__ __forceinline__ void operator()(float* v) { + __device__ __forceinline__ void operator()(T* v) { *v += val; } - const float val; + const T val; }; -void THCudaTensor_add(THCState *state, THCudaTensor *self_, THCudaTensor *src_, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self_, src_)); - if (self_ == src_) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorAddConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCudaTensor_resizeAs(state, self_, src_); - - if (!THCudaTensor_pointwiseApply2(state, self_, src_, TensorAddConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorAddConstantOp<half> { + TensorAddConstantOp(half v) : val(v) {} + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*in, val); +#else + float fin = __half2float(*in); + float fval = __half2float(val); + float fout = fin + fval; + *out = 
__float2half(fout); +#endif } - THCudaCheck(cudaGetLastError()); -} + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + *v = __hadd(*v, val); +#else + float fv = __half2float(*v); + float fval = __half2float(val); + fv += fval; + *v = __float2half(fv); +#endif + } -void THCudaTensor_sub(THCState *state, THCudaTensor *self_, THCudaTensor *src_, float value) -{ - THCudaTensor_add(state, self_, src_, -value); -} + const half val; +}; +#endif // CUDA_HALF_TENSOR +template <typename T> struct TensorMulConstantOp { - TensorMulConstantOp(float v) : val(v) {} - __device__ __forceinline__ void operator()(float* out, float* in) { + TensorMulConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { *out = *in * val; } - __device__ __forceinline__ void operator()(float* v) { + __device__ __forceinline__ void operator()(T* v) { *v *= val; } - const float val; + const T val; }; -void THCudaTensor_mul(THCState *state, THCudaTensor *self_, THCudaTensor *src_, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self_, src_)); - if (self_ == src_) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorMulConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCudaTensor_resizeAs(state, self_, src_); - - if (!THCudaTensor_pointwiseApply2(state, self_, src_, TensorMulConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorMulConstantOp<half> { + TensorMulConstantOp(half v) : val(v) {} + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hmul(*in, val); +#else + float fin = __half2float(*in); + float fval = __half2float(val); + float fout = fin * fval; + *out = __float2half(fout); +#endif } - THCudaCheck(cudaGetLastError()); -} - -void THCudaTensor_div(THCState* state, THCudaTensor *self_, THCudaTensor *src_, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self_, src_)); - THArgCheck(value != 0.0f, 3, "divide by zero"); - - if (self_ == src_) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorMulConstantOp(1.0f / value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCudaTensor_resizeAs(state, self_, src_); - - if (!THCudaTensor_pointwiseApply2(state, self_, src_, TensorMulConstantOp(1.0f / value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + *v = __hmul(*v, val); +#else + float fv = __half2float(*v); + float fval = __half2float(val); + fv *= fval; + *v = __float2half(fv); +#endif } - THCudaCheck(cudaGetLastError()); -} + const half val; +}; +#endif // CUDA_HALF_TENSOR template <int Upper> struct TensorTriOp { @@ -143,13 +145,13 @@ void THCudaTensor_tril(THCState *state, THCudaTensor *self_, THCudaTensor *src_, TensorTriOp<0> op(start, stride0, stride1, k); if (self_ == src_) { - if (!THCudaTensor_pointwiseApply1(state, src, op)) { + if (!THC_pointwiseApply1(state, src, op)) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, op)) { + if (!THC_pointwiseApply2(state, self_, src, op)) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -176,13 +178,13 @@ void THCudaTensor_triu(THCState *state, THCudaTensor *self_, THCudaTensor *src_, TensorTriOp<1> op(start, stride0, stride1, k); if (self_ == src_) { - if (!THCudaTensor_pointwiseApply1(state, src, op)) 
{ + if (!THC_pointwiseApply1(state, src, op)) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, op)) { + if (!THC_pointwiseApply2(state, self_, src, op)) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -192,3 +194,6 @@ void THCudaTensor_triu(THCState *state, THCudaTensor *self_, THCudaTensor *src_, THCudaCheck(cudaGetLastError()); } + +#include "generic/THCTensorMathPairwise.cu" +#include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorMathPointwise.cu b/lib/THC/THCTensorMathPointwise.cu index 80b1ead..72b16e8 100644 --- a/lib/THC/THCTensorMathPointwise.cu +++ b/lib/THC/THCTensorMathPointwise.cu @@ -1,6 +1,7 @@ #include "THCTensorMath.h" #include "THCGeneral.h" #include "THCBlas.h" +#include "THCHalf.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCReduce.cuh" @@ -19,13 +20,13 @@ void THCudaTensor_##NAME(THCState* state, THCudaTensor* self_, THCudaTensor* src) { \ THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); \ if (self_ == src) { \ - if (!THCudaTensor_pointwiseApply1(state, self_, Tensor##NAME##Op())) { \ + if (!THC_pointwiseApply1(state, self_, Tensor##NAME##Op())) { \ THArgCheck(false, 2, CUTORCH_DIM_WARNING); \ } \ } else { \ THCudaTensor_resizeAs(state, self_, src); \ \ - if (!THCudaTensor_pointwiseApply2(state, self_, src, Tensor##NAME##Op())) { \ + if (!THC_pointwiseApply2(state, self_, src, Tensor##NAME##Op())) { \ THArgCheck(false, 2, CUTORCH_DIM_WARNING); \ } \ } \ @@ -76,171 +77,13 @@ struct TensorSigmoidOp { void THCudaTensor_sigmoid(THCState* state, THCudaTensor* self_, THCudaTensor* src) { THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); if (self_ == src) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorSigmoidOp())) { + if (!THC_pointwiseApply1(state, self_, TensorSigmoidOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorSigmoidOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -} - -struct TensorAddOp { - __device__ __forceinline__ void operator()(float* out, float* in) { - *out += *in; - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = *in1 + *in2; - } -}; - -struct TensorCAddOp { - TensorCAddOp(float v) : val(v) {} - - __device__ __forceinline__ void operator()(float* out, float* in) { - *out += val * *in; - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = *in1 + val * *in2; - } - - float val; -}; - -void THCudaTensor_cadd(THCState *state, THCudaTensor *self_, THCudaTensor* src1, float value, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THArgCheck(THCudaTensor_nElement(state, src1) == - THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - if (value == 1.0f) { - // self += src2 - if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorAddOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - // self += value * src2 - if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorCAddOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - } else { - THCudaTensor_resizeAs(state, self_, src1); - - if (value == 1.0f) { - // self = src1 + src2 - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorAddOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); 
- } - } else { - // self = src1 + value * src2 - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorCAddOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - } - - THCudaCheck(cudaGetLastError()); -} - -struct TensorSubOp { - __device__ __forceinline__ void operator()(float* out, float* in) { - *out -= *in; - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = *in1 - *in2; - } -}; - - -struct TensorCSubOp { - TensorCSubOp(float v) : val(v) {} - - __device__ __forceinline__ void operator()(float* out, float* in) { - *out -= val * *in; - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = *in1 - val * *in2; - } - - float val; -}; - - -void THCudaTensor_csub(THCState *state, THCudaTensor *self_, THCudaTensor* src1, float value, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THArgCheck(THCudaTensor_nElement(state, src1) == - THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - if (value == 1.0f) { - // self -= src2 - if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorSubOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - // self += -value * src2 - if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorCAddOp(-value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - } else { - THCudaTensor_resizeAs(state, self_, src1); - - if (value == 1.0f) { - // self = src1 - src2 - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorSubOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - // self = src1 - value * src2 - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorCAddOp(-value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - } - - THCudaCheck(cudaGetLastError()); -} - - -struct TensorMulOp { - __device__ __forceinline__ void operator()(float* out, float* in) { - *out *= *in; - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = *in1 * *in2; - } -}; - -void THCudaTensor_cmul(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THArgCheck(THCudaTensor_nElement(state, src1) == - THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - // self *= src2 - if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorMulOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCudaTensor_resizeAs(state, self_, src1); - - // self = src1 * src2 - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorMulOp())) { + if (!THC_pointwiseApply2(state, self_, src, TensorSigmoidOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -265,12 +108,12 @@ void THCudaTensor_cmax(THCState *state, THCudaTensor *self, THCudaTensor *src1, THCudaTensor_nElement(state, src2), 2, "sizes do not match"); if (self == src1) { - if (!THCudaTensor_pointwiseApply2(state, self, src2, TensorMaxOp())) { + if (!THC_pointwiseApply2(state, self, src2, TensorMaxOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self, src1); - if (!THCudaTensor_pointwiseApply3(state, self, src1, src2, TensorMaxOp())) { + if (!THC_pointwiseApply3(state, self, src1, src2, TensorMaxOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -293,12 +136,12 @@ void THCudaTensor_cmin(THCState *state, THCudaTensor *self, THCudaTensor *src1, 
THCudaTensor_nElement(state, src2), 2, "sizes do not match"); if (self == src1) { - if (!THCudaTensor_pointwiseApply2(state, self, src2, TensorMinOp())) { + if (!THC_pointwiseApply2(state, self, src2, TensorMinOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self, src1); - if (!THCudaTensor_pointwiseApply3(state, self, src1, src2, TensorMinOp())) { + if (!THC_pointwiseApply3(state, self, src1, src2, TensorMinOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -323,12 +166,12 @@ void THCudaTensor_cmaxValue(THCState *state, THCudaTensor *self, THCudaTensor *s THAssert(THCudaTensor_checkGPU(state, 2, self, src)); if (self == src) { - if (!THCudaTensor_pointwiseApply1(state, self, TensorMaxValueOp(value))) { + if (!THC_pointwiseApply1(state, self, TensorMaxValueOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self, src); - if (!THCudaTensor_pointwiseApply2(state, self, src, TensorMaxValueOp(value))) { + if (!THC_pointwiseApply2(state, self, src, TensorMaxValueOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -353,13 +196,259 @@ void THCudaTensor_cminValue(THCState *state, THCudaTensor *self, THCudaTensor *s THAssert(THCudaTensor_checkGPU(state, 2, self, src)); if (self == src) { - if (!THCudaTensor_pointwiseApply1(state, self, TensorMinValueOp(value))) { + if (!THC_pointwiseApply1(state, self, TensorMinValueOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self, src); - if (!THCudaTensor_pointwiseApply2(state, self, src, TensorMinValueOp(value))) { + if (!THC_pointwiseApply2(state, self, src, TensorMinValueOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } } + +template <typename T> +struct TensorAddOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out += *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 + *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorAddOp<half> { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*out, *in); +#else + float fout = __half2float(*out); + float fin = __half2float(*in); + fout += fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*in1, *in2); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 + fin2; + *out = __float2half(fout); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +template <typename T> +struct TensorCAddOp { + TensorCAddOp(T v) : val(v) {} + + __device__ __forceinline__ void operator()(T* out, T* in) { + *out += val * *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 + val * *in2; + } + + T val; +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorCAddOp<half> { + TensorCAddOp(half v) : val(v) {} + + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*out, __hmul(val, *in)); +#else + float fout = __half2float(*out); + float fval = __half2float(val); + float fin = __half2float(*in); + + fout += fval * fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*in1, __hmul(val, *in2)); +#else + float fin1 = __half2float(*in1); + float fin2 = 
__half2float(*in2); + float fval = __half2float(val); + + float fout = fin1 + fval * fin2; + *out = __float2half(fout); +#endif + } + + half val; +}; +#endif // CUDA_HALF_TENSOR + +template <typename T> +struct TensorSubOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out -= *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 - *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorSubOp<half> { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hsub(*out, *in); +#else + float fout = __half2float(*out); + float fin = __half2float(*in); + fout -= fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hsub(*in1, *in2); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 - fin2; + *out = __float2half(fout); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +template <typename T> +struct TensorMulOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out *= *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 * *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorMulOp<half> { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hmul(*out, *in); +#else + float fout = __half2float(*out); + float fin = __half2float(*in); + fout *= fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hmul(*in1, *in2); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 * fin2; + *out = __float2half(fout); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +template <typename T> +struct TensorCPowOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = powf((float) *out, (float) *in); + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = powf((float) *in1, (float) *in2); + } +}; + +template <> +struct TensorCPowOp<double> { + __device__ __forceinline__ void operator()(double* out, double* in) { + *out = pow(*out, *in); + } + + __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) { + *out = pow(*in1, *in2); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorCPowOp<half> { + __device__ __forceinline__ void operator()(half* out, half* in) { + // No fp16 pow function yet + float fout = __half2float(*out); + float fin = __half2float(*in); + fout = powf(fout, fin); + *out = __float2half(fout); + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { + // No fp16 pow function yet + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = powf(fin1, fin2); + *out = __float2half(fout); + } +}; +#endif // CUDA_HALF_TENSOR + +template <typename T> +struct TensorDivOp { + __device__ __forceinline__ void + operator()(T* out, T* in) { + *out /= *in; + } + + __device__ __forceinline__ void + operator()(T* out, T* in1, T* in2) { + *out = *in1 / *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorDivOp<half> { + __device__ __forceinline__ void + operator()(half* out, half* in) { + // No fp16 div instruction yet + float fout = __half2float(*out); + float fin = __half2float(*in); + fout /= fin; + *out = 
__float2half(fout); + } + + __device__ __forceinline__ void + operator()(half* out, half* in1, half* in2) { + // No fp16 div instruction yet + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 / fin2; + *out = __float2half(fout); + } +}; +#endif // CUDA_HALF_TENSOR + +#include "generic/THCTensorMathPointwise.cu" +#include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorScatterGather.cu b/lib/THC/THCTensorScatterGather.cu index 876f32a..1059b03 100644 --- a/lib/THC/THCTensorScatterGather.cu +++ b/lib/THC/THCTensorScatterGather.cu @@ -2,7 +2,6 @@ #include "THCGeneral.h" #include "THCApply.cuh" - // Compute the offsets into the given tensors for a linear index. For the 't2' // tensor, dimension 'dim' is skipped. The tensors are assumed to have the same // size (with the exception of 't2' in dimension 'dim'). @@ -11,9 +10,9 @@ template <typename IndexType, int Dims> struct IndexToScatterGatherOffsets { static __device__ void compute( IndexType linearId, const int dim, - const TensorInfo<IndexType>& index, IndexType* indexOffset, - const TensorInfo<IndexType>& t1, IndexType* t1Offset, - const TensorInfo<IndexType>& t2, IndexType* t2Offset) { + const TensorInfo<float, IndexType>& index, IndexType* indexOffset, + const TensorInfo<float, IndexType>& t1, IndexType* t1Offset, + const TensorInfo<float, IndexType>& t2, IndexType* t2Offset) { for (int d = Dims - 1; d >= 0; d--) { IndexType curDimIndex = linearId % index.sizes[d]; *indexOffset += curDimIndex * index.strides[d]; @@ -27,8 +26,8 @@ struct IndexToScatterGatherOffsets { static __device__ void compute( IndexType linearId, const int dim, - const TensorInfo<IndexType>& index, IndexType* indexOffset, - const TensorInfo<IndexType>& t2, IndexType* t2Offset) { + const TensorInfo<float, IndexType>& index, IndexType* indexOffset, + const TensorInfo<float, IndexType>& t2, IndexType* t2Offset) { for (int d = Dims - 1; d >= 0; d--) { IndexType curDimIndex = linearId % index.sizes[d]; *indexOffset += curDimIndex * index.strides[d]; @@ -45,9 +44,9 @@ template <typename IndexType> struct IndexToScatterGatherOffsets<IndexType, -1> { static __device__ void compute( IndexType linearId, const int dim, - const TensorInfo<IndexType>& index, IndexType* indexOffset, - const TensorInfo<IndexType>& t1, IndexType* t1Offset, - const TensorInfo<IndexType>& t2, IndexType* t2Offset) { + const TensorInfo<float, IndexType>& index, IndexType* indexOffset, + const TensorInfo<float, IndexType>& t1, IndexType* t1Offset, + const TensorInfo<float, IndexType>& t2, IndexType* t2Offset) { for (int d = index.dims - 1; d >= 0; d--) { IndexType curDimIndex = linearId % index.sizes[d]; *indexOffset += curDimIndex * index.strides[d]; @@ -61,8 +60,8 @@ struct IndexToScatterGatherOffsets<IndexType, -1> { static __device__ void compute( IndexType linearId, const int dim, - const TensorInfo<IndexType>& index, IndexType* indexOffset, - const TensorInfo<IndexType>& t2, IndexType* t2Offset) { + const TensorInfo<float, IndexType>& index, IndexType* indexOffset, + const TensorInfo<float, IndexType>& t2, IndexType* t2Offset) { for (int d = index.dims - 1; d >= 0; d--) { IndexType curDimIndex = linearId % index.sizes[d]; *indexOffset += curDimIndex * index.strides[d]; @@ -77,9 +76,9 @@ struct IndexToScatterGatherOffsets<IndexType, -1> { template <typename IndexType, int Dims> __global__ void THCudaTensor_gatherKernel( - TensorInfo<IndexType> tensor, - TensorInfo<IndexType> src, - TensorInfo<IndexType> index, + TensorInfo<float, IndexType> tensor, + 
TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> index, const int dim, const IndexType totalElements) { for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; @@ -135,17 +134,20 @@ void THCudaTensor_gather(THCState* state, THCudaTensor *tensor, THCudaTensor *sr THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); THCudaTensor* oldTensor = NULL; - if (THC_overlappingIndices(state, tensor)) { + if (TensorUtils<THCudaTensor>::overlappingIndices(state, tensor)) { oldTensor = tensor; tensor = THCudaTensor_newContiguous(state, tensor); } - if (THC_canUse32BitIndexMath(state, tensor) && - THC_canUse32BitIndexMath(state, src) && - THC_canUse32BitIndexMath(state, index)) { - TensorInfo<unsigned int> tensorInfo(state, tensor); - TensorInfo<unsigned int> srcInfo(state, src); - TensorInfo<unsigned int> indexInfo(state, index); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, tensor) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, index)) { + TensorInfo<float, unsigned int> tensorInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, tensor); + TensorInfo<float, unsigned int> srcInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, src); + TensorInfo<float, unsigned int> indexInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, index); // Specialize for a small number of dimensions. switch (indexInfo.dims) { @@ -163,15 +165,18 @@ void THCudaTensor_gather(THCState* state, THCudaTensor *tensor, THCudaTensor *sr break; } } else { - TensorInfo<unsigned long> tensorInfo(state, tensor); - TensorInfo<unsigned long> srcInfo(state, src); - TensorInfo<unsigned long> indexInfo(state, index); + TensorInfo<float, unsigned long> tensorInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, tensor); + TensorInfo<float, unsigned long> srcInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, src); + TensorInfo<float, unsigned long> indexInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, index); RUN(unsigned long, -1) } if (oldTensor) { - THCudaTensor_copyIgnoringOverlaps(state, oldTensor, tensor); + TensorUtils<THCudaTensor>::copyIgnoringOverlaps(state, oldTensor, tensor); THCudaTensor_free(state, tensor); tensor = oldTensor; } @@ -182,9 +187,9 @@ void THCudaTensor_gather(THCState* state, THCudaTensor *tensor, THCudaTensor *sr template <typename IndexType, int Dims> __global__ void THCudaTensor_scatterKernel( - TensorInfo<IndexType> tensor, - TensorInfo<IndexType> src, - TensorInfo<IndexType> index, + TensorInfo<float, IndexType> tensor, + TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> index, const int dim, const IndexType totalElements) { for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; @@ -230,7 +235,7 @@ void THCudaTensor_scatter(THCState* state, THCudaTensor *tensor, int dim, THCuda } } - THArgCheck(THCudaTensor_nDimension(state, tensor) <= MAX_CUTORCH_DIMS, + THArgCheck(THCudaTensor_nDimension(state, tensor) <= MAX_CUTORCH_DIMS, 1, CUTORCH_DIM_WARNING); const long totalElements = THCudaTensor_nElement(state, index); @@ -239,17 +244,20 @@ void THCudaTensor_scatter(THCState* state, THCudaTensor *tensor, int dim, THCuda THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); THCudaTensor* oldTensor = NULL; - if (THC_overlappingIndices(state, tensor)) { + if (TensorUtils<THCudaTensor>::overlappingIndices(state, tensor)) { oldTensor = tensor; tensor = THCudaTensor_newContiguous(state, tensor); } - 
if (THC_canUse32BitIndexMath(state, tensor) && - THC_canUse32BitIndexMath(state, src) && - THC_canUse32BitIndexMath(state, index)) { - TensorInfo<unsigned int> tensorInfo(state, tensor); - TensorInfo<unsigned int> srcInfo(state, src); - TensorInfo<unsigned int> indexInfo(state, index); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, tensor) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, index)) { + TensorInfo<float, unsigned int> tensorInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, tensor); + TensorInfo<float, unsigned int> srcInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, src); + TensorInfo<float, unsigned int> indexInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, index); // Specialize for a small number of dimensions. switch (indexInfo.dims) { @@ -267,15 +275,18 @@ void THCudaTensor_scatter(THCState* state, THCudaTensor *tensor, int dim, THCuda break; } } else { - TensorInfo<unsigned long> tensorInfo(state, tensor); - TensorInfo<unsigned long> srcInfo(state, src); - TensorInfo<unsigned long> indexInfo(state, index); + TensorInfo<float, unsigned long> tensorInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, tensor); + TensorInfo<float, unsigned long> srcInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, src); + TensorInfo<float, unsigned long> indexInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, index); RUN(unsigned long, -1) } if (oldTensor) { - THCudaTensor_copyIgnoringOverlaps(state, oldTensor, tensor); + TensorUtils<THCudaTensor>::copyIgnoringOverlaps(state, oldTensor, tensor); THCudaTensor_free(state, tensor); tensor = oldTensor; } @@ -286,8 +297,8 @@ void THCudaTensor_scatter(THCState* state, THCudaTensor *tensor, int dim, THCuda template <typename IndexType, int Dims> __global__ void THCudaTensor_scatterFillKernel( - TensorInfo<IndexType> tensor, - TensorInfo<IndexType> index, + TensorInfo<float, IndexType> tensor, + TensorInfo<float, IndexType> index, float value, const int dim, const IndexType totalElements) { @@ -313,17 +324,21 @@ __global__ void THCudaTensor_scatterFillKernel( <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ tensorInfo, indexInfo, value, dim, (TYPE)totalElements); -void THCudaTensor_scatterFill(THCState* state, THCudaTensor *tensor, int dim, THCudaTensor *index, float value) { +void +THCudaTensor_scatterFill(THCState* state, THCudaTensor *tensor, + int dim, THCudaTensor *index, float value) { THAssert(THCudaTensor_checkGPU(state, 2, tensor, index)); THArgCheck(dim >= 0 && dim < THCudaTensor_nDimension(state, tensor), 2, "Index dimension is out of bounds"); - THArgCheck(THCudaTensor_nDimension(state, index) == THCudaTensor_nDimension(state, tensor), 3, + THArgCheck(THCudaTensor_nDimension(state, index) == + THCudaTensor_nDimension(state, tensor), 3, "Index tensor must have same dimensions as output tensor"); for (int d = 0; d < THCudaTensor_nDimension(state, tensor); d++) { if (d != dim) { - THArgCheck(THCudaTensor_size(state, tensor, d) == THCudaTensor_size(state, index, d), 4, + THArgCheck(THCudaTensor_size(state, tensor, d) == + THCudaTensor_size(state, index, d), 4, "Index tensor must have same size as output tensor apart from the specified dimension"); } } @@ -337,15 +352,17 @@ void THCudaTensor_scatterFill(THCState* state, THCudaTensor *tensor, int dim, TH THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); THCudaTensor* oldTensor = NULL; - if (THC_overlappingIndices(state, 
tensor)) { + if (TensorUtils<THCudaTensor>::overlappingIndices(state, tensor)) { oldTensor = tensor; tensor = THCudaTensor_newContiguous(state, tensor); } - if (THC_canUse32BitIndexMath(state, tensor) && - THC_canUse32BitIndexMath(state, index)) { - TensorInfo<unsigned int> tensorInfo(state, tensor); - TensorInfo<unsigned int> indexInfo(state, index); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, tensor) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, index)) { + TensorInfo<float, unsigned int> tensorInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, tensor); + TensorInfo<float, unsigned int> indexInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, index); // Specialize for a small number of dimensions. switch (indexInfo.dims) { @@ -363,14 +380,16 @@ void THCudaTensor_scatterFill(THCState* state, THCudaTensor *tensor, int dim, TH break; } } else { - TensorInfo<unsigned long> tensorInfo(state, tensor); - TensorInfo<unsigned long> indexInfo(state, index); + TensorInfo<float, unsigned long> tensorInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, tensor); + TensorInfo<float, unsigned long> indexInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, index); RUN(unsigned long, -1); } if (oldTensor) { - THCudaTensor_copyIgnoringOverlaps(state, oldTensor, tensor); + TensorUtils<THCudaTensor>::copyIgnoringOverlaps(state, oldTensor, tensor); THCudaTensor_free(state, tensor); tensor = oldTensor; } diff --git a/lib/THC/THCTensorSort.cu b/lib/THC/THCTensorSort.cu index c35a36a..3b9562d 100644 --- a/lib/THC/THCTensorSort.cu +++ b/lib/THC/THCTensorSort.cu @@ -1,6 +1,7 @@ #include "THCReduceApplyUtils.cuh" #include "THCSortUtils.cuh" #include "THCTensorCopy.h" +#include "THCTensorTypeUtils.cuh" #include <thrust/device_ptr.h> #include <thrust/sort.h> @@ -28,7 +29,7 @@ unsigned long nextHighestPowerOf2(unsigned long n) { // `sliceSize - 1`. 
template <typename IndexType, int Dim> __global__ void -fillSliceWithIndex(TensorInfo<IndexType> out, +fillSliceWithIndex(TensorInfo<float, IndexType> out, IndexType totalSlices, IndexType sliceSize, IndexType sliceStride) { @@ -39,7 +40,7 @@ fillSliceWithIndex(TensorInfo<IndexType> out, } const unsigned long offset = - IndexToOffset<IndexType, Dim>::get(slice, out); + IndexToOffset<float, IndexType, Dim>::get(slice, out); float* base = &out.data[offset]; for (long i = threadIdx.x; i < sliceSize; i += blockDim.x) { @@ -76,9 +77,10 @@ void THCudaTensor_fillSliceWithIndex(THCState* state, <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ info, numSlices, sliceSize, info.strides[collapseDim]) - if (THC_canUse32BitIndexMath(state, t)) { - TensorInfo<unsigned int> info(state, t, dim); - info.sizes[dim] = 1; + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, t)) { + TensorInfo<float, unsigned int> info = + getTensorInfo<THCudaTensor, unsigned int>(state, t); + info.reduceDim(dim); int collapseDim = info.collapseDims(dim); if (info.isContiguous()) { @@ -93,8 +95,9 @@ void THCudaTensor_fillSliceWithIndex(THCState* state, } } } else { - TensorInfo<unsigned long> info(state, t, dim); - info.sizes[dim] = 1; + TensorInfo<float, unsigned long> info = + getTensorInfo<THCudaTensor, unsigned long>(state, t); + info.reduceDim(dim); int collapseDim = info.collapseDims(dim); // catch-all implementation @@ -221,13 +224,15 @@ THC_API void THCudaTensor_sortKeyValueInplace(THCState* state, // The constructed key/value tensor info is used to select the slice // we are sorting on a per-block basis - if (THC_canUse32BitIndexMath(state, key)) { - TensorInfo<unsigned int> keyInfo(state, key); - keyInfo.sizes[dim] = 1; + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, key)) { + TensorInfo<float, unsigned int> keyInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, key); + keyInfo.reduceDim(dim); int collapseKeyDim = keyInfo.collapseDims(dim); - TensorInfo<unsigned int> valueInfo(state, value); - valueInfo.sizes[dim] = 1; + TensorInfo<float, unsigned int> valueInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, value); + valueInfo.reduceDim(dim); int collapseValueDim = valueInfo.collapseDims(dim); if (keyInfo.isContiguous()) { @@ -246,12 +251,14 @@ THC_API void THCudaTensor_sortKeyValueInplace(THCState* state, } } } else { - TensorInfo<unsigned long> keyInfo(state, key); - keyInfo.sizes[dim] = 1; + TensorInfo<float, unsigned long> keyInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, key); + keyInfo.reduceDim(dim); int collapseKeyDim = keyInfo.collapseDims(dim); - TensorInfo<unsigned long> valueInfo(state, value); - valueInfo.sizes[dim] = 1; + TensorInfo<float, unsigned long> valueInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, value); + valueInfo.reduceDim(dim); int collapseValueDim = valueInfo.collapseDims(dim); // long case is rare, just instantiate these versions diff --git a/lib/THC/THCTensorTopK.cu b/lib/THC/THCTensorTopK.cu index a8c189f..582143e 100644 --- a/lib/THC/THCTensorTopK.cu +++ b/lib/THC/THCTensorTopK.cu @@ -3,6 +3,7 @@ #include "THCTensorSort.h" #include "THCAsmUtils.cuh" #include "THCScanUtils.cuh" +#include "THCTensorTypeUtils.cuh" #include <algorithm> // for std::min #if CUDA_VERSION >= 7000 @@ -248,18 +249,18 @@ __device__ void radixSelect(const RadixConverter& conv, } template <typename IndexType, int Dim, bool Order> -__global__ void gatherTopK(TensorInfo<IndexType> input, +__global__ void gatherTopK(TensorInfo<float, IndexType> input, 
IndexType inputSliceSize, IndexType outputSliceSize, // aka `k` IndexType numInputSlices, IndexType inputWithinSliceStride, - TensorInfo<IndexType> topK, + TensorInfo<float, IndexType> topK, IndexType numTopKSlices, IndexType topKWithinSliceStride, - TensorInfo<IndexType> indices, + TensorInfo<float, IndexType> indices, IndexType indicesWithinSliceStride) { // Indices are limited to integer fp precision, so counts can fit in // int32, regardless of IndexType @@ -272,11 +273,11 @@ __global__ void gatherTopK(TensorInfo<IndexType> input, // Find the start offset for our slice IndexType sliceStartIndex = - IndexToOffset<IndexType, Dim>::get(slice, input); + IndexToOffset<float, IndexType, Dim>::get(slice, input); IndexType topKSliceStartIndex = - IndexToOffset<IndexType, Dim>::get(slice, topK); + IndexToOffset<float, IndexType, Dim>::get(slice, topK); IndexType indicesSliceStartIndex = - IndexToOffset<IndexType, Dim>::get(slice, indices); + IndexToOffset<float, IndexType, Dim>::get(slice, indices); float* inputSliceStart = &input.data[sliceStartIndex]; float* topKSliceStart = &topK.data[topKSliceStartIndex]; @@ -414,8 +415,8 @@ THC_API void THCudaTensor_topk(THCState* state, THCudaTensor_resize(state, indices, topKSize, NULL); THLongStorage_free(topKSize); -#define RUN_K(T, DIM, DIR) \ - gatherTopK<T, DIM, DIR> \ +#define RUN_K(INDEX_T, DIM, DIR) \ + gatherTopK<INDEX_T, DIM, DIR> \ <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ inputInfo, \ sliceSize, \ @@ -430,70 +431,73 @@ THC_API void THCudaTensor_topk(THCState* state, indicesInfo, \ indicesInfo.strides[collapseIndicesDim]) -#define RUN_DIR(T, DIM) \ +#define RUN_DIR(INDEX_T, DIM) \ if (dir) { \ - RUN_K(T, DIM, true); \ + RUN_K(INDEX_T, DIM, true); \ } else { \ - RUN_K(T, DIM, false); \ + RUN_K(INDEX_T, DIM, false); \ } -#define RUN_DIM(T) \ +#define RUN_DIM(INDEX_T) \ if (allDims == 1) { \ - RUN_DIR(T, 1); \ + RUN_DIR(INDEX_T, 1); \ } else if (allDims == 2) { \ - RUN_DIR(T, 2); \ + RUN_DIR(INDEX_T, 2); \ } else if (allDims == 3) { \ - RUN_DIR(T, 3); \ + RUN_DIR(INDEX_T, 3); \ } else { \ - RUN_DIR(T, -1); \ + RUN_DIR(INDEX_T, -1); \ } -#define RUN_T(T) \ - TensorInfo<T> inputInfo(state, input); \ - TensorInfo<T> topKInfo(state, topK); \ - TensorInfo<T> indicesInfo(state, indices); \ +#define RUN_T(INDEX_T) \ + TensorInfo<float, INDEX_T> inputInfo = \ + getTensorInfo<THCudaTensor, INDEX_T>(state, input); \ + TensorInfo<float, INDEX_T> topKInfo = \ + getTensorInfo<THCudaTensor, INDEX_T>(state, topK); \ + TensorInfo<float, INDEX_T> indicesInfo = \ + getTensorInfo<THCudaTensor, INDEX_T>(state, indices); \ \ - /* We use these structures solely to find the offset to */ \ - /* each slice we are operating on */ \ - inputInfo.sizes[dim] = 1; \ - topKInfo.sizes[dim] = 1; \ - indicesInfo.sizes[dim] = 1; \ + /* We use these structures solely to find the offset to */ \ + /* each slice we are operating on */ \ + inputInfo.sizes[dim] = 1; \ + topKInfo.sizes[dim] = 1; \ + indicesInfo.sizes[dim] = 1; \ \ - /* Collapse all other dims */ \ - int collapseInputDim = inputInfo.collapseDims(dim); \ - int collapseTopKDim = topKInfo.collapseDims(dim); \ - int collapseIndicesDim = indicesInfo.collapseDims(dim); \ + /* Collapse all other dims */ \ + int collapseInputDim = inputInfo.collapseDims(dim); \ + int collapseTopKDim = topKInfo.collapseDims(dim); \ + int collapseIndicesDim = indicesInfo.collapseDims(dim); \ \ - long inputSlices = 1; \ - long topKSlices = 1; \ - for (int i = 0; i < numDims; ++i) { \ - inputSlices *= inputInfo.sizes[i]; \ - topKSlices 
*= topKInfo.sizes[i]; \ - } \ + long inputSlices = 1; \ + long topKSlices = 1; \ + for (int i = 0; i < numDims; ++i) { \ + inputSlices *= inputInfo.sizes[i]; \ + topKSlices *= topKInfo.sizes[i]; \ + } \ \ - dim3 grid; \ - if (!THC_getGridFromTiles(inputSlices, grid)) { \ - THError("Slice to sort is too large"); \ - } \ + dim3 grid; \ + if (!THC_getGridFromTiles(inputSlices, grid)) { \ + THError("Slice to sort is too large"); \ + } \ \ - dim3 block(std::min(THCRoundUp(sliceSize, 32L), 1024L)); \ + dim3 block(std::min(THCRoundUp(sliceSize, 32L), 1024L)); \ \ - /* This is used as a template parameter to calculate indices. */ \ - /* We only specialize it if all collapsed dim sizes are the */ \ - /* same; otherwise, we use -1 which is the specialization */ \ - /* parameter for arbitrary dimensions */ \ - int allDims = inputInfo.dims; \ - if (topKInfo.dims != allDims || indicesInfo.dims != allDims) { \ - allDims = -1; \ - } \ + /* This is used as a template parameter to calculate indices. */ \ + /* We only specialize it if all collapsed dim sizes are the */ \ + /* same; otherwise, we use -1 which is the specialization */ \ + /* parameter for arbitrary dimensions */ \ + int allDims = inputInfo.dims; \ + if (topKInfo.dims != allDims || indicesInfo.dims != allDims) { \ + allDims = -1; \ + } \ \ - RUN_DIM(T); + RUN_DIM(INDEX_T); // Based on required index size, run the algorithm with the // appropriate index type - if (THC_canUse32BitIndexMath(state, input) && - THC_canUse32BitIndexMath(state, topK) && - THC_canUse32BitIndexMath(state, indices)) { + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, topK) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) { RUN_T(unsigned int); } else { RUN_T(unsigned long); diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu new file mode 100644 index 0000000..fc9bd60 --- /dev/null +++ b/lib/THC/THCTensorTypeUtils.cu @@ -0,0 +1,210 @@ +#include "THCTensorTypeUtils.cuh" +#include "THCTensor.h" +#include "THCTensorCopy.h" +#include "THCHalf.h" +#include <stdlib.h> + +namespace { + +struct SizeAndStride { + long size; + long stride; +}; + +int compareSizeAndStride(const void* a, const void* b) { + const SizeAndStride* aS = (const SizeAndStride*) a; + const SizeAndStride* bS = (const SizeAndStride*) b; + + return aS->stride < bS->stride; +} + +} + +#define IMPL_TENSOR_UTILS(TENSOR_TYPE, DATA_TYPE) \ + \ +TENSOR_TYPE* \ +TensorUtils<TENSOR_TYPE>::newTensor(THCState* state) { \ + return TENSOR_TYPE##_new(state); \ +} \ + \ +TENSOR_TYPE* \ +TensorUtils<TENSOR_TYPE>::newContiguous(THCState* state, \ + TENSOR_TYPE* t) { \ + return TENSOR_TYPE##_newContiguous(state, t); \ +} \ + \ +void \ +TensorUtils<TENSOR_TYPE>::retain(THCState* state, \ + TENSOR_TYPE* t) { \ + TENSOR_TYPE##_retain(state, t); \ +} \ + \ +void \ +TensorUtils<TENSOR_TYPE>::free(THCState* state, \ + TENSOR_TYPE* t) { \ + TENSOR_TYPE##_free(state, t); \ +} \ + \ +void \ +TensorUtils<TENSOR_TYPE>::freeCopyTo(THCState* state, \ + TENSOR_TYPE* src, \ + TENSOR_TYPE* dst) { \ + TENSOR_TYPE##_freeCopyTo(state, src, dst); \ +} \ + \ +void \ +TensorUtils<TENSOR_TYPE>::resizeAs(THCState* state, \ + TENSOR_TYPE* dst, \ + TENSOR_TYPE* src) { \ + TENSOR_TYPE##_resizeAs(state, dst, src); \ +} \ + \ +DATA_TYPE* \ +TensorUtils<TENSOR_TYPE>::getData(THCState* state, \ + TENSOR_TYPE* t) { \ + /* FIXME: no cast is required except for THCudaHalfTensor */ \ + return (DATA_TYPE*) TENSOR_TYPE##_data(state, t); \ +} 
\ + \ +long \ +TensorUtils<TENSOR_TYPE>::getNumElements(THCState* state, \ + TENSOR_TYPE* t) { \ + return TENSOR_TYPE##_nElement(state, t); \ +} \ + \ +long \ +TensorUtils<TENSOR_TYPE>::getSize(THCState* state, \ + TENSOR_TYPE* t, \ + int dim) { \ + return TENSOR_TYPE##_size(state, t, dim); \ +} \ + \ +long \ +TensorUtils<TENSOR_TYPE>::getStride(THCState* state, \ + TENSOR_TYPE* t, \ + int dim) { \ + return TENSOR_TYPE##_stride(state, t, dim); \ +} \ + \ +int \ +TensorUtils<TENSOR_TYPE>::getDims(THCState* state, \ + TENSOR_TYPE* t) { \ + return TENSOR_TYPE##_nDimension(state, t); \ +} \ + \ +bool \ +TensorUtils<TENSOR_TYPE>::isContiguous(THCState* state, \ + TENSOR_TYPE* t) { \ + return TENSOR_TYPE##_isContiguous(state, t); \ +} \ + \ +int \ +TensorUtils<TENSOR_TYPE>::getDevice(THCState* state, \ + TENSOR_TYPE* t) { \ + return TENSOR_TYPE##_getDevice(state, t); \ +} \ + \ +void \ +TensorUtils<TENSOR_TYPE>::copyIgnoringOverlaps(THCState* state, \ + TENSOR_TYPE* dst, \ + TENSOR_TYPE* src) { \ + return TENSOR_TYPE##_copyIgnoringOverlaps(state, dst, src); \ +} \ + \ +bool \ +TensorUtils<TENSOR_TYPE>::overlappingIndices(THCState* state, \ + TENSOR_TYPE* t) { \ + /* In this function, we don't care about permutations of the */ \ + /* size/stride arrays (transpositions). */ \ + /* We order the size/stride arrays by stride, skipping dimensions of */ \ + /* size 1. Strides of dimensions of size 1 don't matter, since there */ \ + /* is only one addressing point in them. */ \ + /* In this reordered view, the tensor is contiguous if */ \ + /* stride[dim] == size[dim + 1] * stride[dim + 1] for all `dim`. */ \ + /* The tensor has holes if */ \ + /* stride[dim] > size[dim + 1] * stride[dim + 1] for one or more */ \ + /* `dim`. */ \ + /* The tensor has overlaps if */ \ + /* stride[dim] < size[dim + 1] * stride[dim + 1] for one or more */ \ + /* `dim`, or the innermost stride is 0. */ \ + \ + /* Extract size/stride arrays; only consider size >1 dims. 
*/ \ + SizeAndStride info[MAX_CUTORCH_DIMS]; \ + \ + int dims = TensorUtils<TENSOR_TYPE>::getDims(state, t); \ + int nonSize1Dims = 0; \ + for (int i = 0; i < dims; ++i) { \ + long size = TensorUtils<TENSOR_TYPE>::getSize(state, t, i); \ + if (size > 1) { \ + info[nonSize1Dims].size = size; \ + info[nonSize1Dims].stride = \ + TensorUtils<TENSOR_TYPE>::getStride(state, t, i); \ + ++nonSize1Dims; \ + } \ + } \ + \ + if (nonSize1Dims == 0) { \ + /* no overlap */ \ + return false; \ + } \ + \ + /* Ascending order (innermost dimension in sorted view is at [0]) */ \ + qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); \ + \ + /* Base case: innermost dimension must have stride >= 1 */ \ + if (info[nonSize1Dims - 1].stride < 1) { \ + return true; \ + } \ + \ + /* Subsequent dimensions, if any */ \ + for (int i = nonSize1Dims - 2; i >= 0; --i) { \ + if (info[i].stride < info[i + 1].size * info[i + 1].stride) { \ + /* There are overlaps */ \ + return true; \ + } \ + } \ + \ + /* Tensor has holes or is contiguous */ \ + return false; \ +} \ + \ +bool \ +TensorUtils<TENSOR_TYPE>::canUse32BitIndexMath(THCState* state, \ + TENSOR_TYPE* t) { \ + long elements = TensorUtils<TENSOR_TYPE>::getNumElements(state, t); \ + if (elements >= UINT_MAX) { \ + return false; \ + } \ + \ + long offset = 0; \ + long linearId = elements - 1; \ + \ + for (int i = TensorUtils<TENSOR_TYPE>::getDims(state, t) - 1; i >= 0; --i) { \ + long curDimIndex = \ + linearId % TensorUtils<TENSOR_TYPE>::getSize(state, t, i); \ + long curDimOffset = curDimIndex * \ + TensorUtils<TENSOR_TYPE>::getStride(state, t, i); \ + offset += curDimOffset; \ + linearId /= TensorUtils<TENSOR_TYPE>::getSize(state, t, i); \ + } \ + \ + if (offset >= UINT_MAX) { \ + return false; \ + } \ + \ + return true; \ +} + +IMPL_TENSOR_UTILS(THCudaByteTensor, unsigned char) +IMPL_TENSOR_UTILS(THCudaCharTensor, char) +IMPL_TENSOR_UTILS(THCudaShortTensor, short) +IMPL_TENSOR_UTILS(THCudaIntTensor, int) +IMPL_TENSOR_UTILS(THCudaLongTensor, long) +IMPL_TENSOR_UTILS(THCudaTensor, float) +IMPL_TENSOR_UTILS(THCudaDoubleTensor, double) + +#ifdef CUDA_HALF_TENSOR +IMPL_TENSOR_UTILS(THCudaHalfTensor, half); +#endif + +#undef IMPL_TENSOR_UTILS diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh new file mode 100644 index 0000000..4456f47 --- /dev/null +++ b/lib/THC/THCTensorTypeUtils.cuh @@ -0,0 +1,184 @@ +#ifndef THC_TENSOR_TYPE_UTILS_INC +#define THC_TENSOR_TYPE_UTILS_INC + +#include <cuda.h> +#include <assert.h> +#include "THCGeneral.h" +#include "THCHalf.h" +#include "THCTensor.h" +#include "THCTensorInfo.cuh" + +/// A utility for accessing THCuda*Tensor types in a generic manner + +/// Equivalent to C++11's type_traits std::is_same; used for comparing +/// equality of types. 
Don't assume the existence of C++11 +template <typename T, typename U> +struct SameType { + static const bool same = false; +}; + +template <typename T> +struct SameType<T, T> { + static const bool same = true; +}; + +template <typename T, typename U> +bool isSameType() { + return SameType<T, U>::same; +} + +template <typename TensorType> +struct TensorUtils { +}; + +#define TENSOR_UTILS(TENSOR_TYPE, DATA_TYPE) \ + template <> \ + struct TensorUtils<TENSOR_TYPE> { \ + typedef DATA_TYPE DataType; \ + \ + static TENSOR_TYPE* newTensor(THCState* state); \ + static TENSOR_TYPE* newContiguous(THCState* state, TENSOR_TYPE* t); \ + static void retain(THCState* state, TENSOR_TYPE* t); \ + static void free(THCState* state, TENSOR_TYPE* t); \ + static void freeCopyTo(THCState* state, TENSOR_TYPE* src, \ + TENSOR_TYPE* dst); \ + static void resizeAs(THCState* state, TENSOR_TYPE* dst, \ + TENSOR_TYPE* src); \ + static DATA_TYPE* getData(THCState* state, TENSOR_TYPE* t); \ + static long getNumElements(THCState* state, TENSOR_TYPE* t); \ + static long getSize(THCState* state, TENSOR_TYPE* t, int dim); \ + static long getStride(THCState* state, TENSOR_TYPE* t, int dim); \ + static int getDims(THCState* state, TENSOR_TYPE* t); \ + static bool isContiguous(THCState* state, TENSOR_TYPE* t); \ + static int getDevice(THCState* state, TENSOR_TYPE* t); \ + static void copyIgnoringOverlaps(THCState* state, \ + TENSOR_TYPE* dst, TENSOR_TYPE* src); \ + /* Determines if the given tensor has overlapping data points (i.e., */ \ + /* is there more than one index into the tensor that references */ \ + /* the same piece of data)? */ \ + static bool overlappingIndices(THCState* state, TENSOR_TYPE* t); \ + /* Can we use 32 bit math for indexing? */ \ + static bool canUse32BitIndexMath(THCState* state, TENSOR_TYPE* t); \ + } + +TENSOR_UTILS(THCudaByteTensor, unsigned char); +TENSOR_UTILS(THCudaCharTensor, char); +TENSOR_UTILS(THCudaShortTensor, short); +TENSOR_UTILS(THCudaIntTensor, int); +TENSOR_UTILS(THCudaLongTensor, long); +TENSOR_UTILS(THCudaTensor, float); +TENSOR_UTILS(THCudaDoubleTensor, double); + +#ifdef CUDA_HALF_TENSOR +TENSOR_UTILS(THCudaHalfTensor, half); +#endif + +#undef TENSOR_UTILS + +template <typename TensorType, typename IndexType> +TensorInfo<typename TensorUtils<TensorType>::DataType, IndexType> +getTensorInfo(THCState* state, TensorType* t) { + IndexType sz[MAX_CUTORCH_DIMS]; + IndexType st[MAX_CUTORCH_DIMS]; + + int dims = TensorUtils<TensorType>::getDims(state, t); + for (int i = 0; i < dims; ++i) { + sz[i] = TensorUtils<TensorType>::getSize(state, t, i); + st[i] = TensorUtils<TensorType>::getStride(state, t, i); + } + + return TensorInfo<typename TensorUtils<TensorType>::DataType, IndexType>( + TensorUtils<TensorType>::getData(state, t), dims, sz, st); +} + +/// `half` has some type conversion issues associated with it, since it +/// is a struct without a constructor/implicit conversion constructor. +/// We use this to convert scalar values to the given type that the +/// tensor expects. 
+template <typename In, typename Out> +struct ScalarConvert { + static __host__ __device__ Out to(const In v) { return (Out) v; } +}; + +template <typename T> +struct ScalarNegate { + static __host__ __device__ T to(const T v) { return -v; } +}; + +template <typename T> +struct ScalarInv { + static __host__ __device__ T to(const T v) { return ((T) 1) / v; } +}; + +#ifdef CUDA_HALF_TENSOR + +template <typename Out> +struct ScalarConvert<half, Out> { + static __host__ __device__ Out to(const half v) { +#ifdef __CUDA_ARCH__ + return (Out) __half2float(v); +#else + return (Out) THC_half2float(v); +#endif + } +}; + +template <typename In> +struct ScalarConvert<In, half> { + static __host__ __device__ half to(const In v) { +#ifdef __CUDA_ARCH__ + return __float2half((float) v); +#else + return THC_float2half((float) v); +#endif + } +}; + +template <> +struct ScalarConvert<half, half> { + static __host__ __device__ half to(const half v) { + return v; + } +}; + +template <> +struct ScalarNegate<half> { + static __host__ __device__ half to(const half v) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hneg(v); +#else + return __float2half(-__half2float(v)); +#endif +#else + half out = v; + out.x ^= 0x8000; // toggle sign bit + return out; +#endif + } +}; + +template <> +struct ScalarInv<half> { + static __host__ __device__ half to(const half v) { +#ifdef __CUDA_ARCH__ + return __float2half(1.0f / __half2float(v)); +#else + float fv = THC_half2float(v); + fv = 1.0f / fv; + return THC_float2half(fv); +#endif + } +}; + +inline bool operator==(half a, half b) { + return a.x == b.x; +} + +inline bool operator!=(half a, half b) { + return a.x != b.x; +} + +#endif // CUDA_HALF_TENSOR + +#endif // THC_TENSOR_TYPE_UTILS_INC diff --git a/lib/THC/generic/THCStorage.c b/lib/THC/generic/THCStorage.c index d8fec77..61ba125 100644 --- a/lib/THC/generic/THCStorage.c +++ b/lib/THC/generic/THCStorage.c @@ -17,29 +17,18 @@ int THCStorage_(elementSize)(THCState *state) return sizeof(real); } -void THCStorage_(set)(THCState *state, THCStorage *self, long index, hostreal _value) +void THCStorage_(set)(THCState *state, THCStorage *self, long index, real value) { THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); - real value = hostrealToReal(_value); THCudaCheck(cudaMemcpy(self->data + index, &value, sizeof(real), cudaMemcpyHostToDevice)); } -hostreal THCStorage_(get)(THCState *state, const THCStorage *self, long index) +real THCStorage_(get)(THCState *state, const THCStorage *self, long index) { THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); -#ifndef THC_REAL_IS_HALF real value; THCudaCheck(cudaMemcpy(&value, self->data + index, sizeof(real), cudaMemcpyDeviceToHost)); - return realToHostreal(value); -#else - float *ret_d; - float ret; - THCudaCheck(THCudaMalloc(state, (void**)&ret_d, sizeof(float))); - THCHalf2Float(state, ret_d, self->data + index, 1); - THCudaCheck(cudaMemcpy(&ret, ret_d, sizeof(float), cudaMemcpyDeviceToHost)); - THCudaFree(state, ret_d); - return ret; -#endif + return value; } THCStorage* THCStorage_(new)(THCState *state) @@ -80,14 +69,14 @@ THCStorage* THCStorage_(newWithSize)(THCState *state, long size) } } -THCStorage* THCStorage_(newWithSize1)(THCState *state, hostreal data0) +THCStorage* THCStorage_(newWithSize1)(THCState *state, real data0) { THCStorage *self = THCStorage_(newWithSize)(state, 1); THCStorage_(set)(state, self, 0, data0); return self; } -THCStorage* THCStorage_(newWithSize2)(THCState *state, hostreal data0, 
hostreal data1) +THCStorage* THCStorage_(newWithSize2)(THCState *state, real data0, real data1) { THCStorage *self = THCStorage_(newWithSize)(state, 2); THCStorage_(set)(state, self, 0, data0); @@ -95,7 +84,7 @@ THCStorage* THCStorage_(newWithSize2)(THCState *state, hostreal data0, hostreal return self; } -THCStorage* THCStorage_(newWithSize3)(THCState *state, hostreal data0, hostreal data1, hostreal data2) +THCStorage* THCStorage_(newWithSize3)(THCState *state, real data0, real data1, real data2) { THCStorage *self = THCStorage_(newWithSize)(state, 3); THCStorage_(set)(state, self, 0, data0); @@ -104,7 +93,7 @@ THCStorage* THCStorage_(newWithSize3)(THCState *state, hostreal data0, hostreal return self; } -THCStorage* THCStorage_(newWithSize4)(THCState *state, hostreal data0, hostreal data1, hostreal data2, hostreal data3) +THCStorage* THCStorage_(newWithSize4)(THCState *state, real data0, real data1, real data2, real data3) { THCStorage *self = THCStorage_(newWithSize)(state, 4); THCStorage_(set)(state, self, 0, data0); diff --git a/lib/THC/generic/THCStorage.cu b/lib/THC/generic/THCStorage.cu index ff0d3c9..17924f5 100644 --- a/lib/THC/generic/THCStorage.cu +++ b/lib/THC/generic/THCStorage.cu @@ -2,10 +2,9 @@ #define THC_GENERIC_FILE "generic/THCStorage.cu" #else -void THCStorage_(fill)(THCState *state, THCStorage *self, hostreal _value) +void THCStorage_(fill)(THCState *state, THCStorage *self, real value) { thrust::device_ptr<real> self_data(self->data); - real value = hostrealToReal(_value); thrust::fill( #if CUDA_VERSION >= 7000 thrust::cuda::par.on(THCState_getCurrentStream(state)), diff --git a/lib/THC/generic/THCStorage.h b/lib/THC/generic/THCStorage.h index f161d5f..a8c5f5f 100644 --- a/lib/THC/generic/THCStorage.h +++ b/lib/THC/generic/THCStorage.h @@ -23,15 +23,15 @@ THC_API long THCStorage_(size)(THCState *state, const THCStorage*); THC_API int THCStorage_(elementSize)(THCState *state); /* slow access -- checks everything */ -THC_API void THCStorage_(set)(THCState *state, THCStorage*, long, hostreal); -THC_API hostreal THCStorage_(get)(THCState *state, const THCStorage*, long); +THC_API void THCStorage_(set)(THCState *state, THCStorage*, long, real); +THC_API real THCStorage_(get)(THCState *state, const THCStorage*, long); THC_API THCStorage* THCStorage_(new)(THCState *state); THC_API THCStorage* THCStorage_(newWithSize)(THCState *state, long size); -THC_API THCStorage* THCStorage_(newWithSize1)(THCState *state, hostreal); -THC_API THCStorage* THCStorage_(newWithSize2)(THCState *state, hostreal, hostreal); -THC_API THCStorage* THCStorage_(newWithSize3)(THCState *state, hostreal, hostreal, hostreal); -THC_API THCStorage* THCStorage_(newWithSize4)(THCState *state, hostreal, hostreal, hostreal, hostreal); +THC_API THCStorage* THCStorage_(newWithSize1)(THCState *state, real); +THC_API THCStorage* THCStorage_(newWithSize2)(THCState *state, real, real); +THC_API THCStorage* THCStorage_(newWithSize3)(THCState *state, real, real, real); +THC_API THCStorage* THCStorage_(newWithSize4)(THCState *state, real, real, real, real); THC_API THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *filename, long size, int shared); /* takes ownership of data */ @@ -49,6 +49,6 @@ THC_API void THCStorage_(retain)(THCState *state, THCStorage *storage); THC_API void THCStorage_(free)(THCState *state, THCStorage *storage); THC_API void THCStorage_(resize)(THCState *state, THCStorage *storage, long size); -THC_API void THCStorage_(fill)(THCState *state, THCStorage *storage, hostreal 
value); +THC_API void THCStorage_(fill)(THCState *state, THCStorage *storage, real value); #endif diff --git a/lib/THC/generic/THCStorageCopy.cu b/lib/THC/generic/THCStorageCopy.cu index b00c122..298f717 100644 --- a/lib/THC/generic/THCStorageCopy.cu +++ b/lib/THC/generic/THCStorageCopy.cu @@ -62,7 +62,7 @@ THC_CUDA_STORAGE_IMPLEMENT_COPY(Long,Long) THC_CUDA_STORAGE_IMPLEMENT_COPY(Float,) // i.e. float THC_CUDA_STORAGE_IMPLEMENT_COPY(Double,Double) -#if CUDA_VERSION >= 7050 +#ifdef CUDA_HALF_TENSOR #define FLOAT_COPY(TYPE) TH_CONCAT_3(TH, CReal, Storage_copyCudaFloat) void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *self, struct THCudaHalfStorage *src) { @@ -77,7 +77,7 @@ void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *self, struct THCudaH } } #undef FLOAT_COPY -#endif // CUDA_VERSION >= 7050 +#endif // CUDA_HALF_TENSOR #undef THC_CUDA_STORAGE_IMPLEMENT_COPY diff --git a/lib/THC/generic/THCStorageCopy.h b/lib/THC/generic/THCStorageCopy.h index 3313290..c3e5601 100644 --- a/lib/THC/generic/THCStorageCopy.h +++ b/lib/THC/generic/THCStorageCopy.h @@ -21,7 +21,7 @@ THC_API void THCStorage_(copyCudaInt)(THCState *state, THCStorage *storage, stru THC_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, struct THCudaLongStorage *src); THC_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src); THC_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src); -#if CUDA_VERSION >= 7050 +#ifdef CUDA_HALF_TENSOR THC_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src); #endif diff --git a/lib/THC/generic/THCTensor.c b/lib/THC/generic/THCTensor.c index 2f87f1a..e18044d 100644 --- a/lib/THC/generic/THCTensor.c +++ b/lib/THC/generic/THCTensor.c @@ -730,56 +730,56 @@ void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, lon self->nDimension = 0; } -void THCTensor_(set1d)(THCState *state, THCTensor *tensor, long x0, hostreal value) +void THCTensor_(set1d)(THCState *state, THCTensor *tensor, long x0, real value) { THArgCheck(tensor->nDimension == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value); } -hostreal THCTensor_(get1d)(THCState *state, const THCTensor *tensor, long x0) +real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, long x0) { THArgCheck(tensor->nDimension == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]); } -void THCTensor_(set2d)(THCState *state, THCTensor *tensor, long x0, long x1, hostreal value) +void THCTensor_(set2d)(THCState *state, THCTensor *tensor, long x0, long x1, real value) { THArgCheck(tensor->nDimension == 2, 1, "tensor must have two dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value); } -hostreal THCTensor_(get2d)(THCState *state, const THCTensor *tensor, long x0, long x1) +real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, long x0, long x1) { THArgCheck(tensor->nDimension == 2, 1, "tensor must have two dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) 
&& (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]); } -void THCTensor_(set3d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, hostreal value) +void THCTensor_(set3d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, real value) { THArgCheck(tensor->nDimension == 3, 1, "tensor must have three dimensions"); THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value); } -hostreal THCTensor_(get3d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2) +real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2) { THArgCheck(tensor->nDimension == 3, 1, "tensor must have three dimensions"); THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]); } -void THCTensor_(set4d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, long x3, hostreal value) +void THCTensor_(set4d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, long x3, real value) { THArgCheck(tensor->nDimension == 4, 1, "tensor must have four dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value); } -hostreal THCTensor_(get4d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2, long x3) +real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2, long x3) { THArgCheck(tensor->nDimension == 4, 1, "tensor must have four dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); diff --git a/lib/THC/generic/THCTensor.h b/lib/THC/generic/THCTensor.h index 8e4d1a4..175eaee 100644 --- a/lib/THC/generic/THCTensor.h +++ b/lib/THC/generic/THCTensor.h @@ -112,15 +112,15 @@ THC_API void THCTensor_(free)(THCState *state, THCTensor *self); THC_API void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst); /* Slow access methods [check everything] */ -THC_API void THCTensor_(set1d)(THCState *state, THCTensor *tensor, long x0, hostreal value); -THC_API void THCTensor_(set2d)(THCState *state, THCTensor *tensor, long x0, long x1, hostreal value); -THC_API void THCTensor_(set3d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, hostreal value); -THC_API void THCTensor_(set4d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, long x3, hostreal value); - -THC_API hostreal THCTensor_(get1d)(THCState *state, const THCTensor *tensor, long x0); -THC_API hostreal THCTensor_(get2d)(THCState *state, const THCTensor *tensor, long x0, long x1); -THC_API hostreal THCTensor_(get3d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2); -THC_API hostreal 
THCTensor_(get4d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2, long x3); +THC_API void THCTensor_(set1d)(THCState *state, THCTensor *tensor, long x0, real value); +THC_API void THCTensor_(set2d)(THCState *state, THCTensor *tensor, long x0, long x1, real value); +THC_API void THCTensor_(set3d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, real value); +THC_API void THCTensor_(set4d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, long x3, real value); + +THC_API real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, long x0); +THC_API real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, long x0, long x1); +THC_API real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2); +THC_API real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2, long x3); /* CUDA-specific functions */ THC_API cudaTextureObject_t THCTensor_(getTextureObject)(THCState *state, THCTensor *self); diff --git a/lib/THC/generic/THCTensorCopy.c b/lib/THC/generic/THCTensorCopy.c index 68f57bf..e0bcadd 100644 --- a/lib/THC/generic/THCTensorCopy.c +++ b/lib/THC/generic/THCTensorCopy.c @@ -126,7 +126,6 @@ IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Long) IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Float) IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Double) -// FIXME: add within-CUDA conversions void THCTensor_(copyCuda)(THCState *state, THCTensor *self, THCTensor *src) { THCTensor_(copy)(state, self, src); diff --git a/lib/THC/generic/THCTensorCopy.cu b/lib/THC/generic/THCTensorCopy.cu index c5768e2..4198025 100644 --- a/lib/THC/generic/THCTensorCopy.cu +++ b/lib/THC/generic/THCTensorCopy.cu @@ -4,251 +4,44 @@ THC_API void THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { - long totalElements = THCTensor_(nElement)(state, dst); - - THArgCheck(totalElements == THCTensor_(nElement)(state, src), 2, - "sizes do not match"); - - if (THCTensor_(nDimension)(state, dst) == 0) { - // Zero-dim tensor; copy nothing - return; - } - - // We can memcpy the memory if: - // -both tensors are contiguous; or, - // -there is only one element to copy; or, - // -FIXME: if both tensors have matching size and stride arrays, and no - // holes within (in other words, there is some permutation that can be applied - // to the size/strides such that the resulting tensor is contiguous). - bool srcContig = THCTensor_(isContiguous)(state, src); - bool dstContig = THCTensor_(isContiguous)(state, dst); - bool memcpyEligible = (srcContig && dstContig) || (totalElements == 1); - - int srcDev = THCTensor_(getDevice)(state, src); - int dstDev = THCTensor_(getDevice)(state, dst); - int oldDev = curGPU(); - - // We always perform the copy on the source device, using the - // current stream on the source device. - // If the copy is on the default stream, then we fully synchronize - // both src and dst's default streams for completion of the - // copy. We have to explicitly do this for non-contig copies. - // This mimics the behavior of cross-device cudaMemcpyAsync on - // the default stream. - // If the copy is not on the default stream, then it is up to the - // user to add needed synchronization on the dst device, since the - // stream on the dst device that wishes to synchronize may not be - // the same index as the one on the src device. 
- int copyStreamIndex = - THCState_getCurrentStreamIndex(state); - cudaStream_t copyStream = - THCState_getDeviceStream(state, srcDev, copyStreamIndex); - - if (srcDev != dstDev && copyStreamIndex == 0) { - // This is a cross-device copy on the default stream. We perform a - // two-way barrier between both devices' default streams before - // the copy. This ensures that any write-after-write and - // write-after-read dependencies on the destination side are - // handled, so that no one is operating on the dst memory when - // we perform the copy. - // src waits on dst barrier (src already waits on src) - cudaEvent_t dstReady; - THCudaCheck(cudaSetDevice(dstDev)); - THCudaCheck(cudaEventCreateWithFlags(&dstReady, cudaEventDisableTiming)); - THCudaCheck(cudaEventRecord(dstReady, NULL)); - - THCudaCheck(cudaSetDevice(srcDev)); - THCudaCheck(cudaStreamWaitEvent(NULL, dstReady, 0)); - THCudaCheck(cudaEventDestroy(dstReady)); - } else if (srcDev != oldDev) { - THCudaCheck(cudaSetDevice(srcDev)); - } - - // We are now on srcDev - if (memcpyEligible) { - // Perform the copy - THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state, dst), - THCTensor_(data)(state, src), - totalElements * sizeof(real), - cudaMemcpyDeviceToDevice, - copyStream)); - } else { -#if defined(THC_REAL_IS_FLOAT) - // Non-contiguous copy - - // We avoid creating temporary memory copies if possible. - // If both src and dst are on the same device, or if they are on - // different devices and p2p access is enabled, perform the copy - // by a pointwise copy kernel. - // Otherwise, we'll have to make contiguous (which will in fact - // invoke copy() again), and then perform the copy. - // FIXME: might want to consider only running the pointwise kernel - // if both src and dst innermost dimensions are contiguous. If - // they are not, then taking the hit of the memory allocation/free - // might be worth it to avoid non-coalesced reads or writes. - - // A device always has access to itself, so this also handles the - // case srcDev == dstDev - if (THCState_getPeerToPeerAccess(state, srcDev, dstDev)) { - // Make sure we have the current stream set in THCState, since - // pointwise uses that - if (srcDev != oldDev) { - THCState_setStream(state, srcDev, copyStreamIndex); - } - - bool succ = - THCudaTensor_pointwiseApply2(state, dst, src, CopyOp<float>()); - THArgCheck(succ, 2, CUTORCH_DIM_WARNING); - - // Restore prior THCState stream - if (srcDev != oldDev) { - THCState_setStream(state, oldDev, copyStreamIndex); - } - } else { - // GPUs can't access each other directly; fall back to - // newContiguous and memcpy - THCudaTensor* srcContig = THCudaTensor_newContiguous(state, src); - THCudaTensor* dstContig = dst; - - if (!THCudaTensor_isContiguous(state, dst)) { - // We are copying over the contents of dst, so we don't need - // to preserve its values. We just need a destination tensor - // the same size as dst. 
- - // Allocate the tensor on the new device - THCudaCheck(cudaSetDevice(dstDev)); - - dstContig = THCudaTensor_new(state); - THCudaTensor_resizeAs(state, dstContig, dst); - - THCudaCheck(cudaSetDevice(srcDev)); - } - - THCudaCheck(cudaMemcpyAsync(THCudaTensor_data(state, dstContig), - THCudaTensor_data(state, srcContig), - totalElements * sizeof(float), - cudaMemcpyDeviceToDevice, - copyStream)); - - THCudaTensor_free(state, srcContig); - - if (dst != dstContig) { - THCudaTensor_freeCopyTo(state, dstContig, dst); - } - } -#else -#define STRINGIFY(x) #x - THError("Non-contiguous copy not implemented for Cuda%sTensor", STRINGIFY(Real)); -#undef STRINGIFY -#endif - } - - if (srcDev != dstDev && copyStreamIndex == 0) { - // dst waits on src barrier (dst already waits on dst). We cannot - // operate on dst's copy until the copy is complete. - - // Still on srcDev, record default stream event - cudaEvent_t srcReady; - THCudaCheck(cudaEventCreateWithFlags(&srcReady, cudaEventDisableTiming)); - THCudaCheck(cudaEventRecord(srcReady, NULL)); - - THCudaCheck(cudaSetDevice(dstDev)); - THCudaCheck(cudaStreamWaitEvent(NULL, srcReady, 0)); - THCudaCheck(cudaEventDestroy(srcReady)); - - // We are now on dstDev (right above). Restore prior device from dst - if (dstDev != oldDev) { - THCudaCheck(cudaSetDevice(oldDev)); - } - } else { - // We are still on srcDev. Restore prior device from src - if (srcDev != oldDev) { - THCudaCheck(cudaSetDevice(oldDev)); - } - } + THC_copyTensor<THCTensor, THCTensor>(state, dst, src); +} - cudaError errcode = cudaGetLastError(); - if (errcode != cudaSuccess) { - THError(cudaGetErrorString(errcode)); - } +THC_API void +THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) { + // Called when we are copying into an overlapping index `dst`, but + // we don't care which writer wins. Hacky but it works. + // This is itself invoked by pointwiseApply2 / THCTensor_copy in + // case that there are write overlaps. 
+ // FIXME: really, overlapping writes should be illegal/an error in Torch + THC_pointwiseApply2( + state, dst, src, + CopyOp<typename TensorUtils<THCTensor>::DataType, + typename TensorUtils<THCTensor>::DataType>(), + ReadOnly, /* ignore overwrites */ + ReadOnly); } -// conversions are mediated by the CPU -// yes, this is slow; feel free to write CUDA kernels for this -#ifndef THC_REAL_IS_HALF -#define THC_CUDA_TENSOR_IMPLEMENT_COPY(TYPEC,TYPECUDA) \ - void THCTensor_(copyCuda##TYPEC)(THCState *state, THCTensor *self, struct THCuda##TYPECUDA##Tensor *src) \ - { \ - if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ - THCTensor_(copy)(state, self, (THCTensor*) src); /* cast just removes compiler warning */ \ - } else { \ - THArgCheck(THCTensor_(nElement)(state, self) == THCuda##TYPECUDA##Tensor_nElement(state, src), 2, "size does not match"); \ - THLongStorage *size = THCuda##TYPECUDA##Tensor_newSizeOf(state, src); \ - TH##TYPEC##Tensor *buffer1 = TH##TYPEC##Tensor_newWithSize(size, NULL); \ - THTensor *buffer2 = THTensor_(newWithSize)(size, NULL); \ - TH##TYPEC##Tensor_copyCuda(state, buffer1, src); \ - THTensor_(copy##TYPEC)(buffer2, buffer1); \ - THCTensor_(copyCPU)(state, self, buffer2); \ - THLongStorage_free(size); \ - TH##TYPEC##Tensor_free(buffer1); \ - THTensor_(free)(buffer2); \ - } \ - } -#else -#define THC_CUDA_TENSOR_IMPLEMENT_COPY(TYPEC,TYPECUDA) \ - void THCTensor_(copyCuda##TYPEC)(THCState *state, THCTensor *self, struct THCuda##TYPECUDA##Tensor *src) \ - { \ - THArgCheck(THCTensor_(nElement)(state, self) == THCuda##TYPECUDA##Tensor_nElement(state, src), 2, "size does not match"); \ - if (THCTypeIdx_(TYPEC) == THCTypeIdxFloat) { \ - THCudaTensor *csrc = THCudaTensor_newContiguous(state, (THCudaTensor*) src); /* cast removes compiler error */ \ - THCFloat2Half(state, \ - THCTensor_(data)(state, self), \ - THCudaTensor_data(state, csrc), \ - THCudaTensor_nElement(state, csrc)); \ - THCudaTensor_free(state, csrc); \ - } else { \ - THLongStorage *size = THCuda##TYPECUDA##Tensor_newSizeOf(state, src); \ - THCudaTensor *buffer = THCudaTensor_newWithSize(state, size, NULL); \ - THCudaTensor_copyCuda##TYPEC(state, buffer, src); \ - THCFloat2Half(state, \ - THCTensor_(data)(state, self), \ - THCudaTensor_data(state, buffer), \ - THCudaTensor_nElement(state, buffer)); \ - THCudaTensor_free(state, buffer); \ - THLongStorage_free(size); \ - } \ +#define IMPLEMENT_THC_CUDA_TENSOR_COPY(TYPEC, TYPECUDA) \ + THC_API void \ + THCTensor_(copyCuda##TYPEC)(THCState *state, \ + THCTensor *self, \ + THCuda##TYPECUDA##Tensor *src) { \ + THC_copyTensor<THCTensor, THCuda##TYPECUDA##Tensor>(state, self, src); \ } -#endif - -THC_CUDA_TENSOR_IMPLEMENT_COPY(Byte,Byte) -THC_CUDA_TENSOR_IMPLEMENT_COPY(Char,Char) -THC_CUDA_TENSOR_IMPLEMENT_COPY(Short,Short) -THC_CUDA_TENSOR_IMPLEMENT_COPY(Int,Int) -THC_CUDA_TENSOR_IMPLEMENT_COPY(Long,Long) -THC_CUDA_TENSOR_IMPLEMENT_COPY(Float,) // i.e. 
float -THC_CUDA_TENSOR_IMPLEMENT_COPY(Double,Double) -#if CUDA_VERSION >= 7050 -#define FLOAT_COPY(TYPE) TH_CONCAT_3(TH, CReal, Tensor_copyCudaFloat) -void THCTensor_(copyCudaHalf)(THCState *state, THCTensor *self, struct THCudaHalfTensor *src) -{ - if(THCTypeIdx_(Real) == THCTypeIdxHalf) { - THCTensor_(copy)(state, self, (THCTensor*) src); /* cast removes compiler error */ - } else { - THArgCheck(THCTensor_(nElement)(state, self) == THCudaHalfTensor_nElement(state, src), 2, "size does not match"); - src = THCudaHalfTensor_newContiguous(state, src); - THLongStorage *size = THCudaHalfTensor_newSizeOf(state, src); - THCudaTensor *buffer = THCudaTensor_newWithSize(state, size, NULL); - THCHalf2Float(state, THCudaTensor_data(state, buffer), THCudaHalfTensor_data(state, src), THCudaHalfTensor_nElement(state, src)); - FLOAT_COPY(Real)(state, self, buffer); - THCudaTensor_free(state, buffer); - THCudaHalfTensor_free(state, src); - THLongStorage_free(size); - } -} -#undef FLOAT_COPY -#endif // CUDA_VERSION >= 7050 +IMPLEMENT_THC_CUDA_TENSOR_COPY(Byte, Byte) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Char, Char) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Short, Short) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Int, Int) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Long, Long) +// THCudaTensor aka the non-existent THCudaFloatTensor +IMPLEMENT_THC_CUDA_TENSOR_COPY(Float, ) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Double, Double) +#ifdef CUDA_HALF_TENSOR +IMPLEMENT_THC_CUDA_TENSOR_COPY(Half, Half) +#endif -#undef THC_CUDA_TENSOR_IMPLEMENT_COPY +#undef IMPLEMENT_THC_CUDA_TENSOR_COPY #endif diff --git a/lib/THC/generic/THCTensorCopy.h b/lib/THC/generic/THCTensorCopy.h index 8a2837f..3c7649b 100644 --- a/lib/THC/generic/THCTensorCopy.h +++ b/lib/THC/generic/THCTensorCopy.h @@ -3,6 +3,7 @@ #else THC_API void THCTensor_(copy)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(copyIgnoringOverlaps)(THCState *state, THCTensor *self, THCTensor *src); THC_API void THCTensor_(copyByte)(THCState *state, THCTensor *self, THByteTensor *src); THC_API void THCTensor_(copyChar)(THCState *state, THCTensor *self, THCharTensor *src); THC_API void THCTensor_(copyShort)(THCState *state, THCTensor *self, THShortTensor *src); @@ -18,7 +19,7 @@ THC_API void THCTensor_(copyCudaInt)(THCState *state, THCTensor *storage, struct THC_API void THCTensor_(copyCudaLong)(THCState *state, THCTensor *storage, struct THCudaLongTensor *src); THC_API void THCTensor_(copyCudaFloat)(THCState *state, THCTensor *storage, struct THCudaTensor *src); THC_API void THCTensor_(copyCudaDouble)(THCState *state, THCTensor *storage, struct THCudaDoubleTensor *src); -#if CUDA_VERSION >= 7050 +#ifdef CUDA_HALF_TENSOR THC_API void THCTensor_(copyCudaHalf)(THCState *state, THCTensor *storage, struct THCudaHalfTensor *src); #endif diff --git a/lib/THC/generic/THCTensorMath.cu b/lib/THC/generic/THCTensorMath.cu new file mode 100644 index 0000000..a0e550a --- /dev/null +++ b/lib/THC/generic/THCTensorMath.cu @@ -0,0 +1,68 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMath.cu" +#else + +THC_API void +THCTensor_(fill)(THCState* state, THCTensor *self_, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 1, self_)); + + if (!THC_pointwiseApply1( + state, self_, TensorFillOp<real>(value))) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(zero)(THCState *state, THCTensor *self_) +{ + THAssert(THCTensor_(checkGPU)(state, 1, self_)); + if (THCTensor_(isContiguous)(state, self_)) { + 
THCudaCheck(cudaMemsetAsync(THCTensor_(data)(state, self_), + 0, + sizeof(real) * THCTensor_(nElement)(state, self_), + THCState_getCurrentStream(state))); + } else { + if (!THC_pointwiseApply1( + state, self_, + TensorFillOp<real>(ScalarConvert<int, real>::to(0)))) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(zeros)(THCState *state, THCTensor *r_, THLongStorage *size) +{ + THAssert(THCTensor_(checkGPU)(state, 1, r_)); + THCTensor_(resize)(state, r_, size, NULL); + THCTensor_(zero)(state, r_); +} + +THC_API void +THCTensor_(ones)(THCState *state, THCTensor *r_, THLongStorage *size) +{ + THAssert(THCTensor_(checkGPU)(state, 1, r_)); + THCTensor_(resize)(state, r_, size, NULL); + THCTensor_(fill)(state, r_, ScalarConvert<int, real>::to(1)); +} + +THC_API void +THCTensor_(reshape)(THCState *state, THCTensor *r_, THCTensor *t, THLongStorage *size) +{ + THAssert(THCTensor_(checkGPU)(state, 2, r_, t)); + THCTensor_(resize)(state, r_, size, NULL); + THCTensor_(copy)(state, r_, t); +} + +long +THCTensor_(numel)(THCState *state, THCTensor *t) +{ + return THCTensor_(nElement)(state, t); +} + +#endif diff --git a/lib/THC/generic/THCTensorMath.h b/lib/THC/generic/THCTensorMath.h new file mode 100644 index 0000000..5c9e66d --- /dev/null +++ b/lib/THC/generic/THCTensorMath.h @@ -0,0 +1,13 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMath.h" +#else + +THC_API void THCTensor_(fill)(THCState *state, THCTensor *self, real value); +THC_API void THCTensor_(zero)(THCState *state, THCTensor *self); + +THC_API void THCTensor_(zeros)(THCState *state, THCTensor *r_, THLongStorage *size); +THC_API void THCTensor_(ones)(THCState *state, THCTensor *r_, THLongStorage *size); +THC_API void THCTensor_(reshape)(THCState *state, THCTensor *r_, THCTensor *t, THLongStorage *size); +THC_API long THCTensor_(numel)(THCState *state, THCTensor *t); + +#endif diff --git a/lib/THC/generic/THCTensorMathPairwise.cu b/lib/THC/generic/THCTensorMathPairwise.cu new file mode 100644 index 0000000..4a5c09d --- /dev/null +++ b/lib/THC/generic/THCTensorMathPairwise.cu @@ -0,0 +1,74 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPairwise.cu" +#else + +THC_API void +THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorAddConstantOp<real>(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorAddConstantOp<real>(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THCTensor_(add)(state, self_, src_, ScalarNegate<real>::to(value)); +} + +THC_API void +THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorMulConstantOp<real>(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorMulConstantOp<real>(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void 
+THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src_)); + THArgCheck(value != ScalarConvert<int, real>::to(0), 3, "divide by zero"); + + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, + TensorMulConstantOp<real>( + ScalarInv<real>::to(value)))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, + TensorMulConstantOp<real>( + ScalarInv<real>::to(value)))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/lib/THC/generic/THCTensorMathPairwise.h b/lib/THC/generic/THCTensorMathPairwise.h new file mode 100644 index 0000000..9a83293 --- /dev/null +++ b/lib/THC/generic/THCTensorMathPairwise.h @@ -0,0 +1,10 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPairwise.h" +#else + +THC_API void THCTensor_(add)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(sub)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(mul)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(div)(THCState *state, THCTensor *self, THCTensor *src, real value); + +#endif diff --git a/lib/THC/generic/THCTensorMathPointwise.cu b/lib/THC/generic/THCTensorMathPointwise.cu new file mode 100644 index 0000000..b6679cd --- /dev/null +++ b/lib/THC/generic/THCTensorMathPointwise.cu @@ -0,0 +1,157 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPointwise.cu" +#else + +THC_API void +THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2) +{ + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + if (value == ScalarConvert<int, real>::to(1)) { + // self += src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorAddOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self += value * src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorCAddOp<real>(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + if (value == ScalarConvert<int, real>::to(1)) { + // self = src1 + src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self = src1 + value * src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorCAddOp<real>(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2) +{ + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + if (value == ScalarConvert<int, real>::to(1)) { + // self -= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorSubOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self += -value * src2 + if (!THC_pointwiseApply2(state, self_, src2, + TensorCAddOp<real>( + ScalarNegate<real>::to(value)))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } else { + 
THCTensor_(resizeAs)(state, self_, src1); + + if (value == ScalarConvert<int, real>::to(1)) { + // self = src1 - src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorSubOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self = src1 - value * src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, + TensorCAddOp<real>( + ScalarNegate<real>::to(value)))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self *= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorMulOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 * src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorMulOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self = pow(self, src2) + if (!THC_pointwiseApply2(state, self_, src2, TensorCPowOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = pow(src1, src2) + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorCPowOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorDivOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorDivOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/lib/THC/generic/THCTensorMathPointwise.h b/lib/THC/generic/THCTensorMathPointwise.h new file mode 100644 index 0000000..cfb3b14 --- /dev/null +++ b/lib/THC/generic/THCTensorMathPointwise.h @@ -0,0 +1,11 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPointwise.h" +#else + +THC_API void THCTensor_(cadd)(THCState *state, THCTensor *self, THCTensor *src1, real value, THCTensor *src2); +THC_API void THCTensor_(csub)(THCState *state, THCTensor *self, THCTensor *src1, real value, THCTensor *src2); +THC_API void THCTensor_(cmul)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cpow)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cdiv)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); + +#endif
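The new generic math files above drive every element-wise operation through THC_pointwiseApply1/2/3 plus a small device functor (TensorFillOp, TensorAddConstantOp, TensorCAddOp, TensorMulOp, and so on), templated on the element type `real` so that one generic .cu body serves every tensor type it is instantiated for. As a rough orientation only, the sketch below shows the kind of functor shape those calls appear to expect; the name AddConstantSketch and the exact qualifiers are illustrative assumptions, not the operators THC actually ships.

    // Illustrative sketch only -- not the THC definition.
    // A pointwise functor with an in-place overload (the form passed to
    // THC_pointwiseApply1) and an out-of-place overload (the form passed to
    // THC_pointwiseApply2), parameterized on the element type T.
    template <typename T>
    struct AddConstantSketch {
      explicit AddConstantSketch(T alpha) : alpha(alpha) {}

      __device__ __forceinline__ void operator()(T* v) const {
        *v += alpha;            // self += alpha (in place)
      }

      __device__ __forceinline__ void operator()(T* out, T* in) const {
        *out = *in + alpha;     // self = src + alpha
      }

      const T alpha;
    };

Because the functor carries the constant by value, host-side scalars are converted once up front (ScalarConvert / ScalarNegate / ScalarInv in the code above), which is presumably what lets the same generic THCTensor_(add)/(sub)/(mul)/(div) bodies also cover the half-precision type once CUDA_HALF_TENSOR is defined.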