github.com/torch/cutorch.git (mirror)
path: root/lib/THC
Diffstat (limited to 'lib/THC')
-rw-r--r--  lib/THC/CMakeLists.txt | 17
-rw-r--r--  lib/THC/THCApply.cu | 10
-rw-r--r--  lib/THC/THCApply.cuh | 566
-rw-r--r--  lib/THC/THCGenerateAllTypes.h | 21
-rw-r--r--  lib/THC/THCHalf.cu | 21
-rw-r--r--  lib/THC/THCHalf.h | 12
-rw-r--r--  lib/THC/THCReduce.cuh | 150
-rw-r--r--  lib/THC/THCReduceAll.cuh | 112
-rw-r--r--  lib/THC/THCReduceApplyUtils.cu | 93
-rw-r--r--  lib/THC/THCReduceApplyUtils.cuh | 283
-rw-r--r--  lib/THC/THCSortUtils.cuh | 9
-rw-r--r--  lib/THC/THCStorageCopy.h | 1
-rw-r--r--  lib/THC/THCTensorCopy.cu | 221
-rw-r--r--  lib/THC/THCTensorCopy.h | 1
-rw-r--r--  lib/THC/THCTensorIndex.cu | 300
-rw-r--r--  lib/THC/THCTensorInfo.cuh | 279
-rw-r--r--  lib/THC/THCTensorMasked.cu | 6
-rw-r--r--  lib/THC/THCTensorMath.cu | 143
-rw-r--r--  lib/THC/THCTensorMath.h | 23
-rw-r--r--  lib/THC/THCTensorMath2.cu | 22
-rw-r--r--  lib/THC/THCTensorMathCompare.cu | 2
-rw-r--r--  lib/THC/THCTensorMathCompareT.cu | 2
-rw-r--r--  lib/THC/THCTensorMathPairwise.cu | 133
-rw-r--r--  lib/THC/THCTensorMathPointwise.cu | 429
-rw-r--r--  lib/THC/THCTensorScatterGather.cu | 125
-rw-r--r--  lib/THC/THCTensorSort.cu | 39
-rw-r--r--  lib/THC/THCTensorTopK.cu | 108
-rw-r--r--  lib/THC/THCTensorTypeUtils.cu | 210
-rw-r--r--  lib/THC/THCTensorTypeUtils.cuh | 184
-rw-r--r--  lib/THC/generic/THCStorage.c | 25
-rw-r--r--  lib/THC/generic/THCStorage.cu | 3
-rw-r--r--  lib/THC/generic/THCStorage.h | 14
-rw-r--r--  lib/THC/generic/THCStorageCopy.cu | 4
-rw-r--r--  lib/THC/generic/THCStorageCopy.h | 2
-rw-r--r--  lib/THC/generic/THCTensor.c | 16
-rw-r--r--  lib/THC/generic/THCTensor.h | 18
-rw-r--r--  lib/THC/generic/THCTensorCopy.c | 1
-rw-r--r--  lib/THC/generic/THCTensorCopy.cu | 273
-rw-r--r--  lib/THC/generic/THCTensorCopy.h | 3
-rw-r--r--  lib/THC/generic/THCTensorMath.cu | 68
-rw-r--r--  lib/THC/generic/THCTensorMath.h | 13
-rw-r--r--  lib/THC/generic/THCTensorMathPairwise.cu | 74
-rw-r--r--  lib/THC/generic/THCTensorMathPairwise.h | 10
-rw-r--r--  lib/THC/generic/THCTensorMathPointwise.cu | 157
-rw-r--r--  lib/THC/generic/THCTensorMathPointwise.h | 11
45 files changed, 2479 insertions(+), 1735 deletions(-)
diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt
index 3a035de..dfc7ec3 100644
--- a/lib/THC/CMakeLists.txt
+++ b/lib/THC/CMakeLists.txt
@@ -124,16 +124,21 @@ SET(src-cuda
THCTensorConv.cu
THCTensorRandom.cu
THCTensorScatterGather.cu
- THCApply.cu
THCTensorSort.cu
THCTensorTopK.cu
+ THCTensorTypeUtils.cu
)
+MESSAGE(STATUS "got cuda version " ${CUDA_VERSION})
+
IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
+ MESSAGE(STATUS "Found CUDA with FP16 support, compiling with torch.CudaHalfTensor")
LIST(APPEND src-cuda THCHalf.cu)
LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1")
SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}")
-ENDIF()
+ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
+ MESSAGE(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor")
+ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
CUDA_ADD_LIBRARY(THC SHARED ${src} ${src-cuda})
CUDA_ADD_CUBLAS_TO_TARGET(THC)
@@ -176,6 +181,8 @@ INSTALL(FILES
THCDeviceTensorUtils-inl.cuh
THCGenerateAllTypes.h
THCHalf.h
+ THCTensorInfo.cuh
+ THCTensorTypeUtils.cuh
DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC")
INSTALL(FILES
@@ -191,4 +198,10 @@ INSTALL(FILES
generic/THCTensorCopy.c
generic/THCTensorCopy.cu
generic/THCTensorCopy.h
+ generic/THCTensorMath.h
+ generic/THCTensorMath.cu
+ generic/THCTensorMathPairwise.h
+ generic/THCTensorMathPairwise.cu
+ generic/THCTensorMathPointwise.h
+ generic/THCTensorMathPointwise.cu
DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC/generic")
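
The ELSE branch added above only changes what is reported at configure time; the functional piece is the -DCUDA_HAS_FP16=1 definition appended to both the NVCC and C flags. A minimal sketch (hypothetical source file) of how code built with those flags is expected to pick it up; the CUDA_HALF_TENSOR guard it feeds is defined in THCHalf.h further down in this diff:

    // Sketch only: compiled with the flags set by the CMake branch above.
    #include "THCHalf.h"     // maps CUDA_HAS_FP16 / CUDA_VERSION onto CUDA_HALF_TENSOR
    #ifdef CUDA_HALF_TENSOR
    #include <cuda_fp16.h>   // the half type is available to this translation unit
    #endif
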
diff --git a/lib/THC/THCApply.cu b/lib/THC/THCApply.cu
deleted file mode 100644
index 3ee9d51..0000000
--- a/lib/THC/THCApply.cu
+++ /dev/null
@@ -1,10 +0,0 @@
-#include "THCApply.cuh"
-
-// Implementation of copyIgnoringOverlaps, defined after pointwiseApply2.
-void THCudaTensor_copyIgnoringOverlaps(THCState* state,
- THCudaTensor* dst,
- THCudaTensor* src) {
- THCudaTensor_pointwiseApply2(state, dst, src, CopyOp<float>(),
- ReadOnly, // ignore overwrites
- ReadOnly);
-}
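
The deleted THCudaTensor_copyIgnoringOverlaps is not lost; the same behavior is now reached through TensorUtils<TensorType>::copyIgnoringOverlaps, which the apply functions below call. A minimal sketch of the pattern the deleted file implemented, restated against the new templated entry point; the wrapper name is hypothetical:

    // Sketch: copy src into dst, passing both tensors as ReadOnly so the overlap
    // check in THC_pointwiseApply2 is skipped and the last writer simply wins.
    template <typename T>
    struct CopyOp {
      __device__ __forceinline__ void operator()(T* dst, T* src) {
        *dst = *src;
      }
    };

    void copyIgnoringOverlapsSketch(THCState* state,
                                    THCudaTensor* dst, THCudaTensor* src) {
      THC_pointwiseApply2(state, dst, src, CopyOp<float>(), ReadOnly, ReadOnly);
    }
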
diff --git a/lib/THC/THCApply.cuh b/lib/THC/THCApply.cuh
index 707e22f..dd6d32a 100644
--- a/lib/THC/THCApply.cuh
+++ b/lib/THC/THCApply.cuh
@@ -3,6 +3,7 @@
#include "THCTensorCopy.h"
#include "THCReduceApplyUtils.cuh"
+#include "THCTensorTypeUtils.cuh"
//
// This file contains pointwise operation functions and kernels that
@@ -12,81 +13,85 @@
//
// Threads per block for our apply kernel
+// FIXME: use occupancy calculator instead
#define THC_APPLY_THREADS_PER_BLOCK 32 * 16
-// Called when we are copying into an overlapping index `dst`, but
-// we don't care which writer wins. Hacky but it works.
-THC_API void THCudaTensor_copyIgnoringOverlaps(THCState* state,
- THCudaTensor* dst,
- THCudaTensor* src);
-
-template <typename Op, typename IndexType, int ADims>
+template <typename Op,
+ typename Ta,
+ typename IndexType,
+ int ADims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
-THCudaTensor_pointwiseApply1(TensorInfo<IndexType> a,
- IndexType totalElements,
- Op op) {
+kernelPointwiseApply1(TensorInfo<Ta, IndexType> a,
+ IndexType totalElements,
+ Op op) {
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < totalElements;
linearIndex += gridDim.x * blockDim.x) {
// Convert `linearIndex` into an offset of `a`
const IndexType aOffset =
- IndexToOffset<IndexType, ADims>::get(linearIndex, a);
+ IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a);
op(&a.data[aOffset]);
}
}
-template <typename Op, typename IndexType, int ADims, int BDims>
+template <typename Op,
+ typename Ta, typename Tb,
+ typename IndexType,
+ int ADims, int BDims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
-THCudaTensor_pointwiseApply2(TensorInfo<IndexType> a,
- TensorInfo<IndexType> b,
- IndexType totalElements,
- Op op) {
+kernelPointwiseApply2(TensorInfo<Ta, IndexType> a,
+ TensorInfo<Tb, IndexType> b,
+ IndexType totalElements,
+ Op op) {
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < totalElements;
linearIndex += gridDim.x * blockDim.x) {
// Convert `linearIndex` into an offset of `a`
const IndexType aOffset =
- IndexToOffset<IndexType, ADims>::get(linearIndex, a);
+ IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a);
// Convert `linearIndex` into an offset of `b`
const IndexType bOffset =
- IndexToOffset<IndexType, BDims>::get(linearIndex, b);
+ IndexToOffset<Tb, IndexType, BDims>::get(linearIndex, b);
op(&a.data[aOffset], &b.data[bOffset]);
}
}
-template <typename Op, typename IndexType, int ADims, int BDims, int CDims>
+template <typename Op,
+ typename Ta, typename Tb, typename Tc,
+ typename IndexType,
+ int ADims, int BDims, int CDims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
-THCudaTensor_pointwiseApply3(TensorInfo<IndexType> a,
- TensorInfo<IndexType> b,
- TensorInfo<IndexType> c,
- IndexType totalElements,
- Op op) {
+kernelPointwiseApply3(TensorInfo<Ta, IndexType> a,
+ TensorInfo<Tb, IndexType> b,
+ TensorInfo<Tc, IndexType> c,
+ IndexType totalElements,
+ Op op) {
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
linearIndex < totalElements;
linearIndex += gridDim.x * blockDim.x) {
// Convert `linearIndex` into an offset of `a`
const IndexType aOffset =
- IndexToOffset<IndexType, ADims>::get(linearIndex, a);
+ IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a);
// Convert `linearIndex` into an offset of `b`
const IndexType bOffset =
- IndexToOffset<IndexType, BDims>::get(linearIndex, b);
+ IndexToOffset<Tb, IndexType, BDims>::get(linearIndex, b);
// Convert `linearIndex` into an offset of `c`
const IndexType cOffset =
- IndexToOffset<IndexType, CDims>::get(linearIndex, c);
+ IndexToOffset<Tc, IndexType, CDims>::get(linearIndex, c);
op(&a.data[aOffset], &b.data[bOffset], &c.data[cOffset]);
}
@@ -116,18 +121,17 @@ inline bool getApplyGrid(THCState* state, long totalElements, dim3& grid) {
return true;
}
-template <typename Op>
-bool THCudaTensor_pointwiseApply1(THCState* state,
- THCudaTensor* a,
- const Op& op,
- TensorArgType aType = ReadWrite) {
- long totalElements = THCudaTensor_nElement(state, a);
-
- if (THCudaTensor_nDimension(state, a) > MAX_CUTORCH_DIMS) {
+template <typename TensorTypeA,
+ typename Op>
+bool THC_pointwiseApply1(THCState* state,
+ TensorTypeA* a,
+ const Op& op,
+ TensorArgType aType = ReadWrite) {
+ if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS) {
return false;
}
- if (THCudaTensor_nDimension(state, a) == 0) {
+ if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) {
// Zero-dim tensor; do nothing
return true;
}
@@ -135,6 +139,8 @@ bool THCudaTensor_pointwiseApply1(THCState* state,
const dim3 block = getApplyBlock();
dim3 grid;
+ long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
+
if (!getApplyGrid(state, totalElements, grid)) {
return false;
}
@@ -148,12 +154,13 @@ bool THCudaTensor_pointwiseApply1(THCState* state,
// indices of a tensor with overlapping indices should probably be
// an error, since it is unclear which one should win), but we will
// preserve this last-writer-wins (in arbitrary copy order) behavior.
- THCudaTensor* oldA = NULL;
+ TensorTypeA* oldA = NULL;
- if (aType == ReadWrite && THC_overlappingIndices(state, a)) {
+ if (aType == ReadWrite &&
+ TensorUtils<TensorTypeA>::overlappingIndices(state, a)) {
// Must perform in contiguous space
oldA = a;
- a = THCudaTensor_newContiguous(state, a);
+ a = TensorUtils<TensorTypeA>::newContiguous(state, a);
}
// It is possible that the tensor dimensions are able to be collapsed,
@@ -164,55 +171,60 @@ bool THCudaTensor_pointwiseApply1(THCState* state,
// (or vice versa), the contiguous tensor can be collapsed to one
// dimension, and the loop to translate the linear index to the array
// index can be similarly collapsed. That is what this unrolling is for.
-#define HANDLE_CASE(TYPE, A) \
- THCudaTensor_pointwiseApply1<Op, TYPE, A> \
- <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
+#define HANDLE_CASE(TYPE, A) \
+ kernelPointwiseApply1<Op, \
+ typename TensorUtils<TensorTypeA>::DataType, \
+ TYPE, A> \
+ <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
aInfo, (TYPE) totalElements, op);
-#define HANDLE_A_CASE(TYPE, A) \
- { \
- if (aInfo.isContiguous()) { \
- HANDLE_CASE(TYPE, -2); \
- } else { \
- switch (A) { \
- case 1: \
- HANDLE_CASE(TYPE, 1); \
- break; \
- case 2: \
- HANDLE_CASE(TYPE, 2); \
- break; \
- case 3: \
- HANDLE_CASE(TYPE, 3); \
- break; \
- default: \
- HANDLE_CASE(TYPE, -1); \
- break; \
- } \
- } \
+#define HANDLE_A_CASE(TYPE, A) \
+ { \
+ if (aInfo.isContiguous()) { \
+ HANDLE_CASE(TYPE, -2); \
+ } else { \
+ switch (A) { \
+ case 1: \
+ HANDLE_CASE(TYPE, 1); \
+ break; \
+ case 2: \
+ HANDLE_CASE(TYPE, 2); \
+ break; \
+ default: \
+ HANDLE_CASE(TYPE, -1); \
+ break; \
+ } \
+ } \
}
// Can we use 32-bit integer math in the kernel (the linear ID for the copy
// and the resulting non-linear offset is all computable using 32-bit math?)
// We also use unsigned index math in the kernel, as signed div/mod has
// additional overhead.
- if (THC_canUse32BitIndexMath(state, a)) {
- TensorInfo<unsigned int> aInfo(state, a);
+ if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a)) {
+ TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo =
+ getTensorInfo<TensorTypeA, unsigned int>(state, a);
aInfo.collapseDims();
HANDLE_A_CASE(unsigned int, aInfo.dims);
} else {
- TensorInfo<unsigned long> aInfo(state, a);
+ TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned long> aInfo =
+ getTensorInfo<TensorTypeA, unsigned long>(state, a);
aInfo.collapseDims();
// For large tensors, we only compile the completely contiguous
// version and the completely generic version, to reduce
// compilation time.
if (aInfo.isContiguous()) {
- THCudaTensor_pointwiseApply1<Op, unsigned long, -2>
+ kernelPointwiseApply1<Op,
+ typename TensorUtils<TensorTypeA>::DataType,
+ unsigned long, -2>
<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
aInfo, (unsigned long) totalElements, op);
} else {
- THCudaTensor_pointwiseApply1<Op, unsigned long, -1>
+ kernelPointwiseApply1<Op,
+ typename TensorUtils<TensorTypeA>::DataType,
+ unsigned long, -1>
<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
aInfo, (unsigned long) totalElements, op);
}
@@ -221,36 +233,38 @@ bool THCudaTensor_pointwiseApply1(THCState* state,
#undef HANDLE_A_CASE
if (oldA) {
- // Ignore overlaps when copying back; if we use THCudaTensor_copy
+ // Ignore overlaps when copying back; if we use THCTensor_copy
// instead, it will recursively try and invoke ourselves to make
// oldA contiguous.
- THCudaTensor_copyIgnoringOverlaps(state, oldA, a);
- THCudaTensor_free(state, a);
+ TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a);
+ TensorUtils<TensorTypeA>::free(state, a);
a = oldA;
}
return true;
}
-template <typename Op>
-bool THCudaTensor_pointwiseApply2(THCState* state,
- THCudaTensor* a,
- THCudaTensor* b,
- const Op& op,
- TensorArgType aType = ReadWrite,
- TensorArgType bType = ReadOnly) {
- long totalElements = THCudaTensor_nElement(state, a);
-
- if (totalElements != THCudaTensor_nElement(state, b)) {
+template <typename TensorTypeA,
+ typename TensorTypeB,
+ typename Op>
+bool THC_pointwiseApply2(THCState* state,
+ TensorTypeA* a,
+ TensorTypeB* b,
+ const Op& op,
+ TensorArgType aType = ReadWrite,
+ TensorArgType bType = ReadOnly) {
+ long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
+
+ if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b)) {
return false;
}
- if (THCudaTensor_nDimension(state, a) > MAX_CUTORCH_DIMS ||
- THCudaTensor_nDimension(state, b) > MAX_CUTORCH_DIMS) {
+ if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS ||
+ TensorUtils<TensorTypeB>::getDims(state, b) > MAX_CUTORCH_DIMS) {
return false;
}
- if (THCudaTensor_nDimension(state, a) == 0) {
+ if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) {
// Zero-dim tensor; do nothing
return true;
}
@@ -271,18 +285,20 @@ bool THCudaTensor_pointwiseApply2(THCState* state,
// indices of a tensor with overlapping indices should probably be
// an error, since it is unclear which one should win), but we will
// preserve this last-writer-wins (in arbitrary copy order) behavior.
- THCudaTensor* oldA = NULL;
- THCudaTensor* oldB = NULL;
+ TensorTypeA* oldA = NULL;
+ TensorTypeB* oldB = NULL;
- if (aType == ReadWrite && THC_overlappingIndices(state, a)) {
+ if (aType == ReadWrite &&
+ TensorUtils<TensorTypeA>::overlappingIndices(state, a)) {
// Must perform in contiguous space
oldA = a;
- a = THCudaTensor_newContiguous(state, a);
+ a = TensorUtils<TensorTypeA>::newContiguous(state, a);
}
- if (bType == ReadWrite && THC_overlappingIndices(state, b)) {
+ if (bType == ReadWrite &&
+ TensorUtils<TensorTypeB>::overlappingIndices(state, b)) {
// Must perform in contiguous space
oldB = b;
- b = THCudaTensor_newContiguous(state, b);
+ b = TensorUtils<TensorTypeB>::newContiguous(state, b);
}
// It is possible that the tensor dimensions are able to be collapsed,
@@ -293,80 +309,87 @@ bool THCudaTensor_pointwiseApply2(THCState* state,
// (or vice versa), the contiguous tensor can be collapsed to one
// dimension, and the loop to translate the linear index to the array
// index can be similarly collapsed. That is what this unrolling is for.
-#define HANDLE_CASE(TYPE, A, B) \
- THCudaTensor_pointwiseApply2<Op, TYPE, A, B> \
- <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
+#define HANDLE_CASE(TYPE, A, B) \
+ kernelPointwiseApply2<Op, \
+ typename TensorUtils<TensorTypeA>::DataType, \
+ typename TensorUtils<TensorTypeB>::DataType, \
+ TYPE, A, B> \
+ <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
aInfo, bInfo, (TYPE) totalElements, op);
-#define HANDLE_B_CASE(TYPE, A, B) \
- { \
- if (bInfo.isContiguous()) { \
- HANDLE_CASE(TYPE, A, -2); \
- } else { \
- switch (B) { \
- case 1: \
- HANDLE_CASE(TYPE, A, 1); \
- break; \
- case 2: \
- HANDLE_CASE(TYPE, A, 2); \
- break; \
- case 3: \
- HANDLE_CASE(TYPE, A, 3); \
- break; \
- default: \
- HANDLE_CASE(TYPE, A, -1); \
- break; \
- } \
- } \
- }
-
-#define HANDLE_A_CASE(TYPE, A, B) \
- { \
- if (aInfo.isContiguous()) { \
- HANDLE_B_CASE(TYPE, -2, B); \
- } else { \
- switch (A) { \
- case 1: \
- HANDLE_B_CASE(TYPE, 1, B); \
- break; \
- case 2: \
- HANDLE_B_CASE(TYPE, 2, B); \
- break; \
- case 3: \
- HANDLE_B_CASE(TYPE, 3, B); \
- break; \
- default: \
- HANDLE_B_CASE(TYPE, -1, B); \
- break; \
- } \
- } \
- }
-
- if (THC_canUse32BitIndexMath(state, a) &&
- THC_canUse32BitIndexMath(state, b)) {
- TensorInfo<unsigned int> aInfo(state, a);
+#define HANDLE_B_CASE(TYPE, A, B) \
+ { \
+ if (bInfo.isContiguous()) { \
+ HANDLE_CASE(TYPE, A, -2); \
+ } else { \
+ switch (B) { \
+ case 1: \
+ HANDLE_CASE(TYPE, A, 1); \
+ break; \
+ case 2: \
+ HANDLE_CASE(TYPE, A, 2); \
+ break; \
+ default: \
+ HANDLE_CASE(TYPE, A, -1); \
+ break; \
+ } \
+ } \
+ }
+
+#define HANDLE_A_CASE(TYPE, A, B) \
+ { \
+ if (aInfo.isContiguous()) { \
+ HANDLE_B_CASE(TYPE, -2, B); \
+ } else { \
+ switch (A) { \
+ case 1: \
+ HANDLE_B_CASE(TYPE, 1, B); \
+ break; \
+ case 2: \
+ HANDLE_B_CASE(TYPE, 2, B); \
+ break; \
+ default: \
+ HANDLE_B_CASE(TYPE, -1, B); \
+ break; \
+ } \
+ } \
+ }
+
+ if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a) &&
+ TensorUtils<TensorTypeB>::canUse32BitIndexMath(state, b)) {
+ TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo =
+ getTensorInfo<TensorTypeA, unsigned int>(state, a);
aInfo.collapseDims();
- TensorInfo<unsigned int> bInfo(state, b);
+ TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned int> bInfo =
+ getTensorInfo<TensorTypeB, unsigned int>(state, b);
bInfo.collapseDims();
HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims);
} else {
- TensorInfo<unsigned long> aInfo(state, a);
+ TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned long> aInfo =
+ getTensorInfo<TensorTypeA, unsigned long>(state, a);
aInfo.collapseDims();
- TensorInfo<unsigned long> bInfo(state, b);
+ TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned long> bInfo =
+ getTensorInfo<TensorTypeB, unsigned long>(state, b);
bInfo.collapseDims();
// For large tensors, we only compile the completely contiguous
// version and the completely generic version, to reduce
// compilation time.
if (aInfo.isContiguous() && bInfo.isContiguous()) {
- THCudaTensor_pointwiseApply2<Op, unsigned long, -2, -2>
+ kernelPointwiseApply2<Op,
+ typename TensorUtils<TensorTypeA>::DataType,
+ typename TensorUtils<TensorTypeB>::DataType,
+ unsigned long, -2, -2>
<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
aInfo, bInfo, (unsigned long) totalElements, op);
} else {
- THCudaTensor_pointwiseApply2<Op, unsigned long, -1, -1>
+ kernelPointwiseApply2<Op,
+ typename TensorUtils<TensorTypeA>::DataType,
+ typename TensorUtils<TensorTypeB>::DataType,
+ unsigned long, -1, -1>
<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
aInfo, bInfo, (unsigned long) totalElements, op);
}
@@ -376,49 +399,52 @@ bool THCudaTensor_pointwiseApply2(THCState* state,
#undef HANDLE_A_CASE
if (oldA) {
- // Ignore overlaps when copying back; if we use THCudaTensor_copy
+ // Ignore overlaps when copying back; if we use THCTensor_copy
// instead, it will recursively try and invoke ourselves to make
// oldA contiguous.
- THCudaTensor_copyIgnoringOverlaps(state, oldA, a);
- THCudaTensor_free(state, a);
+ TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a);
+ TensorUtils<TensorTypeA>::free(state, a);
a = oldA;
}
if (oldB) {
- // Ignore overlaps when copying back; if we use THCudaTensor_copy
+ // Ignore overlaps when copying back; if we use THCTensor_copy
// instead, it will recursively try and invoke ourselves to make
// oldB contiguous.
- THCudaTensor_copyIgnoringOverlaps(state, oldB, b);
- THCudaTensor_free(state, b);
+ TensorUtils<TensorTypeB>::copyIgnoringOverlaps(state, oldB, b);
+ TensorUtils<TensorTypeB>::free(state, b);
b = oldB;
}
return true;
}
-template <typename Op>
-bool THCudaTensor_pointwiseApply3(THCState* state,
- THCudaTensor* a,
- THCudaTensor* b,
- THCudaTensor* c,
- const Op& op,
- TensorArgType aType = ReadWrite,
- TensorArgType bType = ReadOnly,
- TensorArgType cType = ReadOnly) {
- long totalElements = THCudaTensor_nElement(state, a);
-
- if (totalElements != THCudaTensor_nElement(state, b) ||
- totalElements != THCudaTensor_nElement(state, c)) {
+template <typename TensorTypeA,
+ typename TensorTypeB,
+ typename TensorTypeC,
+ typename Op>
+bool THC_pointwiseApply3(THCState* state,
+ TensorTypeA* a,
+ TensorTypeB* b,
+ TensorTypeC* c,
+ const Op& op,
+ TensorArgType aType = ReadWrite,
+ TensorArgType bType = ReadOnly,
+ TensorArgType cType = ReadOnly) {
+ long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
+
+ if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b) ||
+ totalElements != TensorUtils<TensorTypeC>::getNumElements(state, c)) {
return false;
}
- if (THCudaTensor_nDimension(state, a) > MAX_CUTORCH_DIMS ||
- THCudaTensor_nDimension(state, b) > MAX_CUTORCH_DIMS ||
- THCudaTensor_nDimension(state, c) > MAX_CUTORCH_DIMS) {
+ if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS ||
+ TensorUtils<TensorTypeB>::getDims(state, b) > MAX_CUTORCH_DIMS ||
+ TensorUtils<TensorTypeC>::getDims(state, c) > MAX_CUTORCH_DIMS) {
return false;
}
- if (THCudaTensor_nDimension(state, a) == 0) {
+ if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) {
// Zero-dim tensor; do nothing
return true;
}
@@ -439,131 +465,141 @@ bool THCudaTensor_pointwiseApply3(THCState* state,
// indices of a tensor with overlapping indices should probably be
// an error, since it is unclear which one should win), but we will
// preserve this last-writer-wins (in arbitrary copy order) behavior.
- THCudaTensor* oldA = NULL;
- THCudaTensor* oldB = NULL;
- THCudaTensor* oldC = NULL;
+ TensorTypeA* oldA = NULL;
+ TensorTypeB* oldB = NULL;
+ TensorTypeC* oldC = NULL;
- if (aType == ReadWrite && THC_overlappingIndices(state, a)) {
+ if (aType == ReadWrite &&
+ TensorUtils<TensorTypeA>::overlappingIndices(state, a)) {
// Must perform in contiguous space
oldA = a;
- a = THCudaTensor_newContiguous(state, a);
+ a = TensorUtils<TensorTypeA>::newContiguous(state, a);
}
-
- if (bType == ReadWrite && THC_overlappingIndices(state, b)) {
+ if (bType == ReadWrite &&
+ TensorUtils<TensorTypeB>::overlappingIndices(state, b)) {
// Must perform in contiguous space
oldB = b;
- b = THCudaTensor_newContiguous(state, b);
+ b = TensorUtils<TensorTypeB>::newContiguous(state, b);
}
-
- if (cType == ReadWrite && THC_overlappingIndices(state, c)) {
+ if (cType == ReadWrite &&
+ TensorUtils<TensorTypeC>::overlappingIndices(state, c)) {
// Must perform in contiguous space
oldC = c;
- c = THCudaTensor_newContiguous(state, c);
+ c = TensorUtils<TensorTypeC>::newContiguous(state, c);
}
#define HANDLE_CASE(TYPE, A, B, C) \
- THCudaTensor_pointwiseApply3<Op, TYPE, A, B, C> \
+ kernelPointwiseApply3<Op, \
+ typename TensorUtils<TensorTypeA>::DataType, \
+ typename TensorUtils<TensorTypeB>::DataType, \
+ typename TensorUtils<TensorTypeC>::DataType, \
+ TYPE, A, B, C> \
<<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
aInfo, bInfo, cInfo, (TYPE) totalElements, op);
-#define HANDLE_C_CASE(TYPE, A, B, C) \
- { \
- if (cInfo.isContiguous()) { \
- HANDLE_CASE(TYPE, A, B, -2); \
- } else { \
- switch (C) { \
- case 1: \
- HANDLE_CASE(TYPE, A, B, 1); \
- break; \
- case 2: \
- HANDLE_CASE(TYPE, A, B, 2); \
- break; \
- case 3: \
- HANDLE_CASE(TYPE, A, B, 3); \
- break; \
- default: \
- HANDLE_CASE(TYPE, A, B, -1); \
- break; \
- } \
- } \
- }
-
-#define HANDLE_B_CASE(TYPE, A, B, C) \
- { \
- if (bInfo.isContiguous()) { \
- HANDLE_C_CASE(TYPE, A, -2, C); \
- } else { \
- switch (B) { \
- case 1: \
- HANDLE_C_CASE(TYPE, A, 1, C); \
- break; \
- case 2: \
- HANDLE_C_CASE(TYPE, A, 2, C); \
- break; \
- case 3: \
- HANDLE_C_CASE(TYPE, A, 3, C); \
- break; \
- default: \
- HANDLE_C_CASE(TYPE, A, -1, C); \
- break; \
- } \
- } \
- }
-
-#define HANDLE_A_CASE(TYPE, A, B, C) \
- { \
- if (aInfo.isContiguous()) { \
- HANDLE_B_CASE(TYPE, -2, B, C); \
- } else { \
- switch (A) { \
- case 1: \
- HANDLE_B_CASE(TYPE, 1, B, C); \
- break; \
- case 2: \
- HANDLE_B_CASE(TYPE, 2, B, C); \
- break; \
- case 3: \
- HANDLE_B_CASE(TYPE, 3, B, C); \
- break; \
- default: \
- HANDLE_B_CASE(TYPE, -1, B, C); \
- break; \
- } \
- } \
- }
-
- if (THC_canUse32BitIndexMath(state, a) &&
- THC_canUse32BitIndexMath(state, b) &&
- THC_canUse32BitIndexMath(state, c)) {
- TensorInfo<unsigned int> aInfo(state, a);
+#define HANDLE_C_CASE(TYPE, A, B, C) \
+ { \
+ if (cInfo.isContiguous()) { \
+ HANDLE_CASE(TYPE, A, B, -2); \
+ } else { \
+ switch (C) { \
+ case 1: \
+ HANDLE_CASE(TYPE, A, B, 1); \
+ break; \
+ case 2: \
+ HANDLE_CASE(TYPE, A, B, 2); \
+ break; \
+ default: \
+ HANDLE_CASE(TYPE, A, B, -1); \
+ break; \
+ } \
+ } \
+ }
+
+#define HANDLE_B_CASE(TYPE, A, B, C) \
+ { \
+ if (bInfo.isContiguous()) { \
+ HANDLE_C_CASE(TYPE, A, -2, C); \
+ } else { \
+ switch (B) { \
+ case 1: \
+ HANDLE_C_CASE(TYPE, A, 1, C); \
+ break; \
+ case 2: \
+ HANDLE_C_CASE(TYPE, A, 2, C); \
+ break; \
+ default: \
+ HANDLE_C_CASE(TYPE, A, -1, C); \
+ break; \
+ } \
+ } \
+ }
+
+#define HANDLE_A_CASE(TYPE, A, B, C) \
+ { \
+ if (aInfo.isContiguous()) { \
+ HANDLE_B_CASE(TYPE, -2, B, C); \
+ } else { \
+ switch (A) { \
+ case 1: \
+ HANDLE_B_CASE(TYPE, 1, B, C); \
+ break; \
+ case 2: \
+ HANDLE_B_CASE(TYPE, 2, B, C); \
+ break; \
+ default: \
+ HANDLE_B_CASE(TYPE, -1, B, C); \
+ break; \
+ } \
+ } \
+ }
+
+ if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a) &&
+ TensorUtils<TensorTypeB>::canUse32BitIndexMath(state, b) &&
+ TensorUtils<TensorTypeC>::canUse32BitIndexMath(state, c)) {
+ TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo =
+ getTensorInfo<TensorTypeA, unsigned int>(state, a);
aInfo.collapseDims();
- TensorInfo<unsigned int> bInfo(state, b);
+ TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned int> bInfo =
+ getTensorInfo<TensorTypeB, unsigned int>(state, b);
bInfo.collapseDims();
- TensorInfo<unsigned int> cInfo(state, c);
+ TensorInfo<typename TensorUtils<TensorTypeC>::DataType, unsigned int> cInfo =
+ getTensorInfo<TensorTypeC, unsigned int>(state, c);
cInfo.collapseDims();
HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims);
} else {
- TensorInfo<unsigned long> aInfo(state, a);
+ TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned long> aInfo =
+ getTensorInfo<TensorTypeA, unsigned long>(state, a);
aInfo.collapseDims();
- TensorInfo<unsigned long> bInfo(state, b);
+ TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned long> bInfo =
+ getTensorInfo<TensorTypeB, unsigned long>(state, b);
bInfo.collapseDims();
- TensorInfo<unsigned long> cInfo(state, c);
+ TensorInfo<typename TensorUtils<TensorTypeC>::DataType, unsigned long> cInfo =
+ getTensorInfo<TensorTypeC, unsigned long>(state, c);
cInfo.collapseDims();
// For large tensors, we only compile the completely contiguous
// version and the completely generic version, to reduce
// compilation time.
if (aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous()) {
- THCudaTensor_pointwiseApply3<Op, unsigned long, -2, -2, -2>
+ kernelPointwiseApply3<Op,
+ typename TensorUtils<TensorTypeA>::DataType,
+ typename TensorUtils<TensorTypeB>::DataType,
+ typename TensorUtils<TensorTypeC>::DataType,
+ unsigned long, -2, -2, -2>
<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
aInfo, bInfo, cInfo, (unsigned long) totalElements, op);
} else {
- THCudaTensor_pointwiseApply3<Op, unsigned long, -1, -1, -1>
+ kernelPointwiseApply3<Op,
+ typename TensorUtils<TensorTypeA>::DataType,
+ typename TensorUtils<TensorTypeB>::DataType,
+ typename TensorUtils<TensorTypeC>::DataType,
+ unsigned long, -1, -1, -1>
<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
aInfo, bInfo, cInfo, (unsigned long) totalElements, op);
}
@@ -574,29 +610,29 @@ bool THCudaTensor_pointwiseApply3(THCState* state,
#undef HANDLE_A_CASE
if (oldA) {
- // Ignore overlaps when copying back; if we use THCudaTensor_copy
+ // Ignore overlaps when copying back; if we use THCTensor_copy
// instead, it will recursively try and invoke ourselves to make
// oldA contiguous.
- THCudaTensor_copyIgnoringOverlaps(state, oldA, a);
- THCudaTensor_free(state, a);
+ TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a);
+ TensorUtils<TensorTypeA>::free(state, a);
a = oldA;
}
if (oldB) {
- // Ignore overlaps when copying back; if we use THCudaTensor_copy
+ // Ignore overlaps when copying back; if we use THCTensor_copy
// instead, it will recursively try and invoke ourselves to make
// oldB contiguous.
- THCudaTensor_copyIgnoringOverlaps(state, oldB, b);
- THCudaTensor_free(state, b);
+ TensorUtils<TensorTypeB>::copyIgnoringOverlaps(state, oldB, b);
+ TensorUtils<TensorTypeB>::free(state, b);
b = oldB;
}
if (oldC) {
- // Ignore overlaps when copying back; if we use THCudaTensor_copy
+ // Ignore overlaps when copying back; if we use THCTensor_copy
// instead, it will recursively try and invoke ourselves to make
// oldC contiguous.
- THCudaTensor_copyIgnoringOverlaps(state, oldC, c);
- THCudaTensor_free(state, c);
+ TensorUtils<TensorTypeC>::copyIgnoringOverlaps(state, oldC, c);
+ TensorUtils<TensorTypeC>::free(state, c);
c = oldC;
}
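
Taken together, the changes to this header replace the float-only THCudaTensor_pointwiseApply1/2/3 with THC_pointwiseApply1/2/3, templated on the tensor type and driven by TensorUtils<> for element counts, overlap checks, and TensorInfo construction. A minimal caller sketch under the new signatures; the functor and wrapper names are hypothetical:

    // Hypothetical element-wise "b += a" for float tensors.
    struct AddIntoOp {
      __device__ __forceinline__ void operator()(float* b, float* a) {
        *b += *a;
      }
    };

    // THC_pointwiseApply2 deduces the tensor types, picks 32- vs 64-bit indexing,
    // and unrolls by collapsed dimension count; it returns false when the element
    // counts differ or a tensor exceeds MAX_CUTORCH_DIMS.
    bool addIntoSketch(THCState* state, THCudaTensor* b, THCudaTensor* a) {
      return THC_pointwiseApply2(state, b, a, AddIntoOp(), ReadWrite, ReadOnly);
    }
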
diff --git a/lib/THC/THCGenerateAllTypes.h b/lib/THC/THCGenerateAllTypes.h
index 044cbdf..28037bd 100644
--- a/lib/THC/THCGenerateAllTypes.h
+++ b/lib/THC/THCGenerateAllTypes.h
@@ -2,6 +2,8 @@
#error "You must define THC_GENERIC_FILE before including THGenerateAllTypes.h"
#endif
+#include "THCHalf.h"
+
#define THCTypeIdxByte 1
#define THCTypeIdxChar 2
#define THCTypeIdxShort 3
@@ -12,10 +14,6 @@
#define THCTypeIdxHalf 8
#define THCTypeIdx_(T) TH_CONCAT_2(THCTypeIdx,T)
-#define hostreal real
-#define hostrealToReal(x) (x)
-#define realToHostreal(x) (x)
-
#define real unsigned char
#define accreal long
#define Real Byte
@@ -108,14 +106,7 @@
#undef CReal
#undef THC_REAL_IS_DOUBLE
-#if CUDA_VERSION >= 7050
-
-#undef hostreal
-#undef hostrealToReal
-#undef realToHostreal
-#define hostreal float
-#define hostrealToReal(x) THC_float2half(x);
-#define realToHostreal(x) THC_half2float(x);
+#ifdef CUDA_HALF_TENSOR
#define real half
#define accreal half
@@ -130,11 +121,7 @@
#undef CReal
#undef THC_REAL_IS_HALF
-#endif // CUDA_VERSION >= 7050
-
-#undef hostreal
-#undef hostrealToReal
-#undef realToHostreal
+#endif // CUDA_HALF_TENSOR
#undef THCTypeIdxByte
#undef THCTypeIdxChar
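
With the hostreal shims removed, half values flow through the generic expansion as real directly, and the half pass is now gated on CUDA_HALF_TENSOR instead of a raw CUDA_VERSION compare. For orientation, a condensed sketch of the per-type expansion pattern this header drives (illustrative excerpt under that assumption, not the full file):

    /* Each pass rebinds the type macros, re-includes the generic source, then
       undefines them; the half pass only exists when CUDA_HALF_TENSOR is set. */
    #ifdef CUDA_HALF_TENSOR
    #define real half
    #define accreal half
    #define Real Half
    #define CReal CudaHalf
    #define THC_REAL_IS_HALF
    #include THC_GENERIC_FILE
    #undef real
    #undef accreal
    #undef Real
    #undef CReal
    #undef THC_REAL_IS_HALF
    #endif /* CUDA_HALF_TENSOR */
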
diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu
index e368d2e..7847ef6 100644
--- a/lib/THC/THCHalf.cu
+++ b/lib/THC/THCHalf.cu
@@ -32,8 +32,25 @@ void THCHalf2Float(THCState *state, float *out, half *in, long len) {
float THC_half2float(half a)
{
- THError("half2float not implemented yet");
- return 0;
+ unsigned int bits = a.x & 0x7fff;
+ unsigned int sign = a.x & 0x8000;
+ unsigned int exp = a.x & 0x7c00;
+
+ bits <<= 13;
+ sign <<= 16;
+
+ bits += 0x38000000U;
+
+ // flush denormals to 0
+ bits = (exp == 0 ? 0 : bits) | sign;
+
+ union {
+ float f;
+ unsigned int v;
+ } conv;
+ conv.v = bits;
+
+ return conv.f;
}
/*
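
THC_half2float is now a real host-side conversion: the half's sign, exponent, and mantissa are shifted into float position, the exponent is rebiased by adding 0x38000000 (the bias difference 127 - 15 shifted into the exponent field), and subnormal halves are flushed to zero. A small usage sketch of the converter pair declared in THCHalf.h; the wrapper name and value are assumptions:

    // Host-side round trip (requires CUDA_HALF_TENSOR, i.e. THCHalf.h's guards).
    float roundTripHalfSketch(void) {
      half h = THC_float2half(0.5f);   // half bit pattern 0x3800
      return THC_half2float(h);        // 0.5f; subnormal halves come back as +/-0
    }
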
diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h
index b20b05c..d87647b 100644
--- a/lib/THC/THCHalf.h
+++ b/lib/THC/THCHalf.h
@@ -3,7 +3,17 @@
#include "THCGeneral.h"
+// We compile with CudaHalfTensor support if we have this:
#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16
+#define CUDA_HALF_TENSOR 1
+#endif
+
+// Native fp16 ALU instructions are available if we have this:
+#if defined(CUDA_HALF_TENSOR) && (__CUDA_ARCH__ >= 530)
+#define CUDA_HALF_INSTRUCTIONS 1
+#endif
+
+#ifdef CUDA_HALF_TENSOR
#include <cuda_fp16.h>
#include <stdint.h>
@@ -13,6 +23,6 @@ THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len);
THC_EXTERNC half THC_float2half(float a);
THC_EXTERNC float THC_half2float(half a);
-#endif
+#endif // CUDA_HALF_TENSOR
#endif
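
The two new guards split cleanly: CUDA_HALF_TENSOR means half storage and tensors are compiled in at all, while CUDA_HALF_INSTRUCTIONS additionally means the target architecture (sm_53 and up) has native fp16 arithmetic. A sketch of how device code is expected to branch on them; the helper name is hypothetical:

    #ifdef CUDA_HALF_TENSOR
    __device__ __forceinline__ half halfAddSketch(half a, half b) {
    #ifdef CUDA_HALF_INSTRUCTIONS
      return __hadd(a, b);                                      // native fp16 add
    #else
      return __float2half(__half2float(a) + __half2float(b));   // promote through float
    #endif
    }
    #endif // CUDA_HALF_TENSOR
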
diff --git a/lib/THC/THCReduce.cuh b/lib/THC/THCReduce.cuh
index 1060e89..89f599f 100644
--- a/lib/THC/THCReduce.cuh
+++ b/lib/THC/THCReduce.cuh
@@ -20,17 +20,21 @@ __device__ __forceinline__ IndexType getReduceNoncontigDimSliceIndex() {
}
// Kernel that handles an entire reduction of a slice of a tensor per each thread
-template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims, int BDims>
+template <typename ModifyOp,
+ typename ReduceOp,
+ typename T,
+ typename IndexType,
+ int ADims, int BDims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
-THCudaTensor_reduceNoncontigDim(TensorInfo<IndexType> out,
- TensorInfo<IndexType> in,
+THCudaTensor_reduceNoncontigDim(TensorInfo<T, IndexType> out,
+ TensorInfo<T, IndexType> in,
IndexType reductionStride,
IndexType reductionSize,
IndexType totalSlices,
- float init,
+ T init,
ModifyOp modifyOp,
ReduceOp reduceOp) {
const IndexType sliceIndex = getReduceNoncontigDimSliceIndex<IndexType>();
@@ -42,13 +46,13 @@ THCudaTensor_reduceNoncontigDim(TensorInfo<IndexType> out,
// Each thread picks a point in `out` and `in` for which it is
// producing the reduction
const IndexType outOffset =
- IndexToOffset<IndexType, ADims>::get(sliceIndex, out);
+ IndexToOffset<T, IndexType, ADims>::get(sliceIndex, out);
const IndexType inBaseOffset =
- IndexToOffset<IndexType, BDims>::get(sliceIndex, in);
+ IndexToOffset<T, IndexType, BDims>::get(sliceIndex, in);
// For each point in reductionSize, reduce into `r`
IndexType inOffset = inBaseOffset;
- float r = init;
+ T r = init;
for (IndexType i = 0; i < reductionSize; ++i) {
r = reduceOp(r, modifyOp(in.data[inOffset]));
@@ -67,13 +71,17 @@ __device__ __forceinline__ IndexType getReduceContigDimSliceIndex() {
// Kernel that handles an entire reduction of a slice of a tensor per
// each block
-template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims, int BDims>
+template <typename ModifyOp,
+ typename ReduceOp,
+ typename T,
+ typename IndexType,
+ int ADims, int BDims>
__global__ void
-THCudaTensor_reduceContigDim(TensorInfo<IndexType> out,
- TensorInfo<IndexType> in,
+THCudaTensor_reduceContigDim(TensorInfo<T, IndexType> out,
+ TensorInfo<T, IndexType> in,
IndexType reductionSize,
IndexType totalSlices,
- float init,
+ T init,
ModifyOp modifyOp,
ReduceOp reduceOp) {
const IndexType sliceIndex = getReduceContigDimSliceIndex<IndexType>();
@@ -84,23 +92,23 @@ THCudaTensor_reduceContigDim(TensorInfo<IndexType> out,
// Get the offset in `out` for the reduction
const IndexType outOffset =
- IndexToOffset<IndexType, ADims>::get(sliceIndex, out);
+ IndexToOffset<T, IndexType, ADims>::get(sliceIndex, out);
// Get the base offset in `in` for this block's reduction
const IndexType inBaseOffset =
- IndexToOffset<IndexType, BDims>::get(sliceIndex, in);
+ IndexToOffset<T, IndexType, BDims>::get(sliceIndex, in);
// Each thread in the block will reduce some subset of elements in
// the slice. The elements are guaranteed contiguous starting at
// `inBaseOffset`.
- float r = init;
+ T r = init;
for (IndexType i = threadIdx.x; i < reductionSize; i += blockDim.x) {
r = reduceOp(r, modifyOp(in.data[inBaseOffset + i]));
}
// Reduce within the block
- extern __shared__ float smem[];
- r = reduceBlock<float, ReduceOp>(smem, blockDim.x, r, reduceOp, init);
+ extern __shared__ T smem[];
+ r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init);
if (threadIdx.x == 0) {
// Write out reduced value
@@ -208,86 +216,92 @@ bool THCudaTensor_reduceDim(THCState* state,
// (or vice versa), the contiguous tensor can be collapsed to one
// dimension, and the loop to translate the linear index to the array
// index can be similarly collapsed. That is what this unrolling is for.
-#define HANDLE_CASE(TYPE, OUT, IN) \
+#define HANDLE_CASE(T, TYPE, OUT, IN) \
if (contigReduction) { \
- THCudaTensor_reduceContigDim<ModifyOp, ReduceOp, TYPE, OUT, IN> \
+ THCudaTensor_reduceContigDim<ModifyOp, ReduceOp, T, TYPE, OUT, IN> \
<<<grid, block, smemSize, THCState_getCurrentStream(state)>>>( \
outInfo, inInfo, reductionSize, \
(TYPE) outElements, init, modifyOp, reduceOp); \
} else { \
- THCudaTensor_reduceNoncontigDim<ModifyOp, ReduceOp, TYPE, OUT, IN> \
+ THCudaTensor_reduceNoncontigDim<ModifyOp, ReduceOp, T, TYPE, OUT, IN> \
<<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
outInfo, inInfo, reductionStride, reductionSize, \
(TYPE) outElements, init, modifyOp, reduceOp); \
} \
-#define HANDLE_IN_CASE(TYPE, OUT, IN) \
- { \
- if (inInfo.isContiguous()) { \
- HANDLE_CASE(TYPE, OUT, -2); \
- } else { \
- switch (IN) { \
- case 1: \
- HANDLE_CASE(TYPE, OUT, 1); \
- break; \
- case 2: \
- HANDLE_CASE(TYPE, OUT, 2); \
- break; \
- case 3: \
- HANDLE_CASE(TYPE, OUT, 3); \
- break; \
- default: \
- HANDLE_CASE(TYPE, OUT, -1); \
- break; \
- } \
- } \
+#define HANDLE_IN_CASE(T, TYPE, OUT, IN) \
+ { \
+ if (inInfo.isContiguous()) { \
+ HANDLE_CASE(T, TYPE, OUT, -2); \
+ } else { \
+ switch (IN) { \
+ case 1: \
+ HANDLE_CASE(T, TYPE, OUT, 1); \
+ break; \
+ case 2: \
+ HANDLE_CASE(T, TYPE, OUT, 2); \
+ break; \
+ default: \
+ HANDLE_CASE(T, TYPE, OUT, -1); \
+ break; \
+ } \
+ } \
}
-#define HANDLE_OUT_CASE(TYPE, OUT, IN) \
- { \
- if (outInfo.isContiguous()) { \
- HANDLE_IN_CASE(TYPE, -2, IN); \
- } else { \
- switch (OUT) { \
- case 1: \
- HANDLE_IN_CASE(TYPE, 1, IN); \
- break; \
- case 2: \
- HANDLE_IN_CASE(TYPE, 2, IN); \
- break; \
- case 3: \
- HANDLE_IN_CASE(TYPE, 3, IN); \
- break; \
- default: \
- HANDLE_IN_CASE(TYPE, -1, IN); \
- break; \
- } \
- } \
+#define HANDLE_OUT_CASE(T, TYPE, OUT, IN) \
+ { \
+ if (outInfo.isContiguous()) { \
+ HANDLE_IN_CASE(T, TYPE, -2, IN); \
+ } else { \
+ switch (OUT) { \
+ case 1: \
+ HANDLE_IN_CASE(T, TYPE, 1, IN); \
+ break; \
+ case 2: \
+ HANDLE_IN_CASE(T, TYPE, 2, IN); \
+ break; \
+ case 3: \
+ HANDLE_IN_CASE(T, TYPE, 3, IN); \
+ break; \
+ default: \
+ HANDLE_IN_CASE(T, TYPE, -1, IN); \
+ break; \
+ } \
+ } \
}
- if (THC_canUse32BitIndexMath(state, out) &&
- THC_canUse32BitIndexMath(state, in)) {
- TensorInfo<unsigned int> outInfo(state, out);
+ if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, out) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, in)) {
+ TensorInfo<float, unsigned int> outInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, out);
outInfo.collapseDims();
- TensorInfo<unsigned int> inInfo(state, in, dim);
+ TensorInfo<float, unsigned int> inInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, in);
+ inInfo.reduceDim(dim);
inInfo.collapseDims();
- HANDLE_OUT_CASE(unsigned int, outInfo.dims, inInfo.dims);
+ HANDLE_OUT_CASE(typename TensorUtils<THCudaTensor>::DataType,
+ unsigned int, outInfo.dims, inInfo.dims);
} else {
- TensorInfo<unsigned long> outInfo(state, out);
+ TensorInfo<float, unsigned long> outInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, out);
outInfo.collapseDims();
- TensorInfo<unsigned long> inInfo(state, in, dim);
+ TensorInfo<float, unsigned long> inInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, in);
+ inInfo.reduceDim(dim);
inInfo.collapseDims();
// For large tensors, we only compile the completely contiguous
// version and the completely generic version, to reduce
// compilation time.
if (outInfo.isContiguous() && inInfo.isContiguous()) {
- HANDLE_CASE(unsigned long, -2, -2);
+ HANDLE_CASE(typename TensorUtils<THCudaTensor>::DataType,
+ unsigned long, -2, -2);
} else {
- HANDLE_CASE(unsigned long, -1, -1);
+ HANDLE_CASE(typename TensorUtils<THCudaTensor>::DataType,
+ unsigned long, -1, -1);
}
}
#undef HANDLE_CASE
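
TensorInfo no longer takes a reduction dimension in its constructor; getTensorInfo builds the descriptor and reduceDim(dim) then sets that dimension's size to 1 before collapseDims. To make the noncontiguous-dim kernel above concrete, a worked sketch with assumed shapes:

    // Assumed example: reduce dim 0 of a contiguous 4x5 float tensor.
    //   totalSlices     = 5               (one output element per column)
    //   reductionSize   = 4               (rows folded together)
    //   reductionStride = in.strides[0] = 5
    // Each thread owns one slice and performs, in effect:
    float r = init;
    for (unsigned int i = 0; i < reductionSize; ++i) {
      r = reduceOp(r, modifyOp(in.data[inBaseOffset + i * reductionStride]));
    }
    out.data[outOffset] = r;
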
diff --git a/lib/THC/THCReduceAll.cuh b/lib/THC/THCReduceAll.cuh
index cfe40fd..3fe81a9 100644
--- a/lib/THC/THCReduceAll.cuh
+++ b/lib/THC/THCReduceAll.cuh
@@ -18,24 +18,28 @@
#define THC_TWO_PASS_REDUCTION_SIZE 2048L
// Kernel that handles an entire reduction of a tensor in one pass
-template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims>
+template <typename ModifyOp,
+ typename ReduceOp,
+ typename T,
+ typename IndexType,
+ int ADims>
__global__ void
-THCudaTensor_reduceAll(TensorInfo<IndexType> in,
+THCudaTensor_reduceAll(TensorInfo<T, IndexType> in,
IndexType totalElements,
- float init,
+ T init,
ModifyOp modifyOp,
ReduceOp reduceOp,
- float* out) {
+ T* out) {
// With a block-wide stride, have each thread perform its own reduction.
- float r = init;
+ T r = init;
for (IndexType i = threadIdx.x; i < totalElements; i += blockDim.x) {
- const IndexType inOffset = IndexToOffset<IndexType, ADims>::get(i, in);
+ const IndexType inOffset = IndexToOffset<T, IndexType, ADims>::get(i, in);
r = reduceOp(r, modifyOp(in.data[inOffset]));
}
// Reduce within the block
- extern __shared__ float smem[];
- r = reduceBlock<float, ReduceOp>(smem, blockDim.x, r, reduceOp, init);
+ extern __shared__ T smem[];
+ r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init);
if (threadIdx.x == 0) {
// Write out reduced value
@@ -56,27 +60,31 @@ __device__ __forceinline__ IndexType getEndIndex(IndexType totalSize) {
}
// Kernel that handles an entire reduction of a tensor in two passes
-template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims>
+template <typename ModifyOp,
+ typename ReduceOp,
+ typename T,
+ typename IndexType,
+ int ADims>
__global__ void
-THCudaTensor_reduceAllPass1(TensorInfo<IndexType> in,
+THCudaTensor_reduceAllPass1(TensorInfo<T, IndexType> in,
IndexType totalElements,
- float init,
+ T init,
ModifyOp modifyOp,
ReduceOp reduceOp,
- float* scratchSpace) {
+ T* scratchSpace) {
const IndexType startIndex = getStartIndex<IndexType>(totalElements);
const IndexType endIndex = getEndIndex<IndexType>(totalElements);
// With a block-wide stride, have each thread perform its own reduction.
- float r = init;
+ T r = init;
for (IndexType i = startIndex + threadIdx.x; i < endIndex; i += blockDim.x) {
- const IndexType inOffset = IndexToOffset<IndexType, ADims>::get(i, in);
+ const IndexType inOffset = IndexToOffset<T, IndexType, ADims>::get(i, in);
r = reduceOp(r, modifyOp(in.data[inOffset]));
}
// Reduce within the block
- extern __shared__ float smem[];
- r = reduceBlock<float, ReduceOp>(smem, blockDim.x, r, reduceOp, init);
+ extern __shared__ T smem[];
+ r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init);
if (threadIdx.x == 0) {
// Write out block-wide reduced value
@@ -84,21 +92,21 @@ THCudaTensor_reduceAllPass1(TensorInfo<IndexType> in,
}
}
-template <typename ReduceOp, typename IndexType>
+template <typename ReduceOp, typename T, typename IndexType>
__global__ void
THCudaTensor_reduceAllPass2(int numPass1Blocks,
- float init,
+ T init,
ReduceOp reduceOp,
- float* scratchSpace,
- float* out) {
- float r = init;
+ T* scratchSpace,
+ T* out) {
+ T r = init;
if (threadIdx.x < numPass1Blocks) {
r = scratchSpace[threadIdx.x];
}
// Reduce within the block
- extern __shared__ float smem[];
- r = reduceBlock<float, ReduceOp>(smem, numPass1Blocks, r, reduceOp, init);
+ extern __shared__ T smem[];
+ r = reduceBlock<T, ReduceOp>(smem, numPass1Blocks, r, reduceOp, init);
if (threadIdx.x == 0) {
*out = r;
@@ -111,12 +119,13 @@ inline bool isTwoPassReductionSize(long elements) {
return (elements > THC_TWO_PASS_REDUCTION_SIZE);
}
+template <typename T>
inline long getTwoPassBlocks(THCState* state, long elements) {
long numBlocks = THCCeilDiv(elements, THC_REDUCE_ALL_BLOCK_SIZE);
// We can only have as many blocks as there is scratch space
long scratchSpace =
- THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(float);
+ THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(T);
THAssert(scratchSpace > 0);
if (numBlocks > scratchSpace) {
@@ -127,60 +136,67 @@ inline long getTwoPassBlocks(THCState* state, long elements) {
}
// Get the block/grid size that we want
+template <typename T>
inline void getPass1ReduceBlockGrid(THCState* state, long elements,
dim3& grid, dim3& block) {
- grid = dim3(getTwoPassBlocks(state, elements));
+ grid = dim3(getTwoPassBlocks<T>(state, elements));
block = dim3(THC_REDUCE_ALL_BLOCK_SIZE);
}
+template <typename T>
inline void getPass2ReduceBlockGrid(THCState* state, long elements,
dim3& grid, dim3& block) {
grid = dim3(1);
// We only need as many threads as there were blocks originally
- block = dim3(getTwoPassBlocks(state, elements));
+ block = dim3(getTwoPassBlocks<T>(state, elements));
}
+template <typename T>
inline void getSinglePassReduceBlockGrid(long elements,
dim3& grid, dim3& block) {
grid = dim3(1);
block = dim3(THC_REDUCE_ALL_BLOCK_SIZE);
}
-template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims>
+template <typename ModifyOp,
+ typename ReduceOp,
+ typename T,
+ typename IndexType,
+ int ADims>
void callReduceAll(THCState* state,
- const TensorInfo<IndexType>& in,
+ const TensorInfo<T, IndexType>& in,
long totalElements,
- float init,
+ T init,
const ModifyOp& modifyOp,
const ReduceOp& reduceOp,
- float* devOut) {
+ T* devOut) {
dim3 grid;
dim3 block;
if (isTwoPassReductionSize(totalElements)) {
- getPass1ReduceBlockGrid(state, totalElements, grid, block);
- size_t smemSize = block.x * sizeof(float);
+ getPass1ReduceBlockGrid<T>(state, totalElements, grid, block);
+ size_t smemSize = block.x * sizeof(T);
- THCudaTensor_reduceAllPass1<ModifyOp, ReduceOp, IndexType, ADims>
+ THCudaTensor_reduceAllPass1<ModifyOp, ReduceOp, T, IndexType, ADims>
<<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(
in, (IndexType) totalElements, init, modifyOp, reduceOp,
- (float*) THCState_getCurrentDeviceScratchSpace(state));
+ (T*) THCState_getCurrentDeviceScratchSpace(state));
int numPass1Blocks = grid.x;
- getPass2ReduceBlockGrid(state, totalElements, grid, block);
- smemSize = block.x * sizeof(float);
+ getPass2ReduceBlockGrid<T>(state, totalElements, grid, block);
+ smemSize = block.x * sizeof(T);
- THCudaTensor_reduceAllPass2<ReduceOp, IndexType>
+ THCudaTensor_reduceAllPass2<ReduceOp, T, IndexType>
<<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(
numPass1Blocks, init, reduceOp,
- (float*) THCState_getCurrentDeviceScratchSpace(state),
+ (T*) THCState_getCurrentDeviceScratchSpace(state),
devOut);
} else {
- getSinglePassReduceBlockGrid(totalElements, grid, block);
- size_t smemSize = block.x * sizeof(float);
+ getSinglePassReduceBlockGrid<T>(totalElements, grid, block);
+ size_t smemSize = block.x * sizeof(T);
- THCudaTensor_reduceAll<ModifyOp, ReduceOp, IndexType, ADims>
+ THCudaTensor_reduceAll<ModifyOp, ReduceOp, T, IndexType, ADims>
<<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(
in, (IndexType) totalElements, init, modifyOp, reduceOp, devOut);
}
@@ -224,8 +240,10 @@ bool THCudaTensor_reduceAll(THCState* state,
// dimension, and the loop to translate the linear index to the array
// index can be similarly collapsed. That is what this unrolling is for.
#define HANDLE_CASE(TYPE, IN) \
- callReduceAll<ModifyOp, ReduceOp, TYPE, IN>( \
- state, inInfo, inElements, init, modifyOp, reduceOp, devOut);
+ callReduceAll<ModifyOp, ReduceOp, \
+ typename TensorUtils<THCudaTensor>::DataType, \
+ TYPE, IN>( \
+ state, inInfo, inElements, init, modifyOp, reduceOp, devOut);
#define HANDLE_IN_CASE(TYPE, IN) \
{ \
@@ -249,13 +267,15 @@ bool THCudaTensor_reduceAll(THCState* state,
} \
}
- if (THC_canUse32BitIndexMath(state, in)) {
- TensorInfo<unsigned int> inInfo(state, in);
+ if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, in)) {
+ TensorInfo<float, unsigned int> inInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, in);
inInfo.collapseDims();
HANDLE_IN_CASE(unsigned int, inInfo.dims);
} else {
- TensorInfo<unsigned long long> inInfo(state, in);
+ TensorInfo<float, unsigned long long> inInfo =
+ getTensorInfo<THCudaTensor, unsigned long long>(state, in);
inInfo.collapseDims();
// For large tensors, we only compile the completely contiguous
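
callReduceAll and its block/grid helpers are now templated on the element type T, so scratch-space capacity and shared-memory sizes are computed from sizeof(T) rather than hard-coded float. A standalone sketch of the sizing rule getTwoPassBlocks<T> applies (the function name and the float instantiation are assumptions):

    // How many pass-1 blocks fit: each block covers blockSize elements, and one
    // partial of type T per block must fit in the per-device scratch space.
    long twoPassBlocksSketch(long elements, long scratchBytes, long blockSize) {
      long numBlocks   = (elements + blockSize - 1) / blockSize;  // THCCeilDiv
      long maxPartials = scratchBytes / (long) sizeof(float);     // T = float here
      return numBlocks > maxPartials ? maxPartials : numBlocks;   // clamp to scratch
    }
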
diff --git a/lib/THC/THCReduceApplyUtils.cu b/lib/THC/THCReduceApplyUtils.cu
index 10d5d89..f92bb99 100644
--- a/lib/THC/THCReduceApplyUtils.cu
+++ b/lib/THC/THCReduceApplyUtils.cu
@@ -11,29 +11,6 @@ void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg) {
THArgCheck(dims <= MAX_CUTORCH_DIMS, arg, CUTORCH_DIM_WARNING);
}
-bool THC_canUse32BitIndexMath(THCState* state, THCudaTensor* t) {
- long elements = THCudaTensor_nElement(state, t);
- if (elements >= UINT_MAX) {
- return false;
- }
-
- long offset = 0;
- long linearId = elements - 1;
-
- for (int i = THCudaTensor_nDimension(state, t) - 1; i >= 0; --i) {
- long curDimIndex = linearId % THCudaTensor_size(state, t, i);
- long curDimOffset = curDimIndex * THCudaTensor_stride(state, t, i);
- offset += curDimOffset;
- linearId /= THCudaTensor_size(state, t, i);
- }
-
- if (offset >= UINT_MAX) {
- return false;
- }
-
- return true;
-}
-
bool THC_getGridFromTiles(long gridTiles, dim3& grid) {
if (gridTiles > MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE) {
return false;
@@ -56,73 +33,3 @@ bool THC_getGridFromTiles(long gridTiles, dim3& grid) {
grid = dim3(gridX, gridY, gridZ);
return true;
}
-
-namespace {
-
-struct SizeAndStride {
- long size;
- long stride;
-};
-
-int compareSizeAndStride(const void* a, const void* b) {
- const SizeAndStride* aS = (const SizeAndStride*) a;
- const SizeAndStride* bS = (const SizeAndStride*) b;
-
- return aS->stride < bS->stride;
-}
-
-}
-
-bool THC_overlappingIndices(THCState* state, THCudaTensor* t) {
- // In this function, we don't care about permutations of the
- // size/stride arrays (transpositions).
- // We order the size/stride arrays by stride, skipping dimensions of
- // size 1. Strides of dimensions of size 1 don't matter, since there
- // is only one addressing point in them.
- // In this reordered view, the tensor is contiguous if
- // stride[dim] == size[dim + 1] * stride[dim + 1] for all `dim`.
- // The tensor has holes if
- // stride[dim] > size[dim + 1] * stride[dim + 1] for one or more
- // `dim`.
- // The tensor has overlaps if
- // stride[dim] < size[dim + 1] * stride[dim + 1] for one or more
- // `dim`, or the innermost stride is 0.
-
- // Extract size/stride arrays; only consider size >1 dims.
- SizeAndStride info[MAX_CUTORCH_DIMS];
-
- int dims = THCudaTensor_nDimension(state, t);
- int nonSize1Dims = 0;
- for (int i = 0; i < dims; ++i) {
- long size = THCudaTensor_size(state, t, i);
- if (size > 1) {
- info[nonSize1Dims].size = size;
- info[nonSize1Dims].stride = THCudaTensor_stride(state, t, i);
- ++nonSize1Dims;
- }
- }
-
- if (nonSize1Dims == 0) {
- // no overlap
- return false;
- }
-
- // Ascending order (innermost dimension in sorted view is at [0])
- qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride);
-
- // Base case: innermost dimension must have stride >= 1
- if (info[nonSize1Dims - 1].stride < 1) {
- return true;
- }
-
- // Subsequent dimensions, if any
- for (int i = nonSize1Dims - 2; i >= 0; --i) {
- if (info[i].stride < info[i + 1].size * info[i + 1].stride) {
- // There are overlaps
- return true;
- }
- }
-
- // Tensor has holes or is contiguous
- return false;
-}
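
Both helpers deleted here move behind TensorUtils<TensorType>::canUse32BitIndexMath and ::overlappingIndices in THCTensorTypeUtils. The overlap rule itself is unchanged: in the stride-sorted view, a tensor aliases itself when some outer stride is smaller than the inner dimension's span, or the innermost stride is 0. A standalone two-dimensional sketch of that rule (not the library routine):

    // Two-dim special case of the rule described in the deleted comments above.
    static bool overlaps2dSketch(long innerSize, long innerStride, long outerStride) {
      return innerStride == 0 || outerStride < innerSize * innerStride;
    }
    // Example: sizes {2, 3}, strides {2, 1} -> overlaps2dSketch(3, 1, 2) is true,
    // because elements (0,2) and (1,0) both land on storage offset 2.
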
diff --git a/lib/THC/THCReduceApplyUtils.cuh b/lib/THC/THCReduceApplyUtils.cuh
index d5ac2b5..06f969f 100644
--- a/lib/THC/THCReduceApplyUtils.cuh
+++ b/lib/THC/THCReduceApplyUtils.cuh
@@ -6,284 +6,12 @@
#include "THCGeneral.h"
#include "THCTensor.h"
#include "THCDeviceUtils.cuh"
-
-// Maximum number of dimensions allowed for cutorch
-#define MAX_CUTORCH_DIMS 25
-
-// Warning string for tensor arguments that are too large or have too
-// many dimensions
-#define CUTORCH_STR(X) #X
-#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \
- CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions"
+#include "THCTensorInfo.cuh"
// Enum that indicates whether tensor arguments are read/write or
// read-only
enum TensorArgType { ReadWrite, ReadOnly };
-// Copy operator for the pointwise apply kernel
-template <typename T>
-struct CopyOp {
- __device__ __forceinline__ void operator()(T* dst, T* src) {
-#if __CUDA_ARCH__ >= 350
- *dst = __ldg(src);
-#else
- *dst = *src;
-#endif
- }
-};
-
-// CUDA kernel argument that defines tensor layout
-template <typename IndexType>
-struct TensorInfo {
- // Extracts size/stride information for the kernel.
- // The optional `reduceDim` indicates a reduction dimension for the
- // given tensor, so that the resulting size for this dimension will be 1.
- TensorInfo(THCState* state, THCudaTensor* t, int reduceDim = -1);
-
- // Collapses all runs of successive dimensions if the size/strides
- // match up within the run and there are no holes between the
- // dimensions.
- // If excludeDim is set (not -1), then excludeDim will not be
- // collapsed with any other dimension.
- // Function returns the new dimension index that excludeDim maps to,
- // since the collapsed dimensions are <= the input dimensions.
- int collapseDims(int excludeDim = -1);
-
- // Contiguous tensors of more than one dimension are collapsed down
- // to one tensor
- __host__ __device__ inline bool isContiguous() const {
- return (dims == 1 && strides[0] == 1);
- }
-
- float* data;
- IndexType sizes[MAX_CUTORCH_DIMS];
- IndexType strides[MAX_CUTORCH_DIMS];
- int dims;
-};
-
-template <typename IndexType>
-TensorInfo<IndexType>::TensorInfo(THCState* state,
- THCudaTensor* t,
- int reduceDim) {
- data = THCudaTensor_data(state, t);
- dims = THCudaTensor_nDimension(state, t);
- assert(dims <= MAX_CUTORCH_DIMS);
-
- for (int i = 0; i < dims; ++i) {
- sizes[i] = THCudaTensor_size(state, t, i);
- strides[i] = THCudaTensor_stride(state, t, i);
- }
-
- assert(reduceDim == -1 || (reduceDim < dims && reduceDim >= 0));
-
- if (reduceDim != -1) {
- sizes[reduceDim] = 1;
- }
-}
-
-template <typename IndexType>
-int
-TensorInfo<IndexType>::collapseDims(int excludeDim) {
- // Find the innermost dimension not of size 1, since dimensions of size 1 are
- // collapsible.
- int firstNonOneDim = -1;
-
- for (int i = dims - 1; i >= 0; --i) {
- if (i == excludeDim) {
- // We cannot collapse this dimension, even if it is size 1
- firstNonOneDim = i;
- break;
- }
-
- if (sizes[i] != 1) {
- firstNonOneDim = i;
- break;
- }
- }
-
- // Special case: if all dimensions are of size 1, then this is a
- // single-point tensor that we still have to operate on. Reduce to a
- // single point.
- if (firstNonOneDim == -1) {
- assert(excludeDim == -1);
-
- dims = 1;
- sizes[0] = 1;
- strides[0] = 1;
-
- // Everything effectively got collapsed into this dimension
- return 0;
- }
-
- // Count the number of successive dimensions that can be collapsed, from
- // innermost to outermost.
- int numCollapsed = 0;
-
- // Skip the leading size 1 dims
- numCollapsed += dims - 1 - firstNonOneDim;
-
- // We perform one pass through to determine how many dimensions we
- // can collapse, before calculating the actual size of the collapsed
- // dimensions.
- // size/strideInner are the size/strides of the previous inner
- // non-collapsible dim we encounter.
- long sizeInner = sizes[firstNonOneDim];
- long strideInner = strides[firstNonOneDim];
-
- for (int i = firstNonOneDim - 1; i >= 0; --i) {
- long sizeOuter = sizes[i];
- long strideOuter = strides[i];
-
- // Don't collapse this dimension if we want to exclude it from
- // collapsing.
- // Since this code is attempting to collapse a subsequent
- // dimension (i) with the preceding dimension (i + 1), we can only
- // perform collapsing if the preceding dimension can be collapsed
- // (i.e., not excludeDim)
- if ((excludeDim != i) && (excludeDim != i + 1)) {
- // The next outermost dimension can be skipped if size 1
- if (sizeOuter == 1) {
- ++numCollapsed;
- continue;
- }
-
- // If the next outermost dimension is contiguous with the
- // previous non-collapsed one, collapse it
- if (strideOuter == strideInner * sizeInner) {
- ++numCollapsed;
-
- // This is the run of collapsed dimensions' size
- sizeInner = sizeInner * sizeOuter;
- continue;
- }
- }
-
- // Otherwise, this new outer dimension at `i` cannot be collapsed
- // because it is excluded from collapsing, or it is not contiguous
- // with the previous inner dimension.
- sizeInner = sizeOuter;
- strideInner = strideOuter;
- }
-
- // This will be our new size/stride and dimension.
- IndexType newSizes[MAX_CUTORCH_DIMS];
- IndexType newStrides[MAX_CUTORCH_DIMS];
-
- assert(numCollapsed < dims);
- int newDims = dims - numCollapsed;
-
- // We return the index of the excluded dimension that is excluded
- // from being collapsed here.
- int returnDim = -1;
-
- // We perform a second pass through the dimensions to actually
- // calculate the size of the collapsed dimensions.
- int collapsedIndex = dims - numCollapsed - 1;
- newSizes[collapsedIndex] = sizes[firstNonOneDim];
- newStrides[collapsedIndex] = strides[firstNonOneDim];
-
- if (firstNonOneDim == excludeDim) {
- returnDim = collapsedIndex;
- }
-
- for (int i = firstNonOneDim - 1; i >= 0; --i) {
- IndexType sizeOuter = sizes[i];
- IndexType strideOuter = strides[i];
-
- if ((excludeDim != i) && (excludeDim != i + 1)) {
- if (sizeOuter == 1) {
- // skip
- continue;
- }
-
- if (strideOuter == newSizes[collapsedIndex] * newStrides[collapsedIndex]) {
- // collapse
- newSizes[collapsedIndex] *= sizeOuter;
- continue;
- }
- }
-
- // Otherwise, strides don't match, or dim `i` is excluded from
- // collapsing.
- --collapsedIndex;
- assert(collapsedIndex >= 0);
- assert(collapsedIndex < newDims);
- newSizes[collapsedIndex] = sizeOuter;
- newStrides[collapsedIndex] = strideOuter;
-
- if (excludeDim == i) {
- returnDim = collapsedIndex;
- }
- }
-
- // We must have filled all the dimensions we're looking for
- assert(collapsedIndex == 0);
- assert((excludeDim == -1) || (returnDim != -1));
-
- dims = newDims;
-
- for (int i = 0; i < dims; ++i) {
- sizes[i] = newSizes[i];
- strides[i] = newStrides[i];
- }
-
- // After collapsing, the original `excludeDim` may have been
- // renumbered to this new `returnDim`, since some dimensions could
- // have been collapsed.
- return returnDim;
-}
-
-// Translate a linear index for the apply to a float* offset;
-// specialized on `Dims` to reduce nvcc compilation time
-template <typename IndexType, int Dims>
-struct IndexToOffset {
- static __host__ __device__ IndexType get(
- IndexType linearId,
- const TensorInfo<IndexType>& info) {
- IndexType offset = 0;
-
- // Use static dims
- for (int i = Dims - 1; i >= 0; --i) {
- IndexType curDimIndex = linearId % info.sizes[i];
- IndexType curDimOffset = curDimIndex * info.strides[i];
- offset += curDimOffset;
-
- if (i > 0) {
- linearId /= info.sizes[i];
- }
- }
-
- return offset;
- }
-};
-
-template <typename IndexType>
-struct IndexToOffset<IndexType, -2> {
- static __forceinline__ __host__ __device__ IndexType
- get(IndexType linearId, const TensorInfo<IndexType>& info) {
- return linearId;
- }
-};
-
-template <typename IndexType>
-struct IndexToOffset<IndexType, -1> {
- static __forceinline__ __host__ __device__ IndexType
- get(IndexType linearId, const TensorInfo<IndexType>& info) {
- IndexType offset = 0;
-
- // Use dynamic dims
- for (int i = info.dims - 1; i >= 0; --i) {
- IndexType curDimIndex = linearId % info.sizes[i];
- IndexType curDimOffset = curDimIndex * info.strides[i];
- offset += curDimOffset;
-
- linearId /= info.sizes[i];
- }
-
- return offset;
- }
-};
-
template <typename IndexType>
__device__ __forceinline__ IndexType getLinearBlockId() {
return blockIdx.z * gridDim.y * gridDim.x +
@@ -347,16 +75,7 @@ __device__ T reduceBlock(T* smem,
// Make sure the given tensor doesn't have too many dimensions
void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg);
-// Returns true if all linear ID -> offset math can be performed using 32 bit
-// unsigned math, which is faster than 64 bit math
-THC_API bool THC_canUse32BitIndexMath(THCState* state, THCudaTensor* t);
-
// Produces a grid with at least one point per tile
THC_API bool THC_getGridFromTiles(long gridTiles, dim3& grid);
-// Determines if the given tensor has overlapping data points (i.e.,
-// is there more than one index into the tensor that references the
-// same piece of data)?
-THC_API bool THC_overlappingIndices(THCState* state, THCudaTensor* t);
-
#endif // THC_REDUCE_APPLY_UTILS_INC
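
The block removed above (and re-added in a type-templated form in THCTensorInfo.cuh later in this diff) turns a flat linear element index into a strided memory offset by peeling off one coordinate per dimension with mod/div. Below is a minimal host-only sketch of that arithmetic, using plain arrays instead of TensorInfo; the function and variable names are illustrative, not part of THC.

#include <cstdio>

// Host-only sketch of the linear-index -> strided-offset translation that
// IndexToOffset performs (dynamic-dims case). sizes/strides describe a view.
static long indexToOffset(long linearId, const long* sizes,
                          const long* strides, int dims) {
  long offset = 0;
  for (int i = dims - 1; i >= 0; --i) {
    long curDimIndex = linearId % sizes[i];  // coordinate along dim i
    offset += curDimIndex * strides[i];      // contribution of dim i
    linearId /= sizes[i];                    // move on to the next outer dim
  }
  return offset;
}

int main() {
  // sizes {2, 3} with strides {1, 2}: a transposed view of a contiguous 3x2
  // buffer, so offsets are not simply 0..5.
  long sizes[2]   = {2, 3};
  long strides[2] = {1, 2};
  for (long i = 0; i < 6; ++i) {
    printf("linear %ld -> offset %ld\n", i, indexToOffset(i, sizes, strides, 2));
  }
  return 0;  // prints offsets 0 2 4 1 3 5
}
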
diff --git a/lib/THC/THCSortUtils.cuh b/lib/THC/THCSortUtils.cuh
index 42a8018..baf4a8e 100644
--- a/lib/THC/THCSortUtils.cuh
+++ b/lib/THC/THCSortUtils.cuh
@@ -2,6 +2,7 @@
#define THC_SORT_UTILS_INC
#include "THCReduceApplyUtils.cuh"
+#include "THCTensorTypeUtils.cuh"
// Collection of kernel sort routines
template <typename T>
@@ -91,11 +92,11 @@ template <typename K, typename V,
int KeyDims, int ValueDims,
typename Comparator, typename IndexType, int Power2SortSize>
__global__ void
-bitonicSortKVInPlace(TensorInfo<IndexType> keys,
+bitonicSortKVInPlace(TensorInfo<K, IndexType> keys,
IndexType keySlices,
IndexType keySliceSize,
IndexType keySliceStride,
- TensorInfo<IndexType> values,
+ TensorInfo<V, IndexType> values,
IndexType valueSliceStride,
const Comparator& comp) {
// Find the slice of the tensor that we are sorting
@@ -111,9 +112,9 @@ bitonicSortKVInPlace(TensorInfo<IndexType> keys,
__shared__ bool sharedValid[Power2SortSize];
const IndexType keyStartOffset =
- IndexToOffset<IndexType, KeyDims>::get(linearIndex, keys);
+ IndexToOffset<K, IndexType, KeyDims>::get(linearIndex, keys);
const IndexType valueStartOffset =
- IndexToOffset<IndexType, ValueDims>::get(linearIndex, values);
+ IndexToOffset<V, IndexType, ValueDims>::get(linearIndex, values);
// If the sort size is 1, the data is already sorted
if (Power2SortSize == 1) {
diff --git a/lib/THC/THCStorageCopy.h b/lib/THC/THCStorageCopy.h
index ec8011d..837056f 100644
--- a/lib/THC/THCStorageCopy.h
+++ b/lib/THC/THCStorageCopy.h
@@ -3,6 +3,7 @@
#include "THCStorage.h"
#include "THCGeneral.h"
+#include "THCHalf.h"
#include "generic/THCStorageCopy.h"
#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorCopy.cu b/lib/THC/THCTensorCopy.cu
index f6545a6..ea38d2e 100644
--- a/lib/THC/THCTensorCopy.cu
+++ b/lib/THC/THCTensorCopy.cu
@@ -7,5 +7,226 @@ inline int curGPU() {
return curDev;
}
+// Copy operator for the pointwise apply kernel
+template <typename TypeDst, typename TypeSrc>
+struct CopyOp {
+ __device__ __forceinline__ void operator()(TypeDst* dst, TypeSrc* src) {
+#if __CUDA_ARCH__ >= 350
+ *dst = ScalarConvert<TypeSrc, TypeDst>::to(__ldg(src));
+#else
+ *dst = ScalarConvert<TypeSrc, TypeDst>::to(*src);
+#endif
+ }
+};
+
+// Copy for the same type to the same type
+template <typename TensorTypeDst, typename TensorTypeSrc>
+void
+THC_copyTensor(THCState* state, TensorTypeDst* dst, TensorTypeSrc* src) {
+ long totalElements = TensorUtils<TensorTypeDst>::getNumElements(state, dst);
+
+ THArgCheck(totalElements ==
+ TensorUtils<TensorTypeSrc>::getNumElements(state, src),
+ 2, "sizes do not match");
+
+ if (TensorUtils<TensorTypeDst>::getDims(state, dst) == 0) {
+ // Zero-dim tensor; copy nothing
+ return;
+ }
+
+ // We can memcpy the memory if:
+ // -both tensors are contiguous; or,
+ // -there is only one element to copy; or,
+ // -FIXME: if both tensors have matching size and stride arrays, and no
+ // holes within (in other words, there is some permutation that can be applied
+ // to the size/strides such that the resulting tensor is
+ // contiguous).
+ // -AND: both tensors have the same type.
+ bool sameType = isSameType<TensorTypeSrc, TensorTypeDst>();
+ bool srcContig = TensorUtils<TensorTypeSrc>::isContiguous(state, src);
+ bool dstContig = TensorUtils<TensorTypeDst>::isContiguous(state, dst);
+ bool memcpyEligible =
+ ((srcContig && dstContig) || (totalElements == 1)) && sameType;
+
+
+ int srcDev = TensorUtils<TensorTypeSrc>::getDevice(state, src);
+ int dstDev = TensorUtils<TensorTypeDst>::getDevice(state, dst);
+ int oldDev = curGPU();
+
+ // We always perform the copy on the source device, using the
+ // current stream on the source device.
+ // If the copy is on the default stream, then we fully synchronize
+ // both src and dst's default streams for completion of the
+ // copy. We have to explicitly do this for non-contig copies.
+ // This mimics the behavior of cross-device cudaMemcpyAsync on
+ // the default stream.
+ // If the copy is not on the default stream, then it is up to the
+ // user to add needed synchronization on the dst device, since the
+ // stream on the dst device that wishes to synchronize may not be
+ // the same index as the one on the src device.
+ int copyStreamIndex =
+ THCState_getCurrentStreamIndex(state);
+ cudaStream_t copyStream =
+ THCState_getDeviceStream(state, srcDev, copyStreamIndex);
+
+ if (srcDev != dstDev && copyStreamIndex == 0) {
+ // This is a cross-device copy on the default stream. We perform a
+ // two-way barrier between both devices' default streams before
+ // the copy. This ensures that any write-after-write and
+ // write-after-read dependencies on the destination side are
+ // handled, so that no one is operating on the dst memory when
+ // we perform the copy.
+ // src waits on dst barrier (src already waits on src)
+ cudaEvent_t dstReady;
+ THCudaCheck(cudaSetDevice(dstDev));
+ THCudaCheck(cudaEventCreateWithFlags(&dstReady, cudaEventDisableTiming));
+ THCudaCheck(cudaEventRecord(dstReady, NULL));
+
+ THCudaCheck(cudaSetDevice(srcDev));
+ THCudaCheck(cudaStreamWaitEvent(NULL, dstReady, 0));
+ THCudaCheck(cudaEventDestroy(dstReady));
+ } else if (srcDev != oldDev) {
+ THCudaCheck(cudaSetDevice(srcDev));
+ }
+
+ // We are now on srcDev
+ if (memcpyEligible) {
+ // Perform the copy
+ THCudaCheck(cudaMemcpyAsync(
+ TensorUtils<TensorTypeDst>::getData(state, dst),
+ TensorUtils<TensorTypeSrc>::getData(state, src),
+ totalElements *
+ sizeof(typename TensorUtils<TensorTypeDst>::DataType),
+ cudaMemcpyDeviceToDevice,
+ copyStream));
+ } else {
+ // Non-contiguous copy or a type-conversion copy
+
+ // We avoid creating temporary memory copies if possible.
+ // If both src and dst are on the same device, or if they are on
+ // different devices and p2p access is enabled, perform the copy
+ // by a pointwise copy kernel.
+ // Otherwise, we'll have to make contiguous (which will in fact
+ // invoke copy() again), and then perform the copy.
+ // FIXME: might want to consider only running the pointwise kernel
+ // if both src and dst innermost dimensions are contiguous. If
+ // they are not, then taking the hit of the memory allocation/free
+ // might be worth it to avoid non-coalesced reads or writes.
+
+ // A device always has access to itself, so this also handles the
+ // case srcDev == dstDev
+ if (THCState_getPeerToPeerAccess(state, srcDev, dstDev)) {
+ // Make sure we have the current stream set in THCState, since
+ // pointwise uses that
+ if (srcDev != oldDev) {
+ THCState_setStream(state, srcDev, copyStreamIndex);
+ }
+
+ bool succ =
+ THC_pointwiseApply2(
+ state, dst, src,
+ CopyOp<typename TensorUtils<TensorTypeDst>::DataType,
+ typename TensorUtils<TensorTypeSrc>::DataType>());
+
+ // Restore prior THCState stream
+ if (srcDev != oldDev) {
+ THCState_setStream(state, oldDev, copyStreamIndex);
+ }
+
+ THArgCheck(succ, 2, CUTORCH_DIM_WARNING);
+ } else {
+ // GPUs can't access each other directly, but the tensors
+ // involved are non-contiguous and/or are different types.
+
+ // Make sure the src is contiguous and in the same type as dst
+ THCudaCheck(cudaSetDevice(srcDev));
+ TensorTypeDst* srcContig = NULL;
+
+ if (sameType) {
+ srcContig =
+ (TensorTypeDst*) // this is actually the same type as src
+ TensorUtils<TensorTypeSrc>::newContiguous(state, src);
+
+ } else {
+ // Types are different
+ // Copy into the new format, contiguous, on the source device
+ srcContig = TensorUtils<TensorTypeDst>::newTensor(state);
+ TensorUtils<TensorTypeDst>::resizeAs(state, srcContig, dst);
+
+ if (srcDev != oldDev) {
+ THCState_setStream(state, srcDev, copyStreamIndex);
+ }
+
+ bool succ =
+ THC_pointwiseApply2(
+ state, srcContig, src,
+ CopyOp<typename TensorUtils<TensorTypeDst>::DataType,
+ typename TensorUtils<TensorTypeSrc>::DataType>());
+
+ // Restore prior THCState stream
+ if (srcDev != oldDev) {
+ THCState_setStream(state, oldDev, copyStreamIndex);
+ }
+
+ THArgCheck(succ, 2, CUTORCH_DIM_WARNING);
+ }
+
+ // Make sure the dst is contiguous
+ THCudaCheck(cudaSetDevice(dstDev));
+ TensorTypeDst* dstContig =
+ TensorUtils<TensorTypeDst>::newContiguous(state, dst);
+
+ // Now, we are ready for a cross-device memcpy of contiguous
+ // data, of the same layout and type
+ THCudaCheck(cudaSetDevice(srcDev));
+
+ THCudaCheck(cudaMemcpyAsync(
+ TensorUtils<TensorTypeDst>::getData(state, dstContig),
+ TensorUtils<TensorTypeDst>::getData(state, srcContig),
+ totalElements *
+ sizeof(typename TensorUtils<TensorTypeDst>::DataType),
+ cudaMemcpyDeviceToDevice,
+ copyStream));
+
+ // We are done with the src
+ TensorUtils<TensorTypeDst>::free(state, srcContig);
+
+ if (dst != dstContig) {
+ TensorUtils<TensorTypeDst>::freeCopyTo(state, dstContig, dst);
+ } else {
+ TensorUtils<TensorTypeDst>::free(state, dstContig);
+ }
+
+ // We're still on srcDev at this point
+ }
+ }
+
+ if (srcDev != dstDev && copyStreamIndex == 0) {
+ // dst waits on src barrier (dst already waits on dst). We cannot
+ // operate on dst's copy until the copy is complete.
+
+ // Still on srcDev, record default stream event
+ cudaEvent_t srcReady;
+ THCudaCheck(cudaEventCreateWithFlags(&srcReady, cudaEventDisableTiming));
+ THCudaCheck(cudaEventRecord(srcReady, NULL));
+
+ THCudaCheck(cudaSetDevice(dstDev));
+ THCudaCheck(cudaStreamWaitEvent(NULL, srcReady, 0));
+ THCudaCheck(cudaEventDestroy(srcReady));
+
+ // We are now on dstDev (right above). Restore prior device from dst
+ if (dstDev != oldDev) {
+ THCudaCheck(cudaSetDevice(oldDev));
+ }
+ } else {
+ // We are still on srcDev. Restore prior device from src
+ if (srcDev != oldDev) {
+ THCudaCheck(cudaSetDevice(oldDev));
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
#include "generic/THCTensorCopy.cu"
#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorCopy.h b/lib/THC/THCTensorCopy.h
index fc206cb..e8bc4f4 100644
--- a/lib/THC/THCTensorCopy.h
+++ b/lib/THC/THCTensorCopy.h
@@ -3,6 +3,7 @@
#include "THCTensor.h"
#include "THCGeneral.h"
+#include "THCHalf.h"
#include "generic/THCTensorCopy.h"
#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorIndex.cu b/lib/THC/THCTensorIndex.cu
index a23daa9..e0d721f 100644
--- a/lib/THC/THCTensorIndex.cu
+++ b/lib/THC/THCTensorIndex.cu
@@ -15,13 +15,13 @@
// indexCopyLargeIndex kernel is a better choice to increase
// parallelism.
template <typename IndexType, int DstDim, int SrcDim, int IdxDim>
-__global__ void indexCopySmallIndex(TensorInfo<IndexType> dst,
- TensorInfo<IndexType> src,
- TensorInfo<IndexType> indices,
- int dstCopyDim,
- int srcCopyDim,
- IndexType innerSize,
- long dstCopyDimSize) {
+__global__ void indexCopySmallIndex(TensorInfo<float, IndexType> dst,
+ TensorInfo<float, IndexType> src,
+ TensorInfo<float, IndexType> indices,
+ int dstCopyDim,
+ int srcCopyDim,
+ IndexType innerSize,
+ long dstCopyDimSize) {
// In order to avoid reloading the index that we are copying, load
// it once to handle all of the points that are being selected, so
// it can be reused as much as possible. This kernel is chosen when
@@ -30,7 +30,7 @@ __global__ void indexCopySmallIndex(TensorInfo<IndexType> dst,
for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) {
// Lua indices begin at 1
IndexType dstIndex =
- indices.data[IndexToOffset<IndexType, IdxDim>::get(srcIndex, indices)] - 1;
+ indices.data[IndexToOffset<float, IndexType, IdxDim>::get(srcIndex, indices)] - 1;
if (dstIndex < dstCopyDimSize) {
// We stride over the output ignoring the indexed dimension
@@ -39,12 +39,12 @@ __global__ void indexCopySmallIndex(TensorInfo<IndexType> dst,
linearIndex < innerSize;
linearIndex += gridDim.x * blockDim.x) {
IndexType dstOffset =
- IndexToOffset<IndexType, DstDim>::get(linearIndex, dst);
+ IndexToOffset<float, IndexType, DstDim>::get(linearIndex, dst);
dstOffset += dstIndex * dst.strides[dstCopyDim];
IndexType srcOffset =
- IndexToOffset<IndexType, SrcDim>::get(linearIndex, src);
+ IndexToOffset<float, IndexType, SrcDim>::get(linearIndex, src);
srcOffset += srcIndex * src.strides[srcCopyDim];
dst.data[dstOffset] = src.data[srcOffset];
@@ -60,13 +60,13 @@ __global__ void indexCopySmallIndex(TensorInfo<IndexType> dst,
// indexCopySmallIndex kernel is a better choice to reduce memory
// accesses.
template <typename IndexType, int DstDim, int SrcDim, int IdxDim>
-__global__ void indexCopyLargeIndex(TensorInfo<IndexType> dst,
- TensorInfo<IndexType> src,
- TensorInfo<IndexType> indices,
- int dstCopyDim,
- int srcCopyDim,
- IndexType innerSize,
- long dstCopyDimSize) {
+__global__ void indexCopyLargeIndex(TensorInfo<float, IndexType> dst,
+ TensorInfo<float, IndexType> src,
+ TensorInfo<float, IndexType> indices,
+ int dstCopyDim,
+ int srcCopyDim,
+ IndexType innerSize,
+ long dstCopyDimSize) {
// We stride over the output including the indexed dimension
// (totalSize), and calculate the destination index point based on that
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
@@ -77,15 +77,15 @@ __global__ void indexCopyLargeIndex(TensorInfo<IndexType> dst,
// Lua indices begin at 1
IndexType dstIndex =
- indices.data[IndexToOffset<IndexType, IdxDim>::get(srcIndex, indices)] - 1;
+ indices.data[IndexToOffset<float, IndexType, IdxDim>::get(srcIndex, indices)] - 1;
if (dstIndex < dstCopyDimSize) {
IndexType dstOffset =
- IndexToOffset<IndexType, DstDim>::get(elementInSlice, dst);
+ IndexToOffset<float, IndexType, DstDim>::get(elementInSlice, dst);
dstOffset += dstIndex * dst.strides[dstCopyDim];
IndexType srcOffset =
- IndexToOffset<IndexType, SrcDim>::get(elementInSlice, src);
+ IndexToOffset<float, IndexType, SrcDim>::get(elementInSlice, src);
srcOffset += srcIndex * src.strides[srcCopyDim];
dst.data[dstOffset] = src.data[srcOffset];
@@ -138,13 +138,13 @@ void THCudaTensor_indexCopy(THCState *state, THCudaTensor *dst, int dim, THCudaT
int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount;
#define SMALL_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \
- indexCopySmallIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \
+ indexCopySmallIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \
<<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \
dstInfo, srcInfo, indicesInfo, \
dstCopyDim, srcCopyDim, sliceSize, dstCopyDimSize);
#define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \
- indexCopyLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \
+ indexCopyLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \
<<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \
dstInfo, srcInfo, indicesInfo, \
dstCopyDim, srcCopyDim, sliceSize, dstCopyDimSize);
@@ -155,18 +155,21 @@ void THCudaTensor_indexCopy(THCState *state, THCudaTensor *dst, int dim, THCudaT
dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, 128L), (long)(mpc * 8)));
dim3 largeIndexBlock(std::min(srcTotalSize, 128L));
- if (THC_canUse32BitIndexMath(state, dst) &&
- THC_canUse32BitIndexMath(state, src) &&
- THC_canUse32BitIndexMath(state, indices)) {
- TensorInfo<unsigned int> dstInfo(state, dst);
+ if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, dst) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) {
+ TensorInfo<float, unsigned int> dstInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, dst);
int dstCopyDim = dstInfo.collapseDims(dim);
- dstInfo.sizes[dstCopyDim] = 1;
+ dstInfo.reduceDim(dstCopyDim);
- TensorInfo<unsigned int> srcInfo(state, src);
+ TensorInfo<float, unsigned int> srcInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, src);
int srcCopyDim = srcInfo.collapseDims(dim);
- srcInfo.sizes[srcCopyDim] = 1;
+ srcInfo.reduceDim(srcCopyDim);
- TensorInfo<unsigned int> indicesInfo(state, indices);
+ TensorInfo<float, unsigned int> indicesInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, indices);
indicesInfo.collapseDims();
// A reasonable choice for when to have each thread iterate over
@@ -193,15 +196,18 @@ void THCudaTensor_indexCopy(THCState *state, THCudaTensor *dst, int dim, THCudaT
}
}
} else {
- TensorInfo<unsigned long> dstInfo(state, dst);
+ TensorInfo<float, unsigned long> dstInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, dst);
int dstCopyDim = dstInfo.collapseDims(dim);
- dstInfo.sizes[dstCopyDim] = 1;
+ dstInfo.reduceDim(dstCopyDim);
- TensorInfo<unsigned long> srcInfo(state, src);
+ TensorInfo<float, unsigned long> srcInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, src);
int srcCopyDim = srcInfo.collapseDims(dim);
- srcInfo.sizes[srcCopyDim] = 1;
+ srcInfo.reduceDim(srcCopyDim);
- TensorInfo<unsigned long> indicesInfo(state, indices);
+ TensorInfo<float, unsigned long> indicesInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, indices);
indicesInfo.collapseDims();
LARGE_INDEX(unsigned long, -1, -1, -1);
@@ -218,13 +224,13 @@ void THCudaTensor_indexCopy(THCState *state, THCudaTensor *dst, int dim, THCudaT
// indexAddLargeIndex kernel is a better choice to increase
// parallelism.
template <typename IndexType, int DstDim, int SrcDim, int IdxDim>
-__global__ void indexAddSmallIndex(TensorInfo<IndexType> dst,
- TensorInfo<IndexType> src,
- TensorInfo<IndexType> indices,
- int dstAddDim,
- int srcAddDim,
- IndexType innerSize,
- long dstAddDimSize) {
+__global__ void indexAddSmallIndex(TensorInfo<float, IndexType> dst,
+ TensorInfo<float, IndexType> src,
+ TensorInfo<float, IndexType> indices,
+ int dstAddDim,
+ int srcAddDim,
+ IndexType innerSize,
+ long dstAddDimSize) {
// In order to avoid reloading the index that we are copying, load
// it once to handle all of the points that are being selected, so
// it can be reused as much as possible. This kernel is chosen when
@@ -233,7 +239,7 @@ __global__ void indexAddSmallIndex(TensorInfo<IndexType> dst,
for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) {
// Lua indices begin at 1
IndexType dstIndex =
- indices.data[IndexToOffset<IndexType, IdxDim>::get(srcIndex, indices)] - 1;
+ indices.data[IndexToOffset<float, IndexType, IdxDim>::get(srcIndex, indices)] - 1;
if (dstIndex < dstAddDimSize) {
// We stride over the output ignoring the indexed dimension
@@ -242,11 +248,11 @@ __global__ void indexAddSmallIndex(TensorInfo<IndexType> dst,
linearIndex < innerSize;
linearIndex += gridDim.x * blockDim.x) {
IndexType dstOffset =
- IndexToOffset<IndexType, DstDim>::get(linearIndex, dst);
+ IndexToOffset<float, IndexType, DstDim>::get(linearIndex, dst);
dstOffset += dstIndex * dst.strides[dstAddDim];
IndexType srcOffset =
- IndexToOffset<IndexType, SrcDim>::get(linearIndex, src);
+ IndexToOffset<float, IndexType, SrcDim>::get(linearIndex, src);
srcOffset += srcIndex * src.strides[srcAddDim];
atomicAdd(&dst.data[dstOffset], src.data[srcOffset]);
@@ -262,32 +268,32 @@ __global__ void indexAddSmallIndex(TensorInfo<IndexType> dst,
// indexAddSmallIndex kernel is a better choice to reduce memory
// accesses.
template <typename IndexType, int DstDim, int SrcDim, int IdxDim>
-__global__ void indexAddLargeIndex(TensorInfo<IndexType> dst,
- TensorInfo<IndexType> src,
- TensorInfo<IndexType> indices,
- int dstAddDim,
- int srcAddDim,
- IndexType innerSize,
- long dstAddDimSize) {
+__global__ void indexAddLargeIndex(TensorInfo<float, IndexType> dst,
+ TensorInfo<float, IndexType> src,
+ TensorInfo<float, IndexType> indices,
+ int dstAddDim,
+ int srcAddDim,
+ IndexType innerSize,
+ long dstAddDimSize) {
// We stride over the output including the indexed dimension
// (totalSize), and calculate the destination index point based on that
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
- linearIndex < innerSize * indices.sizes[0];
+ linearIndex < innerSize * indices.sizes[0];
linearIndex += gridDim.x * blockDim.x) {
IndexType srcIndex = linearIndex / innerSize;
IndexType elementInSlice = linearIndex % innerSize;
// Lua indices begin at 1
IndexType dstIndex =
- indices.data[IndexToOffset<IndexType, IdxDim>::get(srcIndex, indices)] - 1;
+ indices.data[IndexToOffset<float, IndexType, IdxDim>::get(srcIndex, indices)] - 1;
if (dstIndex < dstAddDimSize) {
IndexType dstOffset =
- IndexToOffset<IndexType, DstDim>::get(elementInSlice, dst);
+ IndexToOffset<float, IndexType, DstDim>::get(elementInSlice, dst);
dstOffset += dstIndex * dst.strides[dstAddDim];
IndexType srcOffset =
- IndexToOffset<IndexType, SrcDim>::get(elementInSlice, src);
+ IndexToOffset<float, IndexType, SrcDim>::get(elementInSlice, src);
srcOffset += srcIndex * src.strides[srcAddDim];
atomicAdd(&dst.data[dstOffset], src.data[srcOffset]);
@@ -339,16 +345,16 @@ void THCudaTensor_indexAdd(THCState *state, THCudaTensor *dst, int dim, THCudaTe
int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount;
-#define SMALL_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \
- indexAddSmallIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \
- <<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \
- dstInfo, srcInfo, indicesInfo, \
+#define SMALL_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \
+ indexAddSmallIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \
+ <<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \
+ dstInfo, srcInfo, indicesInfo, \
dstAddDim, srcAddDim, sliceSize, dstAddDimSize);
-#define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \
- indexAddLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \
- <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \
- dstInfo, srcInfo, indicesInfo, \
+#define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \
+ indexAddLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \
+ <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \
+ dstInfo, srcInfo, indicesInfo, \
dstAddDim, srcAddDim, sliceSize, dstAddDimSize);
dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8)));
@@ -357,18 +363,21 @@ void THCudaTensor_indexAdd(THCState *state, THCudaTensor *dst, int dim, THCudaTe
dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, 128L), (long)(mpc * 8)));
dim3 largeIndexBlock(std::min(srcTotalSize, 128L));
- if (THC_canUse32BitIndexMath(state, dst) &&
- THC_canUse32BitIndexMath(state, src) &&
- THC_canUse32BitIndexMath(state, indices)) {
- TensorInfo<unsigned int> dstInfo(state, dst);
+ if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, dst) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) {
+ TensorInfo<float, unsigned int> dstInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, dst);
int dstAddDim = dstInfo.collapseDims(dim);
- dstInfo.sizes[dstAddDim] = 1;
+ dstInfo.reduceDim(dstAddDim);
- TensorInfo<unsigned int> srcInfo(state, src);
+ TensorInfo<float, unsigned int> srcInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, src);
int srcAddDim = srcInfo.collapseDims(dim);
- srcInfo.sizes[srcAddDim] = 1;
+ srcInfo.reduceDim(srcAddDim);
- TensorInfo<unsigned int> indicesInfo(state, indices);
+ TensorInfo<float, unsigned int> indicesInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, indices);
indicesInfo.collapseDims();
// A reasonable choice for when to have each thread iterate over
@@ -395,15 +404,18 @@ void THCudaTensor_indexAdd(THCState *state, THCudaTensor *dst, int dim, THCudaTe
}
}
} else {
- TensorInfo<unsigned long> dstInfo(state, dst);
+ TensorInfo<float, unsigned long> dstInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, dst);
int dstAddDim = dstInfo.collapseDims(dim);
- dstInfo.sizes[dstAddDim] = 1;
+ dstInfo.reduceDim(dstAddDim);
- TensorInfo<unsigned long> srcInfo(state, src);
+ TensorInfo<float, unsigned long> srcInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, src);
int srcAddDim = srcInfo.collapseDims(dim);
- srcInfo.sizes[srcAddDim] = 1;
+ srcInfo.reduceDim(srcAddDim);
- TensorInfo<unsigned long> indicesInfo(state, indices);
+ TensorInfo<float, unsigned long> indicesInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, indices);
indicesInfo.collapseDims();
LARGE_INDEX(unsigned long, -1, -1, -1);
@@ -420,12 +432,12 @@ void THCudaTensor_indexAdd(THCState *state, THCudaTensor *dst, int dim, THCudaTe
// indexFillLargeIndex kernel is a better choice to increase
// parallelism.
template <typename IndexType, int DstDim, int IdxDim>
-__global__ void indexFillSmallIndex(TensorInfo<IndexType> dst,
- TensorInfo<IndexType> indices,
- int dstFillDim,
- IndexType innerSize,
- long dstFillDimSize,
- float val) {
+__global__ void indexFillSmallIndex(TensorInfo<float, IndexType> dst,
+ TensorInfo<float, IndexType> indices,
+ int dstFillDim,
+ IndexType innerSize,
+ long dstFillDimSize,
+ float val) {
// In order to avoid reloading the index that we are copying, load
// it once to handle all of the points that are being selected, so
// it can be reused as much as possible. This kernel is chosen when
@@ -434,7 +446,7 @@ __global__ void indexFillSmallIndex(TensorInfo<IndexType> dst,
for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) {
// Lua indices begin at 1
IndexType dstIndex_ =
- indices.data[IndexToOffset<IndexType, IdxDim>::get(dstIndex, indices)] - 1;
+ indices.data[IndexToOffset<float, IndexType, IdxDim>::get(dstIndex, indices)] - 1;
if (dstIndex < dstFillDimSize) {
// We stride over the output ignoring the indexed dimension
@@ -443,7 +455,7 @@ __global__ void indexFillSmallIndex(TensorInfo<IndexType> dst,
linearIndex < innerSize;
linearIndex += gridDim.x * blockDim.x) {
IndexType dstOffset =
- IndexToOffset<IndexType, DstDim>::get(linearIndex, dst);
+ IndexToOffset<float, IndexType, DstDim>::get(linearIndex, dst);
dstOffset += dstIndex_ * dst.strides[dstFillDim];
dst.data[dstOffset] = val;
@@ -459,27 +471,27 @@ __global__ void indexFillSmallIndex(TensorInfo<IndexType> dst,
// indexFillSmallIndex kernel is a better choice to reduce memory
// accesses.
template <typename IndexType, int DstDim, int IdxDim>
-__global__ void indexFillLargeIndex(TensorInfo<IndexType> dst,
- TensorInfo<IndexType> indices,
- int dstFillDim,
- IndexType innerSize,
- long dstFillDimSize,
- float val) {
+__global__ void indexFillLargeIndex(TensorInfo<float, IndexType> dst,
+ TensorInfo<float, IndexType> indices,
+ int dstFillDim,
+ IndexType innerSize,
+ long dstFillDimSize,
+ float val) {
// We stride over the output including the indexed dimension
// (totalSize), and calculate the destination index point based on that
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
- linearIndex < innerSize * indices.sizes[0];
+ linearIndex < innerSize * indices.sizes[0];
linearIndex += gridDim.x * blockDim.x) {
IndexType dstIndex = linearIndex / innerSize;
IndexType elementInSlice = linearIndex % innerSize;
// Lua indices begin at 1
IndexType dstIndex_ =
- indices.data[IndexToOffset<IndexType, IdxDim>::get(dstIndex, indices)] - 1;
+ indices.data[IndexToOffset<float, IndexType, IdxDim>::get(dstIndex, indices)] - 1;
if (dstIndex_ < dstFillDimSize) {
IndexType dstOffset =
- IndexToOffset<IndexType, DstDim>::get(elementInSlice, dst);
+ IndexToOffset<float, IndexType, DstDim>::get(elementInSlice, dst);
dstOffset += dstIndex_ * dst.strides[dstFillDim];
dst.data[dstOffset] = val;
@@ -528,16 +540,16 @@ void THCudaTensor_indexFill(THCState *state, THCudaTensor *dst, int dim, THCudaT
int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount;
-#define SMALL_INDEX(TYPE, DST_DIM, IDX_DIM) \
- indexFillSmallIndex<TYPE, DST_DIM, IDX_DIM> \
- <<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \
- dstInfo, indicesInfo, \
+#define SMALL_INDEX(TYPE, DST_DIM, IDX_DIM) \
+ indexFillSmallIndex<TYPE, DST_DIM, IDX_DIM> \
+ <<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \
+ dstInfo, indicesInfo, \
dstFillDim, sliceSize, dstFillDimSize, val);
-#define LARGE_INDEX(TYPE, DST_DIM, IDX_DIM) \
- indexFillLargeIndex<TYPE, DST_DIM, IDX_DIM> \
- <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \
- dstInfo, indicesInfo, \
+#define LARGE_INDEX(TYPE, DST_DIM, IDX_DIM) \
+ indexFillLargeIndex<TYPE, DST_DIM, IDX_DIM> \
+ <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \
+ dstInfo, indicesInfo, \
dstFillDim, sliceSize, dstFillDimSize, val);
dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8)));
@@ -546,13 +558,15 @@ void THCudaTensor_indexFill(THCState *state, THCudaTensor *dst, int dim, THCudaT
dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, 128L), (long)(mpc * 8)));
dim3 largeIndexBlock(std::min(dstTotalSize, 128L));
- if (THC_canUse32BitIndexMath(state, dst) &&
- THC_canUse32BitIndexMath(state, indices)) {
- TensorInfo<unsigned int> dstInfo(state, dst);
+ if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, dst) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) {
+ TensorInfo<float, unsigned int> dstInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, dst);
int dstFillDim = dstInfo.collapseDims(dim);
- dstInfo.sizes[dstFillDim] = 1;
+ dstInfo.reduceDim(dstFillDim);
- TensorInfo<unsigned int> indicesInfo(state, indices);
+ TensorInfo<float, unsigned int> indicesInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, indices);
indicesInfo.collapseDims();
// A reasonable choice for when to have each thread iterate over
@@ -579,11 +593,13 @@ void THCudaTensor_indexFill(THCState *state, THCudaTensor *dst, int dim, THCudaT
}
}
} else {
- TensorInfo<unsigned long> dstInfo(state, dst);
+ TensorInfo<float, unsigned long> dstInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, dst);
int dstFillDim = dstInfo.collapseDims(dim);
- dstInfo.sizes[dstFillDim] = 1;
+ dstInfo.reduceDim(dstFillDim);
- TensorInfo<unsigned long> indicesInfo(state, indices);
+ TensorInfo<float, unsigned long> indicesInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, indices);
indicesInfo.collapseDims();
LARGE_INDEX(unsigned long, -1, -1);
@@ -600,9 +616,9 @@ void THCudaTensor_indexFill(THCState *state, THCudaTensor *dst, int dim, THCudaT
// indexSelectLargeIndex kernel is a better choice to increase
// parallelism.
template <typename IndexType, int DstDim, int SrcDim, int IdxDim>
-__global__ void indexSelectSmallIndex(TensorInfo<IndexType> dst,
- TensorInfo<IndexType> src,
- TensorInfo<IndexType> indices,
+__global__ void indexSelectSmallIndex(TensorInfo<float, IndexType> dst,
+ TensorInfo<float, IndexType> src,
+ TensorInfo<float, IndexType> indices,
int dstSelectDim,
int srcSelectDim,
IndexType innerSize,
@@ -615,7 +631,7 @@ __global__ void indexSelectSmallIndex(TensorInfo<IndexType> dst,
for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) {
// Lua indices begin at 1
IndexType srcIndex =
- indices.data[IndexToOffset<IndexType, IdxDim>::get(dstIndex, indices)] - 1;
+ indices.data[IndexToOffset<float, IndexType, IdxDim>::get(dstIndex, indices)] - 1;
if (srcIndex < srcSelectDimSize) {
// We stride over the output ignoring the indexed dimension
@@ -624,11 +640,11 @@ __global__ void indexSelectSmallIndex(TensorInfo<IndexType> dst,
linearIndex < innerSize;
linearIndex += gridDim.x * blockDim.x) {
IndexType dstOffset =
- IndexToOffset<IndexType, DstDim>::get(linearIndex, dst);
+ IndexToOffset<float, IndexType, DstDim>::get(linearIndex, dst);
dstOffset += dstIndex * dst.strides[dstSelectDim];
IndexType srcOffset =
- IndexToOffset<IndexType, SrcDim>::get(linearIndex, src);
+ IndexToOffset<float, IndexType, SrcDim>::get(linearIndex, src);
srcOffset += srcIndex * src.strides[srcSelectDim];
dst.data[dstOffset] = src.data[srcOffset];
@@ -644,9 +660,9 @@ __global__ void indexSelectSmallIndex(TensorInfo<IndexType> dst,
// indexSelectSmallIndex kernel is a better choice to reduce memory
// accesses.
template <typename IndexType, int DstDim, int SrcDim, int IdxDim>
-__global__ void indexSelectLargeIndex(TensorInfo<IndexType> dst,
- TensorInfo<IndexType> src,
- TensorInfo<IndexType> indices,
+__global__ void indexSelectLargeIndex(TensorInfo<float, IndexType> dst,
+ TensorInfo<float, IndexType> src,
+ TensorInfo<float, IndexType> indices,
int dstSelectDim,
int srcSelectDim,
IndexType totalSize,
@@ -662,15 +678,15 @@ __global__ void indexSelectLargeIndex(TensorInfo<IndexType> dst,
// Lua indices begin at 1
IndexType srcIndex =
- indices.data[IndexToOffset<IndexType, IdxDim>::get(dstIndex, indices)] - 1;
+ indices.data[IndexToOffset<float, IndexType, IdxDim>::get(dstIndex, indices)] - 1;
if (srcIndex < srcSelectDimSize) {
IndexType dstOffset =
- IndexToOffset<IndexType, DstDim>::get(elementInSlice, dst);
+ IndexToOffset<float, IndexType, DstDim>::get(elementInSlice, dst);
dstOffset += dstIndex * dst.strides[dstSelectDim];
IndexType srcOffset =
- IndexToOffset<IndexType, SrcDim>::get(elementInSlice, src);
+ IndexToOffset<float, IndexType, SrcDim>::get(elementInSlice, src);
srcOffset += srcIndex * src.strides[srcSelectDim];
dst.data[dstOffset] = src.data[srcOffset];
@@ -734,10 +750,10 @@ void THCudaTensor_indexSelect(THCState *state, THCudaTensor *dst, THCudaTensor *
dstInfo, srcInfo, indicesInfo, \
dstSelectDim, srcSelectDim, sliceSize, srcSelectDimSize);
-#define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \
- indexSelectLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \
- <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \
- dstInfo, srcInfo, indicesInfo, \
+#define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \
+ indexSelectLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \
+ <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \
+ dstInfo, srcInfo, indicesInfo, \
dstSelectDim, srcSelectDim, dstTotalSize, sliceSize, srcSelectDimSize);
dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8)));
@@ -746,18 +762,21 @@ void THCudaTensor_indexSelect(THCState *state, THCudaTensor *dst, THCudaTensor *
dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, 128L), (long)(mpc * 8)));
dim3 largeIndexBlock(std::min(dstTotalSize, 128L));
- if (THC_canUse32BitIndexMath(state, dst) &&
- THC_canUse32BitIndexMath(state, src) &&
- THC_canUse32BitIndexMath(state, indices)) {
- TensorInfo<unsigned int> dstInfo(state, dst);
+ if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, dst) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) {
+ TensorInfo<float, unsigned int> dstInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, dst);
int dstSelectDim = dstInfo.collapseDims(dim);
- dstInfo.sizes[dstSelectDim] = 1;
+ dstInfo.reduceDim(dstSelectDim);
- TensorInfo<unsigned int> srcInfo(state, src);
+ TensorInfo<float, unsigned int> srcInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, src);
int srcSelectDim = srcInfo.collapseDims(dim);
- srcInfo.sizes[srcSelectDim] = 1;
+ srcInfo.reduceDim(srcSelectDim);
- TensorInfo<unsigned int> indicesInfo(state, indices);
+ TensorInfo<float, unsigned int> indicesInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, indices);
indicesInfo.collapseDims();
// A reasonable choice for when to have each thread iterate over
@@ -784,15 +803,18 @@ void THCudaTensor_indexSelect(THCState *state, THCudaTensor *dst, THCudaTensor *
}
}
} else {
- TensorInfo<unsigned long> dstInfo(state, dst);
+ TensorInfo<float, unsigned long> dstInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, dst);
int dstSelectDim = dstInfo.collapseDims(dim);
- dstInfo.sizes[dstSelectDim] = 1;
+ dstInfo.reduceDim(dstSelectDim);
- TensorInfo<unsigned long> srcInfo(state, src);
+ TensorInfo<float, unsigned long> srcInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, src);
int srcSelectDim = srcInfo.collapseDims(dim);
- srcInfo.sizes[srcSelectDim] = 1;
+ srcInfo.reduceDim(srcSelectDim);
- TensorInfo<unsigned long> indicesInfo(state, indices);
+ TensorInfo<float, unsigned long> indicesInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, indices);
indicesInfo.collapseDims();
LARGE_INDEX(unsigned long, -1, -1, -1);
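
Each index operation above ships two kernels: the small-index variant loads each index once and strides over the inner slice, while the large-index variant strides over sliceSize * numIndices for more parallelism when there are many indices. The host-side sketch below mirrors the launch-shape math visible in the diff (grid capped at 8 blocks per multiprocessor, 128-thread blocks); the 16-entry cut-off for "small" is an assumed, purely illustrative threshold.

#include <algorithm>
#include <cstdio>

static long ceilDiv(long a, long b) { return (a + b - 1) / b; }

struct LaunchShape { long grid; long block; };

// sliceSize: elements per selected slice; numIndices: entries in the index
// tensor; mpc: multiprocessor count of the device being launched on.
static LaunchShape chooseIndexLaunch(long sliceSize, long numIndices,
                                     int mpc, bool* useSmallIndex) {
  *useSmallIndex = (numIndices <= 16);  // assumed threshold, for illustration
  long work = *useSmallIndex ? sliceSize : sliceSize * numIndices;
  LaunchShape s;
  s.block = std::min(work, 128L);                            // up to 128 threads
  s.grid  = std::min(ceilDiv(work, 128L), (long)(mpc * 8));  // cap the grid
  return s;
}

int main() {
  bool small;
  LaunchShape s = chooseIndexLaunch(1 << 16, 4, 16, &small);
  printf("small=%d grid=%ld block=%ld\n", (int)small, s.grid, s.block);
  return 0;
}
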
diff --git a/lib/THC/THCTensorInfo.cuh b/lib/THC/THCTensorInfo.cuh
new file mode 100644
index 0000000..5347116
--- /dev/null
+++ b/lib/THC/THCTensorInfo.cuh
@@ -0,0 +1,279 @@
+#ifndef THC_TENSOR_INFO_INC
+#define THC_TENSOR_INFO_INC
+
+#include <cuda.h>
+#include <assert.h>
+#include "THCGeneral.h"
+#include "THCTensor.h"
+
+// Maximum number of dimensions allowed for cutorch
+#define MAX_CUTORCH_DIMS 25
+
+// Warning string for tensor arguments that are too large or have too
+// many dimensions
+#define CUTORCH_STR(X) #X
+#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \
+ CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions"
+
+// CUDA kernel argument that defines tensor layout
+template <typename T, typename IndexType>
+struct TensorInfo {
+ TensorInfo(T* p,
+ int dim,
+ IndexType sz[MAX_CUTORCH_DIMS],
+ IndexType st[MAX_CUTORCH_DIMS]);
+
+ // Set the size of the given dimension to 1, as if it were a
+ // reduction dim (allows you to calculate offsets of the reduction
+ // slice)
+ void reduceDim(int dim);
+
+ // Collapses all runs of successive dimensions if the size/strides
+ // match up within the run and there are no holes between the
+ // dimensions.
+ // If excludeDim is set (not -1), then excludeDim will not be
+ // collapsed with any other dimension.
+ // Function returns the new dimension index that excludeDim maps to,
+ // since the collapsed dimensions are <= the input dimensions.
+ int collapseDims(int excludeDim = -1);
+
+ // Contiguous tensors of more than one dimension are collapsed down
+ // to one dimension
+ __host__ __device__ inline bool isContiguous() const {
+ return (dims == 1 && strides[0] == 1);
+ }
+
+ T* data;
+ IndexType sizes[MAX_CUTORCH_DIMS];
+ IndexType strides[MAX_CUTORCH_DIMS];
+ int dims;
+};
+
+template <typename T, typename IndexType>
+TensorInfo<T, IndexType>::TensorInfo(T* p,
+ int dim,
+ IndexType sz[MAX_CUTORCH_DIMS],
+ IndexType st[MAX_CUTORCH_DIMS]) {
+ data = p;
+ dims = dim;
+ assert(dims > 0 && dims < MAX_CUTORCH_DIMS);
+
+ for (int i = 0; i < dim; ++i) {
+ sizes[i] = sz[i];
+ strides[i] = st[i];
+ }
+}
+
+template <typename T, typename IndexType>
+void
+TensorInfo<T, IndexType>::reduceDim(int dim) {
+ assert(dim < dims && dim >= 0);
+ sizes[dim] = 1;
+}
+
+template <typename T, typename IndexType>
+int
+TensorInfo<T, IndexType>::collapseDims(int excludeDim) {
+ // Find the innermost dimension not of size 1, since dimensions of size 1 are
+ // collapsible.
+ int firstNonOneDim = -1;
+
+ for (int i = dims - 1; i >= 0; --i) {
+ if (i == excludeDim) {
+ // We cannot collapse this dimension, even if it is size 1
+ firstNonOneDim = i;
+ break;
+ }
+
+ if (sizes[i] != 1) {
+ firstNonOneDim = i;
+ break;
+ }
+ }
+
+ // Special case: if all dimensions are of size 1, then this is a
+ // single-point tensor that we still have to operate on. Reduce to a
+ // single point.
+ if (firstNonOneDim == -1) {
+ assert(excludeDim == -1);
+
+ dims = 1;
+ sizes[0] = 1;
+ strides[0] = 1;
+
+ // Everything effectively got collapsed into this dimension
+ return 0;
+ }
+
+ // Count the number of successive dimensions that can be collapsed, from
+ // innermost to outermost.
+ int numCollapsed = 0;
+
+ // Skip the leading size 1 dims
+ numCollapsed += dims - 1 - firstNonOneDim;
+
+ // We perform one pass through to determine how many dimensions we
+ // can collapse, before calculating the actual size of the collapsed
+ // dimensions.
+ // size/strideInner are the size/strides of the previous inner
+ // non-collapsible dim we encounter.
+ long sizeInner = sizes[firstNonOneDim];
+ long strideInner = strides[firstNonOneDim];
+
+ for (int i = firstNonOneDim - 1; i >= 0; --i) {
+ long sizeOuter = sizes[i];
+ long strideOuter = strides[i];
+
+ // Don't collapse this dimension if we want to exclude it from
+ // collapsing.
+ // Since this code is attempting to collapse a subsequent
+ // dimension (i) with the preceding dimension (i + 1), we can only
+ // perform collapsing if the preceding dimension can be collapsed
+ // (i.e., not excludeDim)
+ if ((excludeDim != i) && (excludeDim != i + 1)) {
+ // The next outermost dimension can be skipped if size 1
+ if (sizeOuter == 1) {
+ ++numCollapsed;
+ continue;
+ }
+
+ // If the next outermost dimension is contiguous with the
+ // previous non-collapsed one, collapse it
+ if (strideOuter == strideInner * sizeInner) {
+ ++numCollapsed;
+
+ // This is the run of collapsed dimensions' size
+ sizeInner = sizeInner * sizeOuter;
+ continue;
+ }
+ }
+
+ // Otherwise, this new outer dimension at `i` cannot be collapsed
+ // because it is excluded from collapsing, or it is not contiguous
+ // with the previous inner dimension.
+ sizeInner = sizeOuter;
+ strideInner = strideOuter;
+ }
+
+ // This will be our new size/stride and dimension.
+ IndexType newSizes[MAX_CUTORCH_DIMS];
+ IndexType newStrides[MAX_CUTORCH_DIMS];
+
+ assert(numCollapsed < dims);
+ int newDims = dims - numCollapsed;
+
+ // We return the index that the excluded dimension maps to after
+ // collapsing, so callers can keep referring to it.
+ int returnDim = -1;
+
+ // We perform a second pass through the dimensions to actually
+ // calculate the size of the collapsed dimensions.
+ int collapsedIndex = dims - numCollapsed - 1;
+ newSizes[collapsedIndex] = sizes[firstNonOneDim];
+ newStrides[collapsedIndex] = strides[firstNonOneDim];
+
+ if (firstNonOneDim == excludeDim) {
+ returnDim = collapsedIndex;
+ }
+
+ for (int i = firstNonOneDim - 1; i >= 0; --i) {
+ IndexType sizeOuter = sizes[i];
+ IndexType strideOuter = strides[i];
+
+ if ((excludeDim != i) && (excludeDim != i + 1)) {
+ if (sizeOuter == 1) {
+ // skip
+ continue;
+ }
+
+ if (strideOuter == newSizes[collapsedIndex] * newStrides[collapsedIndex]) {
+ // collapse
+ newSizes[collapsedIndex] *= sizeOuter;
+ continue;
+ }
+ }
+
+ // Otherwise, strides don't match, or dim `i` is excluded from
+ // collapsing.
+ --collapsedIndex;
+ assert(collapsedIndex >= 0);
+ assert(collapsedIndex < newDims);
+ newSizes[collapsedIndex] = sizeOuter;
+ newStrides[collapsedIndex] = strideOuter;
+
+ if (excludeDim == i) {
+ returnDim = collapsedIndex;
+ }
+ }
+
+ // We must have filled all the dimensions we're looking for
+ assert(collapsedIndex == 0);
+ assert((excludeDim == -1) || (returnDim != -1));
+
+ dims = newDims;
+
+ for (int i = 0; i < dims; ++i) {
+ sizes[i] = newSizes[i];
+ strides[i] = newStrides[i];
+ }
+
+ // After collapsing, the original `excludeDim` may have been
+ // renumbered to this new `returnDim`, since some dimensions could
+ // have been collapsed.
+ return returnDim;
+}
+
+// Translate a linear index for the apply to a T* offset;
+// specialized on `Dims` to reduce nvcc compilation time
+template <typename T, typename IndexType, int Dims>
+struct IndexToOffset {
+ static __host__ __device__ IndexType get(
+ IndexType linearId,
+ const TensorInfo<T, IndexType>& info) {
+ IndexType offset = 0;
+
+ // Use static dims
+ for (int i = Dims - 1; i >= 0; --i) {
+ IndexType curDimIndex = linearId % info.sizes[i];
+ IndexType curDimOffset = curDimIndex * info.strides[i];
+ offset += curDimOffset;
+
+ if (i > 0) {
+ linearId /= info.sizes[i];
+ }
+ }
+
+ return offset;
+ }
+};
+
+template <typename T, typename IndexType>
+struct IndexToOffset<T, IndexType, -2> {
+ static inline __host__ __device__ IndexType
+ get(IndexType linearId, const TensorInfo<T, IndexType>& info) {
+ return linearId;
+ }
+};
+
+template <typename T, typename IndexType>
+struct IndexToOffset<T, IndexType, -1> {
+ static inline __host__ __device__ IndexType get(
+ IndexType linearId,
+ const TensorInfo<T, IndexType>& info) {
+
+ IndexType offset = 0;
+
+ // Use dynamic dims
+ for (int i = info.dims - 1; i >= 0; --i) {
+ IndexType curDimIndex = linearId % info.sizes[i];
+ IndexType curDimOffset = curDimIndex * info.strides[i];
+ offset += curDimOffset;
+
+ linearId /= info.sizes[i];
+ }
+
+ return offset;
+ }
+};
+
+#endif // THC_TENSOR_INFO_INC
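
The collapseDims routine in the new header merges adjacent dimensions whenever the outer stride equals innerStride * innerSize, i.e. the two dimensions describe one unbroken run of memory, so the apply/reduce kernels see as few dimensions as possible. A standalone host sketch of just that core rule is below; it ignores size-1 dimensions and the excludeDim bookkeeping, and lists the collapsed dims innermost-first, unlike the real TensorInfo.

#include <cstdio>

// Simplified collapse: walk outward from the innermost dim and extend the
// current run while strides[i] == strides[inner] * sizes[inner].
static int collapseSimple(const long* sizes, const long* strides, int dims,
                          long* outSizes, long* outStrides) {
  int out = 0;
  outSizes[0] = sizes[dims - 1];
  outStrides[0] = strides[dims - 1];
  for (int i = dims - 2; i >= 0; --i) {
    if (strides[i] == outStrides[out] * outSizes[out]) {
      outSizes[out] *= sizes[i];           // extend the current run outward
    } else {
      ++out;                               // start a new collapsed dimension
      outSizes[out] = sizes[i];
      outStrides[out] = strides[i];
    }
  }
  return out + 1;                          // number of collapsed dimensions
}

int main() {
  long cSizes[3] = {2, 3, 4}, cStrides[3] = {12, 4, 1};  // contiguous 2x3x4
  long tSizes[3] = {2, 4, 3}, tStrides[3] = {12, 1, 4};  // last two dims swapped
  long s[3], st[3];
  printf("contiguous collapses to %d dim(s)\n",
         collapseSimple(cSizes, cStrides, 3, s, st));    // -> 1 (size 24)
  printf("transposed collapses to %d dim(s)\n",
         collapseSimple(tSizes, tStrides, 3, s, st));    // -> 3
  return 0;
}
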
diff --git a/lib/THC/THCTensorMasked.cu b/lib/THC/THCTensorMasked.cu
index ba69425..6ee5bad 100644
--- a/lib/THC/THCTensorMasked.cu
+++ b/lib/THC/THCTensorMasked.cu
@@ -33,7 +33,7 @@ void THCudaTensor_maskedFill(THCState* state,
THCudaTensor_nElement(state, mask),
2, "sizes do not match");
- if (!THCudaTensor_pointwiseApply2(state, tensor, mask, TensorMaskedFillOp(value))) {
+ if (!THC_pointwiseApply2(state, tensor, mask, TensorMaskedFillOp(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
@@ -106,7 +106,7 @@ void THCudaTensor_maskedCopy(THCState* state,
// update `tensor` where `mask` == 1 but pull from `src` at
// maskPrefixSum
- bool status = THCudaTensor_pointwiseApply3(
+ bool status = THC_pointwiseApply3(
state, tensor, contigMask, maskPrefixSum,
TensorMaskedCopyOp(THCudaTensor_data(state, contigSrc)));
@@ -169,7 +169,7 @@ void THCudaTensor_maskedSelect(THCState* state,
maskPrefixSumData);
// Then copy over the masked elements at their desired output index
- bool status = THCudaTensor_pointwiseApply3(
+ bool status = THC_pointwiseApply3(
state, contigMask, maskPrefixSum,
src, TensorMaskedSelectOp(THCudaTensor_data(state, tensor)));
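
The maskedSelect/maskedCopy changes above keep the same two-phase scheme: a prefix sum over the byte mask assigns every selected element a dense output slot, and a pointwise pass then scatters or gathers through that slot. A tiny host-side sketch of the idea follows; this hunk does not show whether the real code uses an inclusive or exclusive scan, so the exclusive variant here is an assumption.

#include <cstdio>

// Exclusive prefix sum over a 0/1 mask gives each selected element its
// compacted output position, so selection becomes one pointwise pass.
int main() {
  const int n = 8;
  float src[n]  = {10, 11, 12, 13, 14, 15, 16, 17};
  int   mask[n] = { 0,  1,  1,  0,  0,  1,  0,  1};

  int prefix[n];
  int running = 0;
  for (int i = 0; i < n; ++i) {   // exclusive scan of the mask
    prefix[i] = running;
    running += mask[i];
  }

  float out[n];                   // `running` elements are valid afterwards
  for (int i = 0; i < n; ++i) {
    if (mask[i]) {
      out[prefix[i]] = src[i];    // selected elements land contiguously
    }
  }

  for (int i = 0; i < running; ++i) {
    printf("%g ", out[i]);        // prints: 11 12 15 17
  }
  printf("\n");
  return 0;
}
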
diff --git a/lib/THC/THCTensorMath.cu b/lib/THC/THCTensorMath.cu
index 0690edf..5e140ef 100644
--- a/lib/THC/THCTensorMath.cu
+++ b/lib/THC/THCTensorMath.cu
@@ -9,66 +9,6 @@
#include <thrust/functional.h>
#include <cfloat>
-struct TensorFillOp {
- TensorFillOp(float v) : val(v) {}
- __device__ __forceinline__ void operator()(float* v) { *v = val; }
-
- const float val;
-};
-
-void THCudaTensor_fill(THCState* state, THCudaTensor *self_, float value)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, self_));
- if (!THCudaTensor_pointwiseApply1(state, self_, TensorFillOp(value))) {
- THArgCheck(false, 1, CUTORCH_DIM_WARNING);
- }
-
- THCudaCheck(cudaGetLastError());
-}
-
-void THCudaTensor_zero(THCState *state, THCudaTensor *self_)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, self_));
- if (THCudaTensor_isContiguous(state, self_)) {
- THCudaCheck(cudaMemsetAsync(THCudaTensor_data(state, self_),
- 0,
- sizeof(float) * THCudaTensor_nElement(state, self_),
- THCState_getCurrentStream(state)));
- } else {
- if (!THCudaTensor_pointwiseApply1(state, self_, TensorFillOp(0))) {
- THArgCheck(false, 1, CUTORCH_DIM_WARNING);
- }
- }
-
- THCudaCheck(cudaGetLastError());
-}
-
-void THCudaTensor_zeros(THCState *state, THCudaTensor *r_, THLongStorage *size)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, r_));
- THCudaTensor_resize(state, r_, size, NULL);
- THCudaTensor_zero(state, r_);
-}
-
-void THCudaTensor_ones(THCState *state, THCudaTensor *r_, THLongStorage *size)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, r_));
- THCudaTensor_resize(state, r_, size, NULL);
- THCudaTensor_fill(state, r_, 1);
-}
-
-void THCudaTensor_reshape(THCState *state, THCudaTensor *r_, THCudaTensor *t, THLongStorage *size)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, r_, t));
- THCudaTensor_resize(state, r_, size, NULL);
- THCudaTensor_copy(state, r_, t);
-}
-
-long THCudaTensor_numel(THCState *state, THCudaTensor *t)
-{
- return THCudaTensor_nElement(state, t);
-}
-
void THCudaTensor_cat(THCState *state, THCudaTensor *result, THCudaTensor *ta, THCudaTensor *tb, int dimension)
{
THCudaTensor* inputs[2];
@@ -138,74 +78,6 @@ void THCudaTensor_catArray(THCState *state, THCudaTensor *result, THCudaTensor *
}
}
-struct TensorCPowOp {
- __device__ __forceinline__ void operator()(float* out, float* in) {
- *out = powf(*out, *in);
- }
-
- __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) {
- *out = powf(*in1, *in2);
- }
-};
-
-void THCudaTensor_cpow(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2));
- THArgCheck(THCudaTensor_nElement(state, src1) ==
- THCudaTensor_nElement(state, src2), 3, "sizes do not match");
-
- if (self_ == src1) {
- // self = pow(self, src2)
- if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorCPowOp())) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- } else {
- THCudaTensor_resizeAs(state, self_, src1);
-
- // self = pow(src1, src2)
- if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorCPowOp())) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- }
-
- THCudaCheck(cudaGetLastError());
-}
-
-struct TensorDivOp {
- __device__ __forceinline__ void
- operator()(float* out, float* in) {
- *out /= *in;
- }
-
- __device__ __forceinline__ void
- operator()(float* out, float* in1, float* in2) {
- *out = *in1 / *in2;
- }
-};
-
-void THCudaTensor_cdiv(THCState* state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2));
- THArgCheck(THCudaTensor_nElement(state, src1) ==
- THCudaTensor_nElement(state, src2), 3, "sizes do not match");
-
- if (self_ == src1) {
- // self *= src2
- if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorDivOp())) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- } else {
- THCudaTensor_resizeAs(state, self_, src1);
-
- // self = src1 * src2
- if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorDivOp())) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- }
-
- THCudaCheck(cudaGetLastError());
-}
-
struct TensorAddCMulOp {
TensorAddCMulOp(float v) : val(v) {}
@@ -234,7 +106,7 @@ void THCudaTensor_addcmul(THCState *state, THCudaTensor *self_, THCudaTensor *t,
THArgCheck(THCudaTensor_nElement(state, src1) == THCudaTensor_nElement(state, src2),
3, "sizes do not match");
- if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorAddCMulOp(value))) {
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddCMulOp(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
@@ -268,7 +140,7 @@ void THCudaTensor_addcdiv(THCState *state, THCudaTensor *self_, THCudaTensor *t,
THArgCheck(THCudaTensor_nElement(state, src1) == THCudaTensor_nElement(state, src2),
3, "sizes do not match");
- if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorAddCDivOp(value))) {
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddCDivOp(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
@@ -400,3 +272,14 @@ int THCudaTensor_logicalany(THCState *state, THCudaTensor *self) {
return (int) result;
}
+
+template <typename T>
+struct TensorFillOp {
+ TensorFillOp(T v) : val(v) {}
+ __device__ __forceinline__ void operator()(T* v) { *v = val; }
+
+ const T val;
+};
+
+#include "generic/THCTensorMath.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h
index e31cb2a..6593da2 100644
--- a/lib/THC/THCTensorMath.h
+++ b/lib/THC/THCTensorMath.h
@@ -4,28 +4,19 @@
#include "THCTensor.h"
#include "THCGeneral.h"
-THC_API void THCudaTensor_fill(THCState *state, THCudaTensor *self, float value);
-THC_API void THCudaTensor_zero(THCState *state, THCudaTensor *self);
+#include "generic/THCTensorMath.h"
+#include "THCGenerateAllTypes.h"
-THC_API void THCudaTensor_zeros(THCState *state, THCudaTensor *r_, THLongStorage *size);
-THC_API void THCudaTensor_ones(THCState *state, THCudaTensor *r_, THLongStorage *size);
-THC_API void THCudaTensor_reshape(THCState *state, THCudaTensor *r_, THCudaTensor *t, THLongStorage *size);
-THC_API long THCudaTensor_numel(THCState *state, THCudaTensor *t);
+#include "generic/THCTensorMathPairwise.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorMathPointwise.h"
+#include "THCGenerateAllTypes.h"
-THC_API void THCudaTensor_add(THCState *state, THCudaTensor *self, THCudaTensor *src, float value);
-THC_API void THCudaTensor_sub(THCState *state, THCudaTensor *self, THCudaTensor *src, float value);
-THC_API void THCudaTensor_mul(THCState *state, THCudaTensor *self, THCudaTensor *src, float value);
-THC_API void THCudaTensor_div(THCState *state, THCudaTensor *self, THCudaTensor *src, float value);
THC_API void THCudaTensor_tril(THCState *state, THCudaTensor *self, THCudaTensor *src, long k);
THC_API void THCudaTensor_triu(THCState *state, THCudaTensor *self, THCudaTensor *src, long k);
-THC_API void THCudaTensor_cadd(THCState *state, THCudaTensor *self, THCudaTensor *src1, float value, THCudaTensor *src2);
-THC_API void THCudaTensor_csub(THCState *state, THCudaTensor *self, THCudaTensor *src1, float value, THCudaTensor *src2);
-THC_API void THCudaTensor_cmul(THCState *state, THCudaTensor *self, THCudaTensor *src1, THCudaTensor *src2);
-THC_API void THCudaTensor_cpow(THCState *state, THCudaTensor *self, THCudaTensor *src1, THCudaTensor *src2);
-THC_API void THCudaTensor_cdiv(THCState *state, THCudaTensor *self, THCudaTensor *src1, THCudaTensor *src2);
-
THC_API void THCudaTensor_addcmul(THCState *state, THCudaTensor *self, THCudaTensor* t, float value, THCudaTensor *src1, THCudaTensor *src2);
THC_API void THCudaTensor_addcdiv(THCState *state, THCudaTensor *self, THCudaTensor* t, float value, THCudaTensor *src1, THCudaTensor *src2);
diff --git a/lib/THC/THCTensorMath2.cu b/lib/THC/THCTensorMath2.cu
index 2cc2adb..0d6bb52 100644
--- a/lib/THC/THCTensorMath2.cu
+++ b/lib/THC/THCTensorMath2.cu
@@ -31,13 +31,13 @@ void THCudaTensor_pow(THCState *state, THCudaTensor *self_, THCudaTensor *src, f
{
THAssert(THCudaTensor_checkGPU(state, 2, self_, src));
if (self_ == src) {
- if (!THCudaTensor_pointwiseApply1(state, self_, TensorPowOp(value))) {
+ if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
} else {
THCudaTensor_resizeAs(state, self_, src);
- if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorPowOp(value))) {
+ if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
}
@@ -63,13 +63,13 @@ void THCudaTensor_tpow(THCState *state, THCudaTensor *self_, float value, THCuda
{
THAssert(THCudaTensor_checkGPU(state, 2, self_, src));
if (self_ == src) {
- if (!THCudaTensor_pointwiseApply1(state, self_, TensorTPowOp(value))) {
+ if (!THC_pointwiseApply1(state, self_, TensorTPowOp(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
} else {
THCudaTensor_resizeAs(state, self_, src);
- if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorTPowOp(value))) {
+ if (!THC_pointwiseApply2(state, self_, src, TensorTPowOp(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
}
@@ -90,7 +90,7 @@ void THCudaTensor_atan2(THCState *state, THCudaTensor *self_, THCudaTensor *tx,
THCudaTensor_nElement(state, ty), 3, "sizes do not match");
THCudaTensor_resizeAs(state, self_, tx);
- if (!THCudaTensor_pointwiseApply3(state, self_, tx, ty, TensorATan2Op())) {
+ if (!THC_pointwiseApply3(state, self_, tx, ty, TensorATan2Op())) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
@@ -116,13 +116,13 @@ void THCudaTensor_clamp(THCState *state, THCudaTensor *self_, THCudaTensor *src,
{
THAssert(THCudaTensor_checkGPU(state, 2, self_, src));
if (self_ == src) {
- if (!THCudaTensor_pointwiseApply1(state, self_, TensorClampOp(min_value, max_value))) {
+ if (!THC_pointwiseApply1(state, self_, TensorClampOp(min_value, max_value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
} else {
THCudaTensor_resizeAs(state, self_, src);
- if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorClampOp(min_value, max_value))) {
+ if (!THC_pointwiseApply2(state, self_, src, TensorClampOp(min_value, max_value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
}
@@ -146,13 +146,13 @@ void THCudaTensor_sign(THCState *state, THCudaTensor *self_, THCudaTensor *src)
{
THAssert(THCudaTensor_checkGPU(state, 2, self_, src));
if (self_ == src) {
- if (!THCudaTensor_pointwiseApply1(state, self_, TensorSignOp())) {
+ if (!THC_pointwiseApply1(state, self_, TensorSignOp())) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
} else {
THCudaTensor_resizeAs(state, self_, src);
- if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorSignOp())) {
+ if (!THC_pointwiseApply2(state, self_, src, TensorSignOp())) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
}
@@ -192,7 +192,7 @@ void THCudaTensor_lerp(THCState *state, THCudaTensor *result, THCudaTensor *a, T
THCudaTensor_nElement(state, b), 3, "sizes do not match");
THCudaTensor_resizeAs(state, result, a);
- if (!THCudaTensor_pointwiseApply3(state, result, a, b, TensorLerpOp(w))) {
+ if (!THC_pointwiseApply3(state, result, a, b, TensorLerpOp(w))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
@@ -699,7 +699,7 @@ THC_API void THCudaTensor_cross(THCState *state, THCudaTensor *self, THCudaTenso
THCudaTensor *nx = THCudaTensor_newNarrow(state, x, dimension, 0, 1);
THCudaTensor *ny = THCudaTensor_newNarrow(state, y, dimension, 0, 1);
THCudaTensor *nself = THCudaTensor_newNarrow(state, self, dimension, 0, 1);
- if (!THCudaTensor_pointwiseApply3(state, nself, nx, ny, TensorCrossOp(sx, sy, so))) {
+ if (!THC_pointwiseApply3(state, nself, nx, ny, TensorCrossOp(sx, sy, so))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
THCudaTensor_free(state, nx);
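Every call site in this file only changes its spelling: the functor-driven THC_pointwiseApply1/2/3 entry points still take (state, tensor(s), op) and return false when the tensors cannot be handled, which the call sites turn into CUTORCH_DIM_WARNING. The sketch below shows the shape of that contract with a hand-rolled kernel and functor; the kernel, launcher and AddConstant names are assumptions, only the operator()(T*) call shape mirrors the ops in this diff.

    // sketch_pointwise_apply.cu -- a stand-in for the apply machinery, not THC's API.
    #include <cstdio>
    #include <cuda_runtime.h>

    template <typename T, typename Op>
    __global__ void apply1Kernel(T* data, long n, Op op) {
      long i = blockIdx.x * (long)blockDim.x + threadIdx.x;
      if (i < n) op(&data[i]);                    // same call shape as TensorPowOp
    }

    template <typename T>
    struct AddConstant {                          // analogous to TensorAddConstantOp<T>
      explicit AddConstant(T v) : val(v) {}
      __device__ __forceinline__ void operator()(T* x) const { *x += val; }
      const T val;
    };

    int main() {
      const long n = 1 << 20;
      float* d = NULL;
      cudaMalloc(&d, n * sizeof(float));
      cudaMemset(d, 0, n * sizeof(float));
      apply1Kernel<<<(int)((n + 255) / 256), 256>>>(d, n, AddConstant<float>(1.0f));
      cudaDeviceSynchronize();
      float h = 0.f;
      cudaMemcpy(&h, d, sizeof(float), cudaMemcpyDeviceToHost);
      std::printf("first element: %f\n", h);      // expected 1.0
      cudaFree(d);
      return 0;
    }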
diff --git a/lib/THC/THCTensorMathCompare.cu b/lib/THC/THCTensorMathCompare.cu
index 0f7d979..71cfdd5 100644
--- a/lib/THC/THCTensorMathCompare.cu
+++ b/lib/THC/THCTensorMathCompare.cu
@@ -11,7 +11,7 @@ void THCudaTensor_logicalValue(THCState *state, THCudaTensor *self_, THCudaTenso
{
THCudaTensor_resizeAs(state, self_, src);
- if (!THCudaTensor_pointwiseApply2(state, self_, src, op)) {
+ if (!THC_pointwiseApply2(state, self_, src, op)) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
diff --git a/lib/THC/THCTensorMathCompareT.cu b/lib/THC/THCTensorMathCompareT.cu
index b88f153..0567a89 100644
--- a/lib/THC/THCTensorMathCompareT.cu
+++ b/lib/THC/THCTensorMathCompareT.cu
@@ -11,7 +11,7 @@ void THCudaTensor_logicalTensor(THCState *state, THCudaTensor *self_, THCudaTens
THCudaTensor_resizeAs(state, self_, src1);
THArgCheck(THCudaTensor_nElement(state, src1) == THCudaTensor_nElement(state, src2), 3, "sizes do not match");
- if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, op)) {
+ if (!THC_pointwiseApply3(state, self_, src1, src2, op)) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu
index 383344b..2c081d1 100644
--- a/lib/THC/THCTensorMathPairwise.cu
+++ b/lib/THC/THCTensorMathPairwise.cu
@@ -1,96 +1,98 @@
#include "THCTensorMath.h"
#include "THCGeneral.h"
#include "THCBlas.h"
+#include "THCHalf.h"
#include "THCTensorCopy.h"
#include "THCApply.cuh"
#include "THCReduce.cuh"
+template <typename T>
struct TensorAddConstantOp {
- TensorAddConstantOp(float v) : val(v) {}
- __device__ __forceinline__ void operator()(float* out, float* in) {
+ TensorAddConstantOp(T v) : val(v) {}
+ __device__ __forceinline__ void operator()(T* out, T* in) {
*out = *in + val;
}
- __device__ __forceinline__ void operator()(float* v) {
+ __device__ __forceinline__ void operator()(T* v) {
*v += val;
}
- const float val;
+ const T val;
};
-void THCudaTensor_add(THCState *state, THCudaTensor *self_, THCudaTensor *src_, float value)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, self_, src_));
- if (self_ == src_) {
- if (!THCudaTensor_pointwiseApply1(state, self_, TensorAddConstantOp(value))) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- } else {
- THCudaTensor_resizeAs(state, self_, src_);
-
- if (!THCudaTensor_pointwiseApply2(state, self_, src_, TensorAddConstantOp(value))) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorAddConstantOp<half> {
+ TensorAddConstantOp(half v) : val(v) {}
+ __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+ *out = __hadd(*in, val);
+#else
+ float fin = __half2float(*in);
+ float fval = __half2float(val);
+ float fout = fin + fval;
+ *out = __float2half(fout);
+#endif
}
- THCudaCheck(cudaGetLastError());
-}
+ __device__ __forceinline__ void operator()(half* v) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+ *v = __hadd(*v, val);
+#else
+ float fv = __half2float(*v);
+ float fval = __half2float(val);
+ fv += fval;
+ *v = __float2half(fv);
+#endif
+ }
-void THCudaTensor_sub(THCState *state, THCudaTensor *self_, THCudaTensor *src_, float value)
-{
- THCudaTensor_add(state, self_, src_, -value);
-}
+ const half val;
+};
+#endif // CUDA_HALF_TENSOR
+template <typename T>
struct TensorMulConstantOp {
- TensorMulConstantOp(float v) : val(v) {}
- __device__ __forceinline__ void operator()(float* out, float* in) {
+ TensorMulConstantOp(T v) : val(v) {}
+ __device__ __forceinline__ void operator()(T* out, T* in) {
*out = *in * val;
}
- __device__ __forceinline__ void operator()(float* v) {
+ __device__ __forceinline__ void operator()(T* v) {
*v *= val;
}
- const float val;
+ const T val;
};
-void THCudaTensor_mul(THCState *state, THCudaTensor *self_, THCudaTensor *src_, float value)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, self_, src_));
- if (self_ == src_) {
- if (!THCudaTensor_pointwiseApply1(state, self_, TensorMulConstantOp(value))) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- } else {
- THCudaTensor_resizeAs(state, self_, src_);
-
- if (!THCudaTensor_pointwiseApply2(state, self_, src_, TensorMulConstantOp(value))) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorMulConstantOp<half> {
+ TensorMulConstantOp(half v) : val(v) {}
+ __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+ *out = __hmul(*in, val);
+#else
+ float fin = __half2float(*in);
+ float fval = __half2float(val);
+ float fout = fin * fval;
+ *out = __float2half(fout);
+#endif
}
- THCudaCheck(cudaGetLastError());
-}
-
-void THCudaTensor_div(THCState* state, THCudaTensor *self_, THCudaTensor *src_, float value)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, self_, src_));
- THArgCheck(value != 0.0f, 3, "divide by zero");
-
- if (self_ == src_) {
- if (!THCudaTensor_pointwiseApply1(state, self_, TensorMulConstantOp(1.0f / value))) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- } else {
- THCudaTensor_resizeAs(state, self_, src_);
-
- if (!THCudaTensor_pointwiseApply2(state, self_, src_, TensorMulConstantOp(1.0f / value))) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
+ __device__ __forceinline__ void operator()(half* v) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+ *v = __hmul(*v, val);
+#else
+ float fv = __half2float(*v);
+ float fval = __half2float(val);
+ fv *= fval;
+ *v = __float2half(fv);
+#endif
}
- THCudaCheck(cudaGetLastError());
-}
+ const half val;
+};
+#endif // CUDA_HALF_TENSOR
template <int Upper>
struct TensorTriOp {
@@ -143,13 +145,13 @@ void THCudaTensor_tril(THCState *state, THCudaTensor *self_, THCudaTensor *src_,
TensorTriOp<0> op(start, stride0, stride1, k);
if (self_ == src_) {
- if (!THCudaTensor_pointwiseApply1(state, src, op)) {
+ if (!THC_pointwiseApply1(state, src, op)) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
} else {
THCudaTensor_resizeAs(state, self_, src);
- if (!THCudaTensor_pointwiseApply2(state, self_, src, op)) {
+ if (!THC_pointwiseApply2(state, self_, src, op)) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
}
@@ -176,13 +178,13 @@ void THCudaTensor_triu(THCState *state, THCudaTensor *self_, THCudaTensor *src_,
TensorTriOp<1> op(start, stride0, stride1, k);
if (self_ == src_) {
- if (!THCudaTensor_pointwiseApply1(state, src, op)) {
+ if (!THC_pointwiseApply1(state, src, op)) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
} else {
THCudaTensor_resizeAs(state, self_, src);
- if (!THCudaTensor_pointwiseApply2(state, self_, src, op)) {
+ if (!THC_pointwiseApply2(state, self_, src, op)) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
}
@@ -192,3 +194,6 @@ void THCudaTensor_triu(THCState *state, THCudaTensor *self_, THCudaTensor *src_,
THCudaCheck(cudaGetLastError());
}
+
+#include "generic/THCTensorMathPairwise.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMathPointwise.cu b/lib/THC/THCTensorMathPointwise.cu
index 80b1ead..72b16e8 100644
--- a/lib/THC/THCTensorMathPointwise.cu
+++ b/lib/THC/THCTensorMathPointwise.cu
@@ -1,6 +1,7 @@
#include "THCTensorMath.h"
#include "THCGeneral.h"
#include "THCBlas.h"
+#include "THCHalf.h"
#include "THCTensorCopy.h"
#include "THCApply.cuh"
#include "THCReduce.cuh"
@@ -19,13 +20,13 @@
void THCudaTensor_##NAME(THCState* state, THCudaTensor* self_, THCudaTensor* src) { \
THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); \
if (self_ == src) { \
- if (!THCudaTensor_pointwiseApply1(state, self_, Tensor##NAME##Op())) { \
+ if (!THC_pointwiseApply1(state, self_, Tensor##NAME##Op())) { \
THArgCheck(false, 2, CUTORCH_DIM_WARNING); \
} \
} else { \
THCudaTensor_resizeAs(state, self_, src); \
\
- if (!THCudaTensor_pointwiseApply2(state, self_, src, Tensor##NAME##Op())) { \
+ if (!THC_pointwiseApply2(state, self_, src, Tensor##NAME##Op())) { \
THArgCheck(false, 2, CUTORCH_DIM_WARNING); \
} \
} \
@@ -76,171 +77,13 @@ struct TensorSigmoidOp {
void THCudaTensor_sigmoid(THCState* state, THCudaTensor* self_, THCudaTensor* src) {
THAssert(THCudaTensor_checkGPU(state, 2, self_, src));
if (self_ == src) {
- if (!THCudaTensor_pointwiseApply1(state, self_, TensorSigmoidOp())) {
+ if (!THC_pointwiseApply1(state, self_, TensorSigmoidOp())) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
} else {
THCudaTensor_resizeAs(state, self_, src);
- if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorSigmoidOp())) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- }
-
- THCudaCheck(cudaGetLastError());
-}
-
-struct TensorAddOp {
- __device__ __forceinline__ void operator()(float* out, float* in) {
- *out += *in;
- }
-
- __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) {
- *out = *in1 + *in2;
- }
-};
-
-struct TensorCAddOp {
- TensorCAddOp(float v) : val(v) {}
-
- __device__ __forceinline__ void operator()(float* out, float* in) {
- *out += val * *in;
- }
-
- __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) {
- *out = *in1 + val * *in2;
- }
-
- float val;
-};
-
-void THCudaTensor_cadd(THCState *state, THCudaTensor *self_, THCudaTensor* src1, float value, THCudaTensor *src2)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2));
- THArgCheck(THCudaTensor_nElement(state, src1) ==
- THCudaTensor_nElement(state, src2), 3, "sizes do not match");
-
- if (self_ == src1) {
- if (value == 1.0f) {
- // self += src2
- if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorAddOp())) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- } else {
- // self += value * src2
- if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorCAddOp(value))) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- }
- } else {
- THCudaTensor_resizeAs(state, self_, src1);
-
- if (value == 1.0f) {
- // self = src1 + src2
- if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorAddOp())) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- } else {
- // self = src1 + value * src2
- if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorCAddOp(value))) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- }
- }
-
- THCudaCheck(cudaGetLastError());
-}
-
-struct TensorSubOp {
- __device__ __forceinline__ void operator()(float* out, float* in) {
- *out -= *in;
- }
-
- __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) {
- *out = *in1 - *in2;
- }
-};
-
-
-struct TensorCSubOp {
- TensorCSubOp(float v) : val(v) {}
-
- __device__ __forceinline__ void operator()(float* out, float* in) {
- *out -= val * *in;
- }
-
- __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) {
- *out = *in1 - val * *in2;
- }
-
- float val;
-};
-
-
-void THCudaTensor_csub(THCState *state, THCudaTensor *self_, THCudaTensor* src1, float value, THCudaTensor *src2)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2));
- THArgCheck(THCudaTensor_nElement(state, src1) ==
- THCudaTensor_nElement(state, src2), 3, "sizes do not match");
-
- if (self_ == src1) {
- if (value == 1.0f) {
- // self -= src2
- if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorSubOp())) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- } else {
- // self += -value * src2
- if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorCAddOp(-value))) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- }
- } else {
- THCudaTensor_resizeAs(state, self_, src1);
-
- if (value == 1.0f) {
- // self = src1 - src2
- if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorSubOp())) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- } else {
- // self = src1 - value * src2
- if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorCAddOp(-value))) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- }
- }
-
- THCudaCheck(cudaGetLastError());
-}
-
-
-struct TensorMulOp {
- __device__ __forceinline__ void operator()(float* out, float* in) {
- *out *= *in;
- }
-
- __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) {
- *out = *in1 * *in2;
- }
-};
-
-void THCudaTensor_cmul(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2));
- THArgCheck(THCudaTensor_nElement(state, src1) ==
- THCudaTensor_nElement(state, src2), 3, "sizes do not match");
-
- if (self_ == src1) {
- // self *= src2
- if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorMulOp())) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
- } else {
- THCudaTensor_resizeAs(state, self_, src1);
-
- // self = src1 * src2
- if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorMulOp())) {
+ if (!THC_pointwiseApply2(state, self_, src, TensorSigmoidOp())) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
}
@@ -265,12 +108,12 @@ void THCudaTensor_cmax(THCState *state, THCudaTensor *self, THCudaTensor *src1,
THCudaTensor_nElement(state, src2), 2, "sizes do not match");
if (self == src1) {
- if (!THCudaTensor_pointwiseApply2(state, self, src2, TensorMaxOp())) {
+ if (!THC_pointwiseApply2(state, self, src2, TensorMaxOp())) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
} else {
THCudaTensor_resizeAs(state, self, src1);
- if (!THCudaTensor_pointwiseApply3(state, self, src1, src2, TensorMaxOp())) {
+ if (!THC_pointwiseApply3(state, self, src1, src2, TensorMaxOp())) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
}
@@ -293,12 +136,12 @@ void THCudaTensor_cmin(THCState *state, THCudaTensor *self, THCudaTensor *src1,
THCudaTensor_nElement(state, src2), 2, "sizes do not match");
if (self == src1) {
- if (!THCudaTensor_pointwiseApply2(state, self, src2, TensorMinOp())) {
+ if (!THC_pointwiseApply2(state, self, src2, TensorMinOp())) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
} else {
THCudaTensor_resizeAs(state, self, src1);
- if (!THCudaTensor_pointwiseApply3(state, self, src1, src2, TensorMinOp())) {
+ if (!THC_pointwiseApply3(state, self, src1, src2, TensorMinOp())) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
}
@@ -323,12 +166,12 @@ void THCudaTensor_cmaxValue(THCState *state, THCudaTensor *self, THCudaTensor *s
THAssert(THCudaTensor_checkGPU(state, 2, self, src));
if (self == src) {
- if (!THCudaTensor_pointwiseApply1(state, self, TensorMaxValueOp(value))) {
+ if (!THC_pointwiseApply1(state, self, TensorMaxValueOp(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
} else {
THCudaTensor_resizeAs(state, self, src);
- if (!THCudaTensor_pointwiseApply2(state, self, src, TensorMaxValueOp(value))) {
+ if (!THC_pointwiseApply2(state, self, src, TensorMaxValueOp(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
}
@@ -353,13 +196,259 @@ void THCudaTensor_cminValue(THCState *state, THCudaTensor *self, THCudaTensor *s
THAssert(THCudaTensor_checkGPU(state, 2, self, src));
if (self == src) {
- if (!THCudaTensor_pointwiseApply1(state, self, TensorMinValueOp(value))) {
+ if (!THC_pointwiseApply1(state, self, TensorMinValueOp(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
} else {
THCudaTensor_resizeAs(state, self, src);
- if (!THCudaTensor_pointwiseApply2(state, self, src, TensorMinValueOp(value))) {
+ if (!THC_pointwiseApply2(state, self, src, TensorMinValueOp(value))) {
THArgCheck(false, 2, CUTORCH_DIM_WARNING);
}
}
}
+
+template <typename T>
+struct TensorAddOp {
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out += *in;
+ }
+
+ __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+ *out = *in1 + *in2;
+ }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorAddOp<half> {
+ __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+ *out = __hadd(*out, *in);
+#else
+ float fout = __half2float(*out);
+ float fin = __half2float(*in);
+ fout += fin;
+ *out = __float2half(fout);
+#endif
+ }
+
+ __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+ *out = __hadd(*in1, *in2);
+#else
+ float fin1 = __half2float(*in1);
+ float fin2 = __half2float(*in2);
+ float fout = fin1 + fin2;
+ *out = __float2half(fout);
+#endif
+ }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorCAddOp {
+ TensorCAddOp(T v) : val(v) {}
+
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out += val * *in;
+ }
+
+ __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+ *out = *in1 + val * *in2;
+ }
+
+ T val;
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorCAddOp<half> {
+ TensorCAddOp(half v) : val(v) {}
+
+ __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+ *out = __hadd(*out, __hmul(val, *in));
+#else
+ float fout = __half2float(*out);
+ float fval = __half2float(val);
+ float fin = __half2float(*in);
+
+ fout += fval * fin;
+ *out = __float2half(fout);
+#endif
+ }
+
+ __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+ *out = __hadd(*in1, __hmul(val, *in2));
+#else
+ float fin1 = __half2float(*in1);
+ float fin2 = __half2float(*in2);
+ float fval = __half2float(val);
+
+ float fout = fin1 + fval * fin2;
+ *out = __float2half(fout);
+#endif
+ }
+
+ half val;
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorSubOp {
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out -= *in;
+ }
+
+ __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+ *out = *in1 - *in2;
+ }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorSubOp<half> {
+ __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+ *out = __hsub(*out, *in);
+#else
+ float fout = __half2float(*out);
+ float fin = __half2float(*in);
+ fout -= fin;
+ *out = __float2half(fout);
+#endif
+ }
+
+ __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+ *out = __hsub(*in1, *in2);
+#else
+ float fin1 = __half2float(*in1);
+ float fin2 = __half2float(*in2);
+ float fout = fin1 - fin2;
+ *out = __float2half(fout);
+#endif
+ }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorMulOp {
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out *= *in;
+ }
+
+ __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+ *out = *in1 * *in2;
+ }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorMulOp<half> {
+ __device__ __forceinline__ void operator()(half* out, half* in) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+ *out = __hmul(*out, *in);
+#else
+ float fout = __half2float(*out);
+ float fin = __half2float(*in);
+ fout *= fin;
+ *out = __float2half(fout);
+#endif
+ }
+
+ __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
+#ifdef CUDA_HALF_INSTRUCTIONS
+ *out = __hmul(*in1, *in2);
+#else
+ float fin1 = __half2float(*in1);
+ float fin2 = __half2float(*in2);
+ float fout = fin1 * fin2;
+ *out = __float2half(fout);
+#endif
+ }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorCPowOp {
+ __device__ __forceinline__ void operator()(T* out, T* in) {
+ *out = powf((float) *out, (float) *in);
+ }
+
+ __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) {
+ *out = powf((float) *in1, (float) *in2);
+ }
+};
+
+template <>
+struct TensorCPowOp<double> {
+ __device__ __forceinline__ void operator()(double* out, double* in) {
+ *out = pow(*out, *in);
+ }
+
+ __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) {
+ *out = pow(*in1, *in2);
+ }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorCPowOp<half> {
+ __device__ __forceinline__ void operator()(half* out, half* in) {
+ // No fp16 pow function yet
+ float fout = __half2float(*out);
+ float fin = __half2float(*in);
+ fout = powf(fout, fin);
+ *out = __float2half(fout);
+ }
+
+ __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) {
+ // No fp16 pow function yet
+ float fin1 = __half2float(*in1);
+ float fin2 = __half2float(*in2);
+ float fout = powf(fin1, fin2);
+ *out = __float2half(fout);
+ }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct TensorDivOp {
+ __device__ __forceinline__ void
+ operator()(T* out, T* in) {
+ *out /= *in;
+ }
+
+ __device__ __forceinline__ void
+ operator()(T* out, T* in1, T* in2) {
+ *out = *in1 / *in2;
+ }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct TensorDivOp<half> {
+ __device__ __forceinline__ void
+ operator()(half* out, half* in) {
+ // No fp16 div instruction yet
+ float fout = __half2float(*out);
+ float fin = __half2float(*in);
+ fout /= fin;
+ *out = __float2half(fout);
+ }
+
+ __device__ __forceinline__ void
+ operator()(half* out, half* in1, half* in2) {
+ // No fp16 div instruction yet
+ float fin1 = __half2float(*in1);
+ float fin2 = __half2float(*in2);
+ float fout = fin1 / fin2;
+ *out = __float2half(fout);
+ }
+};
+#endif // CUDA_HALF_TENSOR
+
+#include "generic/THCTensorMathPointwise.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorScatterGather.cu b/lib/THC/THCTensorScatterGather.cu
index 876f32a..1059b03 100644
--- a/lib/THC/THCTensorScatterGather.cu
+++ b/lib/THC/THCTensorScatterGather.cu
@@ -2,7 +2,6 @@
#include "THCGeneral.h"
#include "THCApply.cuh"
-
// Compute the offsets into the given tensors for a linear index. For the 't2'
// tensor, dimension 'dim' is skipped. The tensors are assumed to have the same
// size (with the exception of 't2' in dimension 'dim').
@@ -11,9 +10,9 @@ template <typename IndexType, int Dims>
struct IndexToScatterGatherOffsets {
static __device__ void compute(
IndexType linearId, const int dim,
- const TensorInfo<IndexType>& index, IndexType* indexOffset,
- const TensorInfo<IndexType>& t1, IndexType* t1Offset,
- const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
+ const TensorInfo<float, IndexType>& index, IndexType* indexOffset,
+ const TensorInfo<float, IndexType>& t1, IndexType* t1Offset,
+ const TensorInfo<float, IndexType>& t2, IndexType* t2Offset) {
for (int d = Dims - 1; d >= 0; d--) {
IndexType curDimIndex = linearId % index.sizes[d];
*indexOffset += curDimIndex * index.strides[d];
@@ -27,8 +26,8 @@ struct IndexToScatterGatherOffsets {
static __device__ void compute(
IndexType linearId, const int dim,
- const TensorInfo<IndexType>& index, IndexType* indexOffset,
- const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
+ const TensorInfo<float, IndexType>& index, IndexType* indexOffset,
+ const TensorInfo<float, IndexType>& t2, IndexType* t2Offset) {
for (int d = Dims - 1; d >= 0; d--) {
IndexType curDimIndex = linearId % index.sizes[d];
*indexOffset += curDimIndex * index.strides[d];
@@ -45,9 +44,9 @@ template <typename IndexType>
struct IndexToScatterGatherOffsets<IndexType, -1> {
static __device__ void compute(
IndexType linearId, const int dim,
- const TensorInfo<IndexType>& index, IndexType* indexOffset,
- const TensorInfo<IndexType>& t1, IndexType* t1Offset,
- const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
+ const TensorInfo<float, IndexType>& index, IndexType* indexOffset,
+ const TensorInfo<float, IndexType>& t1, IndexType* t1Offset,
+ const TensorInfo<float, IndexType>& t2, IndexType* t2Offset) {
for (int d = index.dims - 1; d >= 0; d--) {
IndexType curDimIndex = linearId % index.sizes[d];
*indexOffset += curDimIndex * index.strides[d];
@@ -61,8 +60,8 @@ struct IndexToScatterGatherOffsets<IndexType, -1> {
static __device__ void compute(
IndexType linearId, const int dim,
- const TensorInfo<IndexType>& index, IndexType* indexOffset,
- const TensorInfo<IndexType>& t2, IndexType* t2Offset) {
+ const TensorInfo<float, IndexType>& index, IndexType* indexOffset,
+ const TensorInfo<float, IndexType>& t2, IndexType* t2Offset) {
for (int d = index.dims - 1; d >= 0; d--) {
IndexType curDimIndex = linearId % index.sizes[d];
*indexOffset += curDimIndex * index.strides[d];
@@ -77,9 +76,9 @@ struct IndexToScatterGatherOffsets<IndexType, -1> {
template <typename IndexType, int Dims>
__global__ void THCudaTensor_gatherKernel(
- TensorInfo<IndexType> tensor,
- TensorInfo<IndexType> src,
- TensorInfo<IndexType> index,
+ TensorInfo<float, IndexType> tensor,
+ TensorInfo<float, IndexType> src,
+ TensorInfo<float, IndexType> index,
const int dim,
const IndexType totalElements) {
for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
@@ -135,17 +134,20 @@ void THCudaTensor_gather(THCState* state, THCudaTensor *tensor, THCudaTensor *sr
THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING);
THCudaTensor* oldTensor = NULL;
- if (THC_overlappingIndices(state, tensor)) {
+ if (TensorUtils<THCudaTensor>::overlappingIndices(state, tensor)) {
oldTensor = tensor;
tensor = THCudaTensor_newContiguous(state, tensor);
}
- if (THC_canUse32BitIndexMath(state, tensor) &&
- THC_canUse32BitIndexMath(state, src) &&
- THC_canUse32BitIndexMath(state, index)) {
- TensorInfo<unsigned int> tensorInfo(state, tensor);
- TensorInfo<unsigned int> srcInfo(state, src);
- TensorInfo<unsigned int> indexInfo(state, index);
+ if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, tensor) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, index)) {
+ TensorInfo<float, unsigned int> tensorInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, tensor);
+ TensorInfo<float, unsigned int> srcInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, src);
+ TensorInfo<float, unsigned int> indexInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, index);
// Specialize for a small number of dimensions.
switch (indexInfo.dims) {
@@ -163,15 +165,18 @@ void THCudaTensor_gather(THCState* state, THCudaTensor *tensor, THCudaTensor *sr
break;
}
} else {
- TensorInfo<unsigned long> tensorInfo(state, tensor);
- TensorInfo<unsigned long> srcInfo(state, src);
- TensorInfo<unsigned long> indexInfo(state, index);
+ TensorInfo<float, unsigned long> tensorInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, tensor);
+ TensorInfo<float, unsigned long> srcInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, src);
+ TensorInfo<float, unsigned long> indexInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, index);
RUN(unsigned long, -1)
}
if (oldTensor) {
- THCudaTensor_copyIgnoringOverlaps(state, oldTensor, tensor);
+ TensorUtils<THCudaTensor>::copyIgnoringOverlaps(state, oldTensor, tensor);
THCudaTensor_free(state, tensor);
tensor = oldTensor;
}
@@ -182,9 +187,9 @@ void THCudaTensor_gather(THCState* state, THCudaTensor *tensor, THCudaTensor *sr
template <typename IndexType, int Dims>
__global__ void THCudaTensor_scatterKernel(
- TensorInfo<IndexType> tensor,
- TensorInfo<IndexType> src,
- TensorInfo<IndexType> index,
+ TensorInfo<float, IndexType> tensor,
+ TensorInfo<float, IndexType> src,
+ TensorInfo<float, IndexType> index,
const int dim,
const IndexType totalElements) {
for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x;
@@ -230,7 +235,7 @@ void THCudaTensor_scatter(THCState* state, THCudaTensor *tensor, int dim, THCuda
}
}
- THArgCheck(THCudaTensor_nDimension(state, tensor) <= MAX_CUTORCH_DIMS,
+ THArgCheck(THCudaTensor_nDimension(state, tensor) <= MAX_CUTORCH_DIMS,
1, CUTORCH_DIM_WARNING);
const long totalElements = THCudaTensor_nElement(state, index);
@@ -239,17 +244,20 @@ void THCudaTensor_scatter(THCState* state, THCudaTensor *tensor, int dim, THCuda
THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING);
THCudaTensor* oldTensor = NULL;
- if (THC_overlappingIndices(state, tensor)) {
+ if (TensorUtils<THCudaTensor>::overlappingIndices(state, tensor)) {
oldTensor = tensor;
tensor = THCudaTensor_newContiguous(state, tensor);
}
- if (THC_canUse32BitIndexMath(state, tensor) &&
- THC_canUse32BitIndexMath(state, src) &&
- THC_canUse32BitIndexMath(state, index)) {
- TensorInfo<unsigned int> tensorInfo(state, tensor);
- TensorInfo<unsigned int> srcInfo(state, src);
- TensorInfo<unsigned int> indexInfo(state, index);
+ if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, tensor) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, index)) {
+ TensorInfo<float, unsigned int> tensorInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, tensor);
+ TensorInfo<float, unsigned int> srcInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, src);
+ TensorInfo<float, unsigned int> indexInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, index);
// Specialize for a small number of dimensions.
switch (indexInfo.dims) {
@@ -267,15 +275,18 @@ void THCudaTensor_scatter(THCState* state, THCudaTensor *tensor, int dim, THCuda
break;
}
} else {
- TensorInfo<unsigned long> tensorInfo(state, tensor);
- TensorInfo<unsigned long> srcInfo(state, src);
- TensorInfo<unsigned long> indexInfo(state, index);
+ TensorInfo<float, unsigned long> tensorInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, tensor);
+ TensorInfo<float, unsigned long> srcInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, src);
+ TensorInfo<float, unsigned long> indexInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, index);
RUN(unsigned long, -1)
}
if (oldTensor) {
- THCudaTensor_copyIgnoringOverlaps(state, oldTensor, tensor);
+ TensorUtils<THCudaTensor>::copyIgnoringOverlaps(state, oldTensor, tensor);
THCudaTensor_free(state, tensor);
tensor = oldTensor;
}
@@ -286,8 +297,8 @@ void THCudaTensor_scatter(THCState* state, THCudaTensor *tensor, int dim, THCuda
template <typename IndexType, int Dims>
__global__ void THCudaTensor_scatterFillKernel(
- TensorInfo<IndexType> tensor,
- TensorInfo<IndexType> index,
+ TensorInfo<float, IndexType> tensor,
+ TensorInfo<float, IndexType> index,
float value,
const int dim,
const IndexType totalElements) {
@@ -313,17 +324,21 @@ __global__ void THCudaTensor_scatterFillKernel(
<<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
tensorInfo, indexInfo, value, dim, (TYPE)totalElements);
-void THCudaTensor_scatterFill(THCState* state, THCudaTensor *tensor, int dim, THCudaTensor *index, float value) {
+void
+THCudaTensor_scatterFill(THCState* state, THCudaTensor *tensor,
+ int dim, THCudaTensor *index, float value) {
THAssert(THCudaTensor_checkGPU(state, 2, tensor, index));
THArgCheck(dim >= 0 && dim < THCudaTensor_nDimension(state, tensor), 2,
"Index dimension is out of bounds");
- THArgCheck(THCudaTensor_nDimension(state, index) == THCudaTensor_nDimension(state, tensor), 3,
+ THArgCheck(THCudaTensor_nDimension(state, index) ==
+ THCudaTensor_nDimension(state, tensor), 3,
"Index tensor must have same dimensions as output tensor");
for (int d = 0; d < THCudaTensor_nDimension(state, tensor); d++) {
if (d != dim) {
- THArgCheck(THCudaTensor_size(state, tensor, d) == THCudaTensor_size(state, index, d), 4,
+ THArgCheck(THCudaTensor_size(state, tensor, d) ==
+ THCudaTensor_size(state, index, d), 4,
"Index tensor must have same size as output tensor apart from the specified dimension");
}
}
@@ -337,15 +352,17 @@ void THCudaTensor_scatterFill(THCState* state, THCudaTensor *tensor, int dim, TH
THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING);
THCudaTensor* oldTensor = NULL;
- if (THC_overlappingIndices(state, tensor)) {
+ if (TensorUtils<THCudaTensor>::overlappingIndices(state, tensor)) {
oldTensor = tensor;
tensor = THCudaTensor_newContiguous(state, tensor);
}
- if (THC_canUse32BitIndexMath(state, tensor) &&
- THC_canUse32BitIndexMath(state, index)) {
- TensorInfo<unsigned int> tensorInfo(state, tensor);
- TensorInfo<unsigned int> indexInfo(state, index);
+ if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, tensor) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, index)) {
+ TensorInfo<float, unsigned int> tensorInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, tensor);
+ TensorInfo<float, unsigned int> indexInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, index);
// Specialize for a small number of dimensions.
switch (indexInfo.dims) {
@@ -363,14 +380,16 @@ void THCudaTensor_scatterFill(THCState* state, THCudaTensor *tensor, int dim, TH
break;
}
} else {
- TensorInfo<unsigned long> tensorInfo(state, tensor);
- TensorInfo<unsigned long> indexInfo(state, index);
+ TensorInfo<float, unsigned long> tensorInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, tensor);
+ TensorInfo<float, unsigned long> indexInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, index);
RUN(unsigned long, -1);
}
if (oldTensor) {
- THCudaTensor_copyIgnoringOverlaps(state, oldTensor, tensor);
+ TensorUtils<THCudaTensor>::copyIgnoringOverlaps(state, oldTensor, tensor);
THCudaTensor_free(state, tensor);
tensor = oldTensor;
}
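All three scatter/gather entry points now follow the same recipe: check overlappingIndices and work on a contiguous copy if needed, build TensorInfo<float, IndexType> structures through getTensorInfo, and convert each linear thread index into per-tensor offsets by walking sizes/strides from the innermost dimension outwards. That last step is the part worth seeing in isolation; the sketch below is a cut-down stand-in (SimpleTensorInfo, offsetOf and MAX_DIMS are my names), with only the sizes/strides/dims fields mirroring the real struct.

    // sketch_tensor_info.cu -- compile with nvcc; illustrative only.
    #define MAX_DIMS 8

    template <typename T, typename IndexType>
    struct SimpleTensorInfo {
      T* data;
      IndexType sizes[MAX_DIMS];
      IndexType strides[MAX_DIMS];
      int dims;
    };

    // Offset of element `linearId` in a row-major walk over `info`.
    template <typename T, typename IndexType>
    __host__ __device__ IndexType
    offsetOf(IndexType linearId, const SimpleTensorInfo<T, IndexType>& info) {
      IndexType offset = 0;
      for (int d = info.dims - 1; d >= 0; --d) {
        IndexType idx = linearId % info.sizes[d];   // coordinate along dim d
        offset += idx * info.strides[d];
        linearId /= info.sizes[d];
      }
      return offset;
    }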
diff --git a/lib/THC/THCTensorSort.cu b/lib/THC/THCTensorSort.cu
index c35a36a..3b9562d 100644
--- a/lib/THC/THCTensorSort.cu
+++ b/lib/THC/THCTensorSort.cu
@@ -1,6 +1,7 @@
#include "THCReduceApplyUtils.cuh"
#include "THCSortUtils.cuh"
#include "THCTensorCopy.h"
+#include "THCTensorTypeUtils.cuh"
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
@@ -28,7 +29,7 @@ unsigned long nextHighestPowerOf2(unsigned long n) {
// `sliceSize - 1`.
template <typename IndexType, int Dim>
__global__ void
-fillSliceWithIndex(TensorInfo<IndexType> out,
+fillSliceWithIndex(TensorInfo<float, IndexType> out,
IndexType totalSlices,
IndexType sliceSize,
IndexType sliceStride) {
@@ -39,7 +40,7 @@ fillSliceWithIndex(TensorInfo<IndexType> out,
}
const unsigned long offset =
- IndexToOffset<IndexType, Dim>::get(slice, out);
+ IndexToOffset<float, IndexType, Dim>::get(slice, out);
float* base = &out.data[offset];
for (long i = threadIdx.x; i < sliceSize; i += blockDim.x) {
@@ -76,9 +77,10 @@ void THCudaTensor_fillSliceWithIndex(THCState* state,
<<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
info, numSlices, sliceSize, info.strides[collapseDim])
- if (THC_canUse32BitIndexMath(state, t)) {
- TensorInfo<unsigned int> info(state, t, dim);
- info.sizes[dim] = 1;
+ if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, t)) {
+ TensorInfo<float, unsigned int> info =
+ getTensorInfo<THCudaTensor, unsigned int>(state, t);
+ info.reduceDim(dim);
int collapseDim = info.collapseDims(dim);
if (info.isContiguous()) {
@@ -93,8 +95,9 @@ void THCudaTensor_fillSliceWithIndex(THCState* state,
}
}
} else {
- TensorInfo<unsigned long> info(state, t, dim);
- info.sizes[dim] = 1;
+ TensorInfo<float, unsigned long> info =
+ getTensorInfo<THCudaTensor, unsigned long>(state, t);
+ info.reduceDim(dim);
int collapseDim = info.collapseDims(dim);
// catch-all implementation
@@ -221,13 +224,15 @@ THC_API void THCudaTensor_sortKeyValueInplace(THCState* state,
// The constructed key/value tensor info is used to select the slice
// we are sorting on a per-block basis
- if (THC_canUse32BitIndexMath(state, key)) {
- TensorInfo<unsigned int> keyInfo(state, key);
- keyInfo.sizes[dim] = 1;
+ if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, key)) {
+ TensorInfo<float, unsigned int> keyInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, key);
+ keyInfo.reduceDim(dim);
int collapseKeyDim = keyInfo.collapseDims(dim);
- TensorInfo<unsigned int> valueInfo(state, value);
- valueInfo.sizes[dim] = 1;
+ TensorInfo<float, unsigned int> valueInfo =
+ getTensorInfo<THCudaTensor, unsigned int>(state, value);
+ valueInfo.reduceDim(dim);
int collapseValueDim = valueInfo.collapseDims(dim);
if (keyInfo.isContiguous()) {
@@ -246,12 +251,14 @@ THC_API void THCudaTensor_sortKeyValueInplace(THCState* state,
}
}
} else {
- TensorInfo<unsigned long> keyInfo(state, key);
- keyInfo.sizes[dim] = 1;
+ TensorInfo<float, unsigned long> keyInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, key);
+ keyInfo.reduceDim(dim);
int collapseKeyDim = keyInfo.collapseDims(dim);
- TensorInfo<unsigned long> valueInfo(state, value);
- valueInfo.sizes[dim] = 1;
+ TensorInfo<float, unsigned long> valueInfo =
+ getTensorInfo<THCudaTensor, unsigned long>(state, value);
+ valueInfo.reduceDim(dim);
int collapseValueDim = valueInfo.collapseDims(dim);
// long case is rare, just instantiate these versions
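The sort path above reduces the sorted dimension out of the TensorInfo (info.reduceDim(dim) plus collapseDims) so that each CUDA block can pick one slice and work on it independently. For the contiguous fast path, the index-fill kernel it feeds amounts to the sketch below; the kernel name and the contiguity assumption are mine, but the detail that indices are stored as float matches the file.

    // sketch_fill_slice_with_index.cu -- contiguous case only, illustrative.
    __global__ void fillSliceWithIndexContig(float* out,
                                             long numSlices,
                                             long sliceSize) {
      for (long slice = blockIdx.x; slice < numSlices; slice += gridDim.x) {
        float* base = out + slice * sliceSize;       // slices are back to back
        for (long i = threadIdx.x; i < sliceSize; i += blockDim.x)
          base[i] = (float) i;                       // original position, as float
      }
    }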
diff --git a/lib/THC/THCTensorTopK.cu b/lib/THC/THCTensorTopK.cu
index a8c189f..582143e 100644
--- a/lib/THC/THCTensorTopK.cu
+++ b/lib/THC/THCTensorTopK.cu
@@ -3,6 +3,7 @@
#include "THCTensorSort.h"
#include "THCAsmUtils.cuh"
#include "THCScanUtils.cuh"
+#include "THCTensorTypeUtils.cuh"
#include <algorithm> // for std::min
#if CUDA_VERSION >= 7000
@@ -248,18 +249,18 @@ __device__ void radixSelect(const RadixConverter& conv,
}
template <typename IndexType, int Dim, bool Order>
-__global__ void gatherTopK(TensorInfo<IndexType> input,
+__global__ void gatherTopK(TensorInfo<float, IndexType> input,
IndexType inputSliceSize,
IndexType outputSliceSize, // aka `k`
IndexType numInputSlices,
IndexType inputWithinSliceStride,
- TensorInfo<IndexType> topK,
+ TensorInfo<float, IndexType> topK,
IndexType numTopKSlices,
IndexType topKWithinSliceStride,
- TensorInfo<IndexType> indices,
+ TensorInfo<float, IndexType> indices,
IndexType indicesWithinSliceStride) {
   // Indices are limited to the integer precision of fp32 (they are stored as
   // floats), so counts fit in int32 regardless of IndexType

@@ -272,11 +273,11 @@ __global__ void gatherTopK(TensorInfo<IndexType> input,
// Find the start offset for our slice
IndexType sliceStartIndex =
- IndexToOffset<IndexType, Dim>::get(slice, input);
+ IndexToOffset<float, IndexType, Dim>::get(slice, input);
IndexType topKSliceStartIndex =
- IndexToOffset<IndexType, Dim>::get(slice, topK);
+ IndexToOffset<float, IndexType, Dim>::get(slice, topK);
IndexType indicesSliceStartIndex =
- IndexToOffset<IndexType, Dim>::get(slice, indices);
+ IndexToOffset<float, IndexType, Dim>::get(slice, indices);
float* inputSliceStart = &input.data[sliceStartIndex];
float* topKSliceStart = &topK.data[topKSliceStartIndex];
@@ -414,8 +415,8 @@ THC_API void THCudaTensor_topk(THCState* state,
THCudaTensor_resize(state, indices, topKSize, NULL);
THLongStorage_free(topKSize);
-#define RUN_K(T, DIM, DIR) \
- gatherTopK<T, DIM, DIR> \
+#define RUN_K(INDEX_T, DIM, DIR) \
+ gatherTopK<INDEX_T, DIM, DIR> \
<<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
inputInfo, \
sliceSize, \
@@ -430,70 +431,73 @@ THC_API void THCudaTensor_topk(THCState* state,
indicesInfo, \
indicesInfo.strides[collapseIndicesDim])
-#define RUN_DIR(T, DIM) \
+#define RUN_DIR(INDEX_T, DIM) \
if (dir) { \
- RUN_K(T, DIM, true); \
+ RUN_K(INDEX_T, DIM, true); \
} else { \
- RUN_K(T, DIM, false); \
+ RUN_K(INDEX_T, DIM, false); \
}
-#define RUN_DIM(T) \
+#define RUN_DIM(INDEX_T) \
if (allDims == 1) { \
- RUN_DIR(T, 1); \
+ RUN_DIR(INDEX_T, 1); \
} else if (allDims == 2) { \
- RUN_DIR(T, 2); \
+ RUN_DIR(INDEX_T, 2); \
} else if (allDims == 3) { \
- RUN_DIR(T, 3); \
+ RUN_DIR(INDEX_T, 3); \
} else { \
- RUN_DIR(T, -1); \
+ RUN_DIR(INDEX_T, -1); \
}
-#define RUN_T(T) \
- TensorInfo<T> inputInfo(state, input); \
- TensorInfo<T> topKInfo(state, topK); \
- TensorInfo<T> indicesInfo(state, indices); \
+#define RUN_T(INDEX_T) \
+ TensorInfo<float, INDEX_T> inputInfo = \
+ getTensorInfo<THCudaTensor, INDEX_T>(state, input); \
+ TensorInfo<float, INDEX_T> topKInfo = \
+ getTensorInfo<THCudaTensor, INDEX_T>(state, topK); \
+ TensorInfo<float, INDEX_T> indicesInfo = \
+ getTensorInfo<THCudaTensor, INDEX_T>(state, indices); \
\
- /* We use these structures solely to find the offset to */ \
- /* each slice we are operating on */ \
- inputInfo.sizes[dim] = 1; \
- topKInfo.sizes[dim] = 1; \
- indicesInfo.sizes[dim] = 1; \
+ /* We use these structures solely to find the offset to */ \
+ /* each slice we are operating on */ \
+ inputInfo.sizes[dim] = 1; \
+ topKInfo.sizes[dim] = 1; \
+ indicesInfo.sizes[dim] = 1; \
\
- /* Collapse all other dims */ \
- int collapseInputDim = inputInfo.collapseDims(dim); \
- int collapseTopKDim = topKInfo.collapseDims(dim); \
- int collapseIndicesDim = indicesInfo.collapseDims(dim); \
+ /* Collapse all other dims */ \
+ int collapseInputDim = inputInfo.collapseDims(dim); \
+ int collapseTopKDim = topKInfo.collapseDims(dim); \
+ int collapseIndicesDim = indicesInfo.collapseDims(dim); \
\
- long inputSlices = 1; \
- long topKSlices = 1; \
- for (int i = 0; i < numDims; ++i) { \
- inputSlices *= inputInfo.sizes[i]; \
- topKSlices *= topKInfo.sizes[i]; \
- } \
+ long inputSlices = 1; \
+ long topKSlices = 1; \
+ for (int i = 0; i < numDims; ++i) { \
+ inputSlices *= inputInfo.sizes[i]; \
+ topKSlices *= topKInfo.sizes[i]; \
+ } \
\
- dim3 grid; \
- if (!THC_getGridFromTiles(inputSlices, grid)) { \
- THError("Slice to sort is too large"); \
- } \
+ dim3 grid; \
+ if (!THC_getGridFromTiles(inputSlices, grid)) { \
+ THError("Slice to sort is too large"); \
+ } \
\
- dim3 block(std::min(THCRoundUp(sliceSize, 32L), 1024L)); \
+ dim3 block(std::min(THCRoundUp(sliceSize, 32L), 1024L)); \
\
- /* This is used as a template parameter to calculate indices. */ \
- /* We only specialize it if all collapsed dim sizes are the */ \
- /* same; otherwise, we use -1 which is the specialization */ \
- /* parameter for arbitrary dimensions */ \
- int allDims = inputInfo.dims; \
- if (topKInfo.dims != allDims || indicesInfo.dims != allDims) { \
- allDims = -1; \
- } \
+ /* This is used as a template parameter to calculate indices. */ \
+ /* We only specialize it if all collapsed dim sizes are the */ \
+ /* same; otherwise, we use -1 which is the specialization */ \
+ /* parameter for arbitrary dimensions */ \
+ int allDims = inputInfo.dims; \
+ if (topKInfo.dims != allDims || indicesInfo.dims != allDims) { \
+ allDims = -1; \
+ } \
\
- RUN_DIM(T);
+ RUN_DIM(INDEX_T);
// Based on required index size, run the algorithm with the
// appropriate index type
- if (THC_canUse32BitIndexMath(state, input) &&
- THC_canUse32BitIndexMath(state, topK) &&
- THC_canUse32BitIndexMath(state, indices)) {
+ if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, topK) &&
+ TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) {
RUN_T(unsigned int);
} else {
RUN_T(unsigned long);
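The tail of THCudaTensor_topk is the canonical index-width dispatch used throughout this commit: if every participating tensor's element count and largest element offset fit below UINT_MAX, the kernels are instantiated with unsigned int indices (cheaper arithmetic, fewer registers), otherwise with unsigned long. A host-side sketch of that decision, with a simplified stand-in for canUse32BitIndexMath:

    // sketch_index_dispatch.cpp -- illustrative; not the THC implementation.
    #include <climits>
    #include <cstdio>

    static bool canUse32Bit(long nElements, long maxOffset) {
      return nElements < UINT_MAX && maxOffset < UINT_MAX;
    }

    template <typename IndexType>
    static void runKernelSketch(long n) {
      std::printf("dispatching with %zu-byte indices for %ld elements\n",
                  sizeof(IndexType), n);
    }

    int main() {
      long n = 1L << 20;
      if (canUse32Bit(n, n - 1)) runKernelSketch<unsigned int>(n);
      else                       runKernelSketch<unsigned long>(n);
      return 0;
    }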
diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu
new file mode 100644
index 0000000..fc9bd60
--- /dev/null
+++ b/lib/THC/THCTensorTypeUtils.cu
@@ -0,0 +1,210 @@
+#include "THCTensorTypeUtils.cuh"
+#include "THCTensor.h"
+#include "THCTensorCopy.h"
+#include "THCHalf.h"
+#include <stdlib.h>
+
+namespace {
+
+struct SizeAndStride {
+ long size;
+ long stride;
+};
+
+int compareSizeAndStride(const void* a, const void* b) {
+ const SizeAndStride* aS = (const SizeAndStride*) a;
+ const SizeAndStride* bS = (const SizeAndStride*) b;
+
+ return aS->stride < bS->stride;
+}
+
+}
+
+#define IMPL_TENSOR_UTILS(TENSOR_TYPE, DATA_TYPE) \
+ \
+TENSOR_TYPE* \
+TensorUtils<TENSOR_TYPE>::newTensor(THCState* state) { \
+ return TENSOR_TYPE##_new(state); \
+} \
+ \
+TENSOR_TYPE* \
+TensorUtils<TENSOR_TYPE>::newContiguous(THCState* state, \
+ TENSOR_TYPE* t) { \
+ return TENSOR_TYPE##_newContiguous(state, t); \
+} \
+ \
+void \
+TensorUtils<TENSOR_TYPE>::retain(THCState* state, \
+ TENSOR_TYPE* t) { \
+ TENSOR_TYPE##_retain(state, t); \
+} \
+ \
+void \
+TensorUtils<TENSOR_TYPE>::free(THCState* state, \
+ TENSOR_TYPE* t) { \
+ TENSOR_TYPE##_free(state, t); \
+} \
+ \
+void \
+TensorUtils<TENSOR_TYPE>::freeCopyTo(THCState* state, \
+ TENSOR_TYPE* src, \
+ TENSOR_TYPE* dst) { \
+ TENSOR_TYPE##_freeCopyTo(state, src, dst); \
+} \
+ \
+void \
+TensorUtils<TENSOR_TYPE>::resizeAs(THCState* state, \
+ TENSOR_TYPE* dst, \
+ TENSOR_TYPE* src) { \
+ TENSOR_TYPE##_resizeAs(state, dst, src); \
+} \
+ \
+DATA_TYPE* \
+TensorUtils<TENSOR_TYPE>::getData(THCState* state, \
+ TENSOR_TYPE* t) { \
+ /* FIXME: no cast is required except for THCudaHalfTensor */ \
+ return (DATA_TYPE*) TENSOR_TYPE##_data(state, t); \
+} \
+ \
+long \
+TensorUtils<TENSOR_TYPE>::getNumElements(THCState* state, \
+ TENSOR_TYPE* t) { \
+ return TENSOR_TYPE##_nElement(state, t); \
+} \
+ \
+long \
+TensorUtils<TENSOR_TYPE>::getSize(THCState* state, \
+ TENSOR_TYPE* t, \
+ int dim) { \
+ return TENSOR_TYPE##_size(state, t, dim); \
+} \
+ \
+long \
+TensorUtils<TENSOR_TYPE>::getStride(THCState* state, \
+ TENSOR_TYPE* t, \
+ int dim) { \
+ return TENSOR_TYPE##_stride(state, t, dim); \
+} \
+ \
+int \
+TensorUtils<TENSOR_TYPE>::getDims(THCState* state, \
+ TENSOR_TYPE* t) { \
+ return TENSOR_TYPE##_nDimension(state, t); \
+} \
+ \
+bool \
+TensorUtils<TENSOR_TYPE>::isContiguous(THCState* state, \
+ TENSOR_TYPE* t) { \
+ return TENSOR_TYPE##_isContiguous(state, t); \
+} \
+ \
+int \
+TensorUtils<TENSOR_TYPE>::getDevice(THCState* state, \
+ TENSOR_TYPE* t) { \
+ return TENSOR_TYPE##_getDevice(state, t); \
+} \
+ \
+void \
+TensorUtils<TENSOR_TYPE>::copyIgnoringOverlaps(THCState* state, \
+ TENSOR_TYPE* dst, \
+ TENSOR_TYPE* src) { \
+ return TENSOR_TYPE##_copyIgnoringOverlaps(state, dst, src); \
+} \
+ \
+bool \
+TensorUtils<TENSOR_TYPE>::overlappingIndices(THCState* state, \
+ TENSOR_TYPE* t) { \
+ /* In this function, we don't care about permutations of the */ \
+ /* size/stride arrays (transpositions). */ \
+ /* We order the size/stride arrays by stride, skipping dimensions of */ \
+ /* size 1. Strides of dimensions of size 1 don't matter, since there */ \
+ /* is only one addressing point in them. */ \
+ /* In this reordered view, the tensor is contiguous if */ \
+ /* stride[dim] == size[dim + 1] * stride[dim + 1] for all `dim`. */ \
+ /* The tensor has holes if */ \
+ /* stride[dim] > size[dim + 1] * stride[dim + 1] for one or more */ \
+ /* `dim`. */ \
+ /* The tensor has overlaps if */ \
+ /* stride[dim] < size[dim + 1] * stride[dim + 1] for one or more */ \
+ /* `dim`, or the innermost stride is 0. */ \
+ \
+ /* Extract size/stride arrays; only consider size >1 dims. */ \
+ SizeAndStride info[MAX_CUTORCH_DIMS]; \
+ \
+ int dims = TensorUtils<TENSOR_TYPE>::getDims(state, t); \
+ int nonSize1Dims = 0; \
+ for (int i = 0; i < dims; ++i) { \
+ long size = TensorUtils<TENSOR_TYPE>::getSize(state, t, i); \
+ if (size > 1) { \
+ info[nonSize1Dims].size = size; \
+ info[nonSize1Dims].stride = \
+ TensorUtils<TENSOR_TYPE>::getStride(state, t, i); \
+ ++nonSize1Dims; \
+ } \
+ } \
+ \
+ if (nonSize1Dims == 0) { \
+ /* no overlap */ \
+ return false; \
+ } \
+ \
+ /* Ascending order (innermost dimension in sorted view is at [0]) */ \
+ qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); \
+ \
+ /* Base case: innermost dimension must have stride >= 1 */ \
+ if (info[nonSize1Dims - 1].stride < 1) { \
+ return true; \
+ } \
+ \
+ /* Subsequent dimensions, if any */ \
+ for (int i = nonSize1Dims - 2; i >= 0; --i) { \
+ if (info[i].stride < info[i + 1].size * info[i + 1].stride) { \
+ /* There are overlaps */ \
+ return true; \
+ } \
+ } \
+ \
+ /* Tensor has holes or is contiguous */ \
+ return false; \
+} \
+ \
+bool \
+TensorUtils<TENSOR_TYPE>::canUse32BitIndexMath(THCState* state, \
+ TENSOR_TYPE* t) { \
+ long elements = TensorUtils<TENSOR_TYPE>::getNumElements(state, t); \
+ if (elements >= UINT_MAX) { \
+ return false; \
+ } \
+ \
+ long offset = 0; \
+ long linearId = elements - 1; \
+ \
+ for (int i = TensorUtils<TENSOR_TYPE>::getDims(state, t) - 1; i >= 0; --i) { \
+ long curDimIndex = \
+ linearId % TensorUtils<TENSOR_TYPE>::getSize(state, t, i); \
+ long curDimOffset = curDimIndex * \
+ TensorUtils<TENSOR_TYPE>::getStride(state, t, i); \
+ offset += curDimOffset; \
+ linearId /= TensorUtils<TENSOR_TYPE>::getSize(state, t, i); \
+ } \
+ \
+ if (offset >= UINT_MAX) { \
+ return false; \
+ } \
+ \
+ return true; \
+}
+
+IMPL_TENSOR_UTILS(THCudaByteTensor, unsigned char)
+IMPL_TENSOR_UTILS(THCudaCharTensor, char)
+IMPL_TENSOR_UTILS(THCudaShortTensor, short)
+IMPL_TENSOR_UTILS(THCudaIntTensor, int)
+IMPL_TENSOR_UTILS(THCudaLongTensor, long)
+IMPL_TENSOR_UTILS(THCudaTensor, float)
+IMPL_TENSOR_UTILS(THCudaDoubleTensor, double)
+
+#ifdef CUDA_HALF_TENSOR
+IMPL_TENSOR_UTILS(THCudaHalfTensor, half);
+#endif
+
+#undef IMPL_TENSOR_UTILS
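The overlappingIndices body in the macro above is the one non-trivial algorithm in this file: drop size-1 dimensions, order the rest by stride, then report an overlap whenever an outer stride is smaller than the extent (size * stride) of the next inner dimension, or the innermost stride is below 1. A standalone restatement follows; it is my own transcription with std::sort in place of qsort, not a drop-in for the macro.

    // sketch_overlapping_indices.cpp -- host-side restatement, illustrative only.
    #include <algorithm>
    #include <vector>

    struct Dim { long size; long stride; };

    static bool overlappingIndices(const std::vector<Dim>& dims) {
      std::vector<Dim> d;
      for (size_t i = 0; i < dims.size(); ++i)
        if (dims[i].size > 1) d.push_back(dims[i]);  // size-1 dims cannot overlap
      if (d.empty()) return false;

      std::sort(d.begin(), d.end(),
                [](const Dim& a, const Dim& b) { return a.stride < b.stride; });

      if (d.front().stride < 1) return true;         // innermost stride 0: overlap
      for (size_t i = 1; i < d.size(); ++i)
        if (d[i].stride < d[i - 1].size * d[i - 1].stride)
          return true;                               // outer stride too small
      return false;                                  // contiguous, or holes only
    }

    int main() {
      // A 2x3 view whose rows alias the same 3 elements (both strides are 1).
      std::vector<Dim> dims = {{2, 1}, {3, 1}};
      return overlappingIndices(dims) ? 0 : 1;       // expect overlap, exit 0
    }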
diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh
new file mode 100644
index 0000000..4456f47
--- /dev/null
+++ b/lib/THC/THCTensorTypeUtils.cuh
@@ -0,0 +1,184 @@
+#ifndef THC_TENSOR_TYPE_UTILS_INC
+#define THC_TENSOR_TYPE_UTILS_INC
+
+#include <cuda.h>
+#include <assert.h>
+#include "THCGeneral.h"
+#include "THCHalf.h"
+#include "THCTensor.h"
+#include "THCTensorInfo.cuh"
+
+/// A utility for accessing THCuda*Tensor types in a generic manner
+
+/// Equivalent to C++11's type_traits std::is_same; used for comparing
+/// equality of types. Don't assume the existence of C++11
+template <typename T, typename U>
+struct SameType {
+ static const bool same = false;
+};
+
+template <typename T>
+struct SameType<T, T> {
+ static const bool same = true;
+};
+
+template <typename T, typename U>
+bool isSameType() {
+ return SameType<T, U>::same;
+}
+
+template <typename TensorType>
+struct TensorUtils {
+};
+
+#define TENSOR_UTILS(TENSOR_TYPE, DATA_TYPE) \
+ template <> \
+ struct TensorUtils<TENSOR_TYPE> { \
+ typedef DATA_TYPE DataType; \
+ \
+ static TENSOR_TYPE* newTensor(THCState* state); \
+ static TENSOR_TYPE* newContiguous(THCState* state, TENSOR_TYPE* t); \
+ static void retain(THCState* state, TENSOR_TYPE* t); \
+ static void free(THCState* state, TENSOR_TYPE* t); \
+ static void freeCopyTo(THCState* state, TENSOR_TYPE* src, \
+ TENSOR_TYPE* dst); \
+ static void resizeAs(THCState* state, TENSOR_TYPE* dst, \
+ TENSOR_TYPE* src); \
+ static DATA_TYPE* getData(THCState* state, TENSOR_TYPE* t); \
+ static long getNumElements(THCState* state, TENSOR_TYPE* t); \
+ static long getSize(THCState* state, TENSOR_TYPE* t, int dim); \
+ static long getStride(THCState* state, TENSOR_TYPE* t, int dim); \
+ static int getDims(THCState* state, TENSOR_TYPE* t); \
+ static bool isContiguous(THCState* state, TENSOR_TYPE* t); \
+ static int getDevice(THCState* state, TENSOR_TYPE* t); \
+ static void copyIgnoringOverlaps(THCState* state, \
+ TENSOR_TYPE* dst, TENSOR_TYPE* src); \
+ /* Determines if the given tensor has overlapping data points (i.e., */ \
+ /* is there more than one index into the tensor that references */ \
+ /* the same piece of data)? */ \
+ static bool overlappingIndices(THCState* state, TENSOR_TYPE* t); \
+ /* Can we use 32 bit math for indexing? */ \
+ static bool canUse32BitIndexMath(THCState* state, TENSOR_TYPE* t); \
+ }
+
+TENSOR_UTILS(THCudaByteTensor, unsigned char);
+TENSOR_UTILS(THCudaCharTensor, char);
+TENSOR_UTILS(THCudaShortTensor, short);
+TENSOR_UTILS(THCudaIntTensor, int);
+TENSOR_UTILS(THCudaLongTensor, long);
+TENSOR_UTILS(THCudaTensor, float);
+TENSOR_UTILS(THCudaDoubleTensor, double);
+
+#ifdef CUDA_HALF_TENSOR
+TENSOR_UTILS(THCudaHalfTensor, half);
+#endif
+
+#undef TENSOR_UTILS
+
+template <typename TensorType, typename IndexType>
+TensorInfo<typename TensorUtils<TensorType>::DataType, IndexType>
+getTensorInfo(THCState* state, TensorType* t) {
+ IndexType sz[MAX_CUTORCH_DIMS];
+ IndexType st[MAX_CUTORCH_DIMS];
+
+ int dims = TensorUtils<TensorType>::getDims(state, t);
+ for (int i = 0; i < dims; ++i) {
+ sz[i] = TensorUtils<TensorType>::getSize(state, t, i);
+ st[i] = TensorUtils<TensorType>::getStride(state, t, i);
+ }
+
+ return TensorInfo<typename TensorUtils<TensorType>::DataType, IndexType>(
+ TensorUtils<TensorType>::getData(state, t), dims, sz, st);
+}
+
+/// `half` has some type conversion issues associated with it, since it
+/// is a struct without a constructor/implicit conversion constructor.
+/// We use this to convert scalar values to the given type that the
+/// tensor expects.
+template <typename In, typename Out>
+struct ScalarConvert {
+ static __host__ __device__ Out to(const In v) { return (Out) v; }
+};
+
+template <typename T>
+struct ScalarNegate {
+ static __host__ __device__ T to(const T v) { return -v; }
+};
+
+template <typename T>
+struct ScalarInv {
+ static __host__ __device__ T to(const T v) { return ((T) 1) / v; }
+};
+
+#ifdef CUDA_HALF_TENSOR
+
+template <typename Out>
+struct ScalarConvert<half, Out> {
+ static __host__ __device__ Out to(const half v) {
+#ifdef __CUDA_ARCH__
+ return (Out) __half2float(v);
+#else
+ return (Out) THC_half2float(v);
+#endif
+ }
+};
+
+template <typename In>
+struct ScalarConvert<In, half> {
+ static __host__ __device__ half to(const In v) {
+#ifdef __CUDA_ARCH__
+ return __float2half((float) v);
+#else
+ return THC_float2half((float) v);
+#endif
+ }
+};
+
+template <>
+struct ScalarConvert<half, half> {
+ static __host__ __device__ half to(const half v) {
+ return v;
+ }
+};
+
+template <>
+struct ScalarNegate<half> {
+ static __host__ __device__ half to(const half v) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+ return __hneg(v);
+#else
+ return __float2half(-__half2float(v));
+#endif
+#else
+ half out = v;
+ out.x ^= 0x8000; // toggle sign bit
+ return out;
+#endif
+ }
+};
+
+template <>
+struct ScalarInv<half> {
+ static __host__ __device__ half to(const half v) {
+#ifdef __CUDA_ARCH__
+ return __float2half(1.0f / __half2float(v));
+#else
+ float fv = THC_half2float(v);
+ fv = 1.0f / fv;
+ return THC_float2half(fv);
+#endif
+ }
+};
+
+inline bool operator==(half a, half b) {
+ return a.x == b.x;
+}
+
+inline bool operator!=(half a, half b) {
+ return a.x != b.x;
+}
+
+#endif // CUDA_HALF_TENSOR
+
+#endif // THC_TENSOR_TYPE_UTILS_INC
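ScalarConvert (with its ScalarNegate/ScalarInv companions) exists so that generic kernels can accept a scalar in one type and store it in the tensor's element type without per-type branches; half is the only case that needs real work, because the struct has no implicit conversions. The fragment below shows the caller's side of that idea with a reduced trait of my own (ConvertSketch); only the half conversion intrinsics are the same ones the header uses.

    // sketch_scalar_convert.cu -- compile with nvcc; illustrative only.
    #include <cuda_fp16.h>

    template <typename In, typename Out>
    struct ConvertSketch {                        // generic case: a plain cast
      static __host__ __device__ Out to(In v) { return (Out) v; }
    };

    template <typename In>
    struct ConvertSketch<In, half> {              // half needs an explicit conversion
      static __device__ half to(In v) { return __float2half((float) v); }
    };

    template <typename T>
    __global__ void fillWithDouble(T* data, long n, double value) {
      long i = blockIdx.x * (long)blockDim.x + threadIdx.x;
      if (i < n) data[i] = ConvertSketch<double, T>::to(value);
    }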
diff --git a/lib/THC/generic/THCStorage.c b/lib/THC/generic/THCStorage.c
index d8fec77..61ba125 100644
--- a/lib/THC/generic/THCStorage.c
+++ b/lib/THC/generic/THCStorage.c
@@ -17,29 +17,18 @@ int THCStorage_(elementSize)(THCState *state)
return sizeof(real);
}
-void THCStorage_(set)(THCState *state, THCStorage *self, long index, hostreal _value)
+void THCStorage_(set)(THCState *state, THCStorage *self, long index, real value)
{
THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
- real value = hostrealToReal(_value);
THCudaCheck(cudaMemcpy(self->data + index, &value, sizeof(real), cudaMemcpyHostToDevice));
}
-hostreal THCStorage_(get)(THCState *state, const THCStorage *self, long index)
+real THCStorage_(get)(THCState *state, const THCStorage *self, long index)
{
THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
-#ifndef THC_REAL_IS_HALF
real value;
THCudaCheck(cudaMemcpy(&value, self->data + index, sizeof(real), cudaMemcpyDeviceToHost));
- return realToHostreal(value);
-#else
- float *ret_d;
- float ret;
- THCudaCheck(THCudaMalloc(state, (void**)&ret_d, sizeof(float)));
- THCHalf2Float(state, ret_d, self->data + index, 1);
- THCudaCheck(cudaMemcpy(&ret, ret_d, sizeof(float), cudaMemcpyDeviceToHost));
- THCudaFree(state, ret_d);
- return ret;
-#endif
+ return value;
}
THCStorage* THCStorage_(new)(THCState *state)
@@ -80,14 +69,14 @@ THCStorage* THCStorage_(newWithSize)(THCState *state, long size)
}
}
-THCStorage* THCStorage_(newWithSize1)(THCState *state, hostreal data0)
+THCStorage* THCStorage_(newWithSize1)(THCState *state, real data0)
{
THCStorage *self = THCStorage_(newWithSize)(state, 1);
THCStorage_(set)(state, self, 0, data0);
return self;
}
-THCStorage* THCStorage_(newWithSize2)(THCState *state, hostreal data0, hostreal data1)
+THCStorage* THCStorage_(newWithSize2)(THCState *state, real data0, real data1)
{
THCStorage *self = THCStorage_(newWithSize)(state, 2);
THCStorage_(set)(state, self, 0, data0);
@@ -95,7 +84,7 @@ THCStorage* THCStorage_(newWithSize2)(THCState *state, hostreal data0, hostreal
return self;
}
-THCStorage* THCStorage_(newWithSize3)(THCState *state, hostreal data0, hostreal data1, hostreal data2)
+THCStorage* THCStorage_(newWithSize3)(THCState *state, real data0, real data1, real data2)
{
THCStorage *self = THCStorage_(newWithSize)(state, 3);
THCStorage_(set)(state, self, 0, data0);
@@ -104,7 +93,7 @@ THCStorage* THCStorage_(newWithSize3)(THCState *state, hostreal data0, hostreal
return self;
}
-THCStorage* THCStorage_(newWithSize4)(THCState *state, hostreal data0, hostreal data1, hostreal data2, hostreal data3)
+THCStorage* THCStorage_(newWithSize4)(THCState *state, real data0, real data1, real data2, real data3)
{
THCStorage *self = THCStorage_(newWithSize)(state, 4);
THCStorage_(set)(state, self, 0, data0);
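
With hostreal gone, THCStorage_(set) and THCStorage_(get) above move a single `real` between host and device per call. A minimal standalone sketch of that one-element cudaMemcpy pattern (plain CUDA, no THC state, error checking omitted):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  float *d_data = NULL;
  cudaMalloc((void**)&d_data, 4 * sizeof(float));

  // "set": copy a single host value into device element 2.
  float value = 3.5f;
  cudaMemcpy(d_data + 2, &value, sizeof(float), cudaMemcpyHostToDevice);

  // "get": copy device element 2 back to the host.
  float out = 0.0f;
  cudaMemcpy(&out, d_data + 2, sizeof(float), cudaMemcpyDeviceToHost);
  printf("element 2 = %f\n", out);

  cudaFree(d_data);
  return 0;
}
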
diff --git a/lib/THC/generic/THCStorage.cu b/lib/THC/generic/THCStorage.cu
index ff0d3c9..17924f5 100644
--- a/lib/THC/generic/THCStorage.cu
+++ b/lib/THC/generic/THCStorage.cu
@@ -2,10 +2,9 @@
#define THC_GENERIC_FILE "generic/THCStorage.cu"
#else
-void THCStorage_(fill)(THCState *state, THCStorage *self, hostreal _value)
+void THCStorage_(fill)(THCState *state, THCStorage *self, real value)
{
thrust::device_ptr<real> self_data(self->data);
- real value = hostrealToReal(_value);
thrust::fill(
#if CUDA_VERSION >= 7000
thrust::cuda::par.on(THCState_getCurrentStream(state)),
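
A standalone sketch of the fill pattern above: wrap the raw device pointer in a thrust::device_ptr and run thrust::fill on an explicit stream via thrust::cuda::par.on (CUDA 7.0+); the THCStorage plumbing is omitted:

#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/system/cuda/execution_policy.h>
#include <cuda_runtime.h>

int main() {
  const int n = 1024;
  float *d_data = NULL;
  cudaMalloc((void**)&d_data, n * sizeof(float));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Same shape as THCStorage_(fill): device_ptr + thrust::fill on a stream.
  thrust::device_ptr<float> p(d_data);
  thrust::fill(thrust::cuda::par.on(stream), p, p + n, 1.0f);

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  cudaFree(d_data);
  return 0;
}
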
diff --git a/lib/THC/generic/THCStorage.h b/lib/THC/generic/THCStorage.h
index f161d5f..a8c5f5f 100644
--- a/lib/THC/generic/THCStorage.h
+++ b/lib/THC/generic/THCStorage.h
@@ -23,15 +23,15 @@ THC_API long THCStorage_(size)(THCState *state, const THCStorage*);
THC_API int THCStorage_(elementSize)(THCState *state);
/* slow access -- checks everything */
-THC_API void THCStorage_(set)(THCState *state, THCStorage*, long, hostreal);
-THC_API hostreal THCStorage_(get)(THCState *state, const THCStorage*, long);
+THC_API void THCStorage_(set)(THCState *state, THCStorage*, long, real);
+THC_API real THCStorage_(get)(THCState *state, const THCStorage*, long);
THC_API THCStorage* THCStorage_(new)(THCState *state);
THC_API THCStorage* THCStorage_(newWithSize)(THCState *state, long size);
-THC_API THCStorage* THCStorage_(newWithSize1)(THCState *state, hostreal);
-THC_API THCStorage* THCStorage_(newWithSize2)(THCState *state, hostreal, hostreal);
-THC_API THCStorage* THCStorage_(newWithSize3)(THCState *state, hostreal, hostreal, hostreal);
-THC_API THCStorage* THCStorage_(newWithSize4)(THCState *state, hostreal, hostreal, hostreal, hostreal);
+THC_API THCStorage* THCStorage_(newWithSize1)(THCState *state, real);
+THC_API THCStorage* THCStorage_(newWithSize2)(THCState *state, real, real);
+THC_API THCStorage* THCStorage_(newWithSize3)(THCState *state, real, real, real);
+THC_API THCStorage* THCStorage_(newWithSize4)(THCState *state, real, real, real, real);
THC_API THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *filename, long size, int shared);
/* takes ownership of data */
@@ -49,6 +49,6 @@ THC_API void THCStorage_(retain)(THCState *state, THCStorage *storage);
THC_API void THCStorage_(free)(THCState *state, THCStorage *storage);
THC_API void THCStorage_(resize)(THCState *state, THCStorage *storage, long size);
-THC_API void THCStorage_(fill)(THCState *state, THCStorage *storage, hostreal value);
+THC_API void THCStorage_(fill)(THCState *state, THCStorage *storage, real value);
#endif
diff --git a/lib/THC/generic/THCStorageCopy.cu b/lib/THC/generic/THCStorageCopy.cu
index b00c122..298f717 100644
--- a/lib/THC/generic/THCStorageCopy.cu
+++ b/lib/THC/generic/THCStorageCopy.cu
@@ -62,7 +62,7 @@ THC_CUDA_STORAGE_IMPLEMENT_COPY(Long,Long)
THC_CUDA_STORAGE_IMPLEMENT_COPY(Float,) // i.e. float
THC_CUDA_STORAGE_IMPLEMENT_COPY(Double,Double)
-#if CUDA_VERSION >= 7050
+#ifdef CUDA_HALF_TENSOR
#define FLOAT_COPY(TYPE) TH_CONCAT_3(TH, CReal, Storage_copyCudaFloat)
void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *self, struct THCudaHalfStorage *src)
{
@@ -77,7 +77,7 @@ void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *self, struct THCudaH
}
}
#undef FLOAT_COPY
-#endif // CUDA_VERSION >= 7050
+#endif // CUDA_HALF_TENSOR
#undef THC_CUDA_STORAGE_IMPLEMENT_COPY
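
The FLOAT_COPY(TYPE) helper above is built with TH's token-pasting macros; for a given CReal (say CudaInt) it should resolve to THCudaIntStorage_copyCudaFloat. An illustrative standalone sketch of the same paste-after-expansion trick, with hypothetical MY_CONCAT_3 macros standing in for TH's:

#include <stdio.h>

#define MY_CONCAT_3(a, b, c) MY_CONCAT_3_EXPAND(a, b, c)
#define MY_CONCAT_3_EXPAND(a, b, c) a ## b ## c

// With CReal standing in for "CudaInt", the pasted name becomes
// THCudaIntStorage_copyCudaFloat, which is what FLOAT_COPY(Real) names.
void THCudaIntStorage_copyCudaFloat(void) { printf("copy float -> int storage\n"); }

#define CReal CudaInt
#define FLOAT_COPY(TYPE) MY_CONCAT_3(TH, CReal, Storage_copyCudaFloat)

int main(void) {
  FLOAT_COPY(Real)();   // calls THCudaIntStorage_copyCudaFloat
  return 0;
}
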
diff --git a/lib/THC/generic/THCStorageCopy.h b/lib/THC/generic/THCStorageCopy.h
index 3313290..c3e5601 100644
--- a/lib/THC/generic/THCStorageCopy.h
+++ b/lib/THC/generic/THCStorageCopy.h
@@ -21,7 +21,7 @@ THC_API void THCStorage_(copyCudaInt)(THCState *state, THCStorage *storage, stru
THC_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, struct THCudaLongStorage *src);
THC_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src);
THC_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src);
-#if CUDA_VERSION >= 7050
+#ifdef CUDA_HALF_TENSOR
THC_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src);
#endif
diff --git a/lib/THC/generic/THCTensor.c b/lib/THC/generic/THCTensor.c
index 2f87f1a..e18044d 100644
--- a/lib/THC/generic/THCTensor.c
+++ b/lib/THC/generic/THCTensor.c
@@ -730,56 +730,56 @@ void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, lon
self->nDimension = 0;
}
-void THCTensor_(set1d)(THCState *state, THCTensor *tensor, long x0, hostreal value)
+void THCTensor_(set1d)(THCState *state, THCTensor *tensor, long x0, real value)
{
THArgCheck(tensor->nDimension == 1, 1, "tensor must have one dimension");
THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range");
THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value);
}
-hostreal THCTensor_(get1d)(THCState *state, const THCTensor *tensor, long x0)
+real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, long x0)
{
THArgCheck(tensor->nDimension == 1, 1, "tensor must have one dimension");
THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range");
return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]);
}
-void THCTensor_(set2d)(THCState *state, THCTensor *tensor, long x0, long x1, hostreal value)
+void THCTensor_(set2d)(THCState *state, THCTensor *tensor, long x0, long x1, real value)
{
THArgCheck(tensor->nDimension == 2, 1, "tensor must have two dimensions");
THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range");
THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value);
}
-hostreal THCTensor_(get2d)(THCState *state, const THCTensor *tensor, long x0, long x1)
+real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, long x0, long x1)
{
THArgCheck(tensor->nDimension == 2, 1, "tensor must have two dimensions");
THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range");
return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]);
}
-void THCTensor_(set3d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, hostreal value)
+void THCTensor_(set3d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, real value)
{
THArgCheck(tensor->nDimension == 3, 1, "tensor must have three dimensions");
THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range");
THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value);
}
-hostreal THCTensor_(get3d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2)
+real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2)
{
THArgCheck(tensor->nDimension == 3, 1, "tensor must have three dimensions");
THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range");
return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]);
}
-void THCTensor_(set4d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, long x3, hostreal value)
+void THCTensor_(set4d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, long x3, real value)
{
THArgCheck(tensor->nDimension == 4, 1, "tensor must have four dimensions");
THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range");
THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value);
}
-hostreal THCTensor_(get4d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2, long x3)
+real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2, long x3)
{
THArgCheck(tensor->nDimension == 4, 1, "tensor must have four dimensions");
THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range");
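
Every setNd/getNd accessor above reduces its indices to a flat storage offset of the form storageOffset + sum_i x_i * stride[i]. A small standalone sketch of that arithmetic, with a hypothetical helper name:

#include <stdio.h>

// Hypothetical helper: flatten an n-dimensional index into a storage offset,
// mirroring the expression used by the setNd/getNd accessors above.
static long flatten_index(long storageOffset, const long *stride,
                          const long *idx, int nDim) {
  long offset = storageOffset;
  for (int d = 0; d < nDim; ++d) {
    offset += idx[d] * stride[d];
  }
  return offset;
}

int main(void) {
  // A 3x4 row-major tensor: stride = {4, 1}, storageOffset = 0.
  long stride[2] = {4, 1};
  long idx[2] = {2, 3};
  printf("offset = %ld\n", flatten_index(0, stride, idx, 2));  // 11
  return 0;
}
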
diff --git a/lib/THC/generic/THCTensor.h b/lib/THC/generic/THCTensor.h
index 8e4d1a4..175eaee 100644
--- a/lib/THC/generic/THCTensor.h
+++ b/lib/THC/generic/THCTensor.h
@@ -112,15 +112,15 @@ THC_API void THCTensor_(free)(THCState *state, THCTensor *self);
THC_API void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst);
/* Slow access methods [check everything] */
-THC_API void THCTensor_(set1d)(THCState *state, THCTensor *tensor, long x0, hostreal value);
-THC_API void THCTensor_(set2d)(THCState *state, THCTensor *tensor, long x0, long x1, hostreal value);
-THC_API void THCTensor_(set3d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, hostreal value);
-THC_API void THCTensor_(set4d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, long x3, hostreal value);
-
-THC_API hostreal THCTensor_(get1d)(THCState *state, const THCTensor *tensor, long x0);
-THC_API hostreal THCTensor_(get2d)(THCState *state, const THCTensor *tensor, long x0, long x1);
-THC_API hostreal THCTensor_(get3d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2);
-THC_API hostreal THCTensor_(get4d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2, long x3);
+THC_API void THCTensor_(set1d)(THCState *state, THCTensor *tensor, long x0, real value);
+THC_API void THCTensor_(set2d)(THCState *state, THCTensor *tensor, long x0, long x1, real value);
+THC_API void THCTensor_(set3d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, real value);
+THC_API void THCTensor_(set4d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, long x3, real value);
+
+THC_API real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, long x0);
+THC_API real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, long x0, long x1);
+THC_API real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2);
+THC_API real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2, long x3);
/* CUDA-specific functions */
THC_API cudaTextureObject_t THCTensor_(getTextureObject)(THCState *state, THCTensor *self);
diff --git a/lib/THC/generic/THCTensorCopy.c b/lib/THC/generic/THCTensorCopy.c
index 68f57bf..e0bcadd 100644
--- a/lib/THC/generic/THCTensorCopy.c
+++ b/lib/THC/generic/THCTensorCopy.c
@@ -126,7 +126,6 @@ IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Long)
IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Float)
IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Double)
-// FIXME: add within-CUDA conversions
void THCTensor_(copyCuda)(THCState *state, THCTensor *self, THCTensor *src)
{
THCTensor_(copy)(state, self, src);
diff --git a/lib/THC/generic/THCTensorCopy.cu b/lib/THC/generic/THCTensorCopy.cu
index c5768e2..4198025 100644
--- a/lib/THC/generic/THCTensorCopy.cu
+++ b/lib/THC/generic/THCTensorCopy.cu
@@ -4,251 +4,44 @@
THC_API void
THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) {
- long totalElements = THCTensor_(nElement)(state, dst);
-
- THArgCheck(totalElements == THCTensor_(nElement)(state, src), 2,
- "sizes do not match");
-
- if (THCTensor_(nDimension)(state, dst) == 0) {
- // Zero-dim tensor; copy nothing
- return;
- }
-
- // We can memcpy the memory if:
- // -both tensors are contiguous; or,
- // -there is only one element to copy; or,
- // -FIXME: if both tensors have matching size and stride arrays, and no
- // holes within (in other words, there is some permutation that can be applied
- // to the size/strides such that the resulting tensor is contiguous).
- bool srcContig = THCTensor_(isContiguous)(state, src);
- bool dstContig = THCTensor_(isContiguous)(state, dst);
- bool memcpyEligible = (srcContig && dstContig) || (totalElements == 1);
-
- int srcDev = THCTensor_(getDevice)(state, src);
- int dstDev = THCTensor_(getDevice)(state, dst);
- int oldDev = curGPU();
-
- // We always perform the copy on the source device, using the
- // current stream on the source device.
- // If the copy is on the default stream, then we fully synchronize
- // both src and dst's default streams for completion of the
- // copy. We have to explicitly do this for non-contig copies.
- // This mimics the behavior of cross-device cudaMemcpyAsync on
- // the default stream.
- // If the copy is not on the default stream, then it is up to the
- // user to add needed synchronization on the dst device, since the
- // stream on the dst device that wishes to synchronize may not be
- // the same index as the one on the src device.
- int copyStreamIndex =
- THCState_getCurrentStreamIndex(state);
- cudaStream_t copyStream =
- THCState_getDeviceStream(state, srcDev, copyStreamIndex);
-
- if (srcDev != dstDev && copyStreamIndex == 0) {
- // This is a cross-device copy on the default stream. We perform a
- // two-way barrier between both devices' default streams before
- // the copy. This ensures that any write-after-write and
- // write-after-read dependencies on the destination side are
- // handled, so that no one is operating on the dst memory when
- // we perform the copy.
- // src waits on dst barrier (src already waits on src)
- cudaEvent_t dstReady;
- THCudaCheck(cudaSetDevice(dstDev));
- THCudaCheck(cudaEventCreateWithFlags(&dstReady, cudaEventDisableTiming));
- THCudaCheck(cudaEventRecord(dstReady, NULL));
-
- THCudaCheck(cudaSetDevice(srcDev));
- THCudaCheck(cudaStreamWaitEvent(NULL, dstReady, 0));
- THCudaCheck(cudaEventDestroy(dstReady));
- } else if (srcDev != oldDev) {
- THCudaCheck(cudaSetDevice(srcDev));
- }
-
- // We are now on srcDev
- if (memcpyEligible) {
- // Perform the copy
- THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state, dst),
- THCTensor_(data)(state, src),
- totalElements * sizeof(real),
- cudaMemcpyDeviceToDevice,
- copyStream));
- } else {
-#if defined(THC_REAL_IS_FLOAT)
- // Non-contiguous copy
-
- // We avoid creating temporary memory copies if possible.
- // If both src and dst are on the same device, or if they are on
- // different devices and p2p access is enabled, perform the copy
- // by a pointwise copy kernel.
- // Otherwise, we'll have to make contiguous (which will in fact
- // invoke copy() again), and then perform the copy.
- // FIXME: might want to consider only running the pointwise kernel
- // if both src and dst innermost dimensions are contiguous. If
- // they are not, then taking the hit of the memory allocation/free
- // might be worth it to avoid non-coalesced reads or writes.
-
- // A device always has access to itself, so this also handles the
- // case srcDev == dstDev
- if (THCState_getPeerToPeerAccess(state, srcDev, dstDev)) {
- // Make sure we have the current stream set in THCState, since
- // pointwise uses that
- if (srcDev != oldDev) {
- THCState_setStream(state, srcDev, copyStreamIndex);
- }
-
- bool succ =
- THCudaTensor_pointwiseApply2(state, dst, src, CopyOp<float>());
- THArgCheck(succ, 2, CUTORCH_DIM_WARNING);
-
- // Restore prior THCState stream
- if (srcDev != oldDev) {
- THCState_setStream(state, oldDev, copyStreamIndex);
- }
- } else {
- // GPUs can't access each other directly; fall back to
- // newContiguous and memcpy
- THCudaTensor* srcContig = THCudaTensor_newContiguous(state, src);
- THCudaTensor* dstContig = dst;
-
- if (!THCudaTensor_isContiguous(state, dst)) {
- // We are copying over the contents of dst, so we don't need
- // to preserve its values. We just need a destination tensor
- // the same size as dst.
-
- // Allocate the tensor on the new device
- THCudaCheck(cudaSetDevice(dstDev));
-
- dstContig = THCudaTensor_new(state);
- THCudaTensor_resizeAs(state, dstContig, dst);
-
- THCudaCheck(cudaSetDevice(srcDev));
- }
-
- THCudaCheck(cudaMemcpyAsync(THCudaTensor_data(state, dstContig),
- THCudaTensor_data(state, srcContig),
- totalElements * sizeof(float),
- cudaMemcpyDeviceToDevice,
- copyStream));
-
- THCudaTensor_free(state, srcContig);
-
- if (dst != dstContig) {
- THCudaTensor_freeCopyTo(state, dstContig, dst);
- }
- }
-#else
-#define STRINGIFY(x) #x
- THError("Non-contiguous copy not implemented for Cuda%sTensor", STRINGIFY(Real));
-#undef STRINGIFY
-#endif
- }
-
- if (srcDev != dstDev && copyStreamIndex == 0) {
- // dst waits on src barrier (dst already waits on dst). We cannot
- // operate on dst's copy until the copy is complete.
-
- // Still on srcDev, record default stream event
- cudaEvent_t srcReady;
- THCudaCheck(cudaEventCreateWithFlags(&srcReady, cudaEventDisableTiming));
- THCudaCheck(cudaEventRecord(srcReady, NULL));
-
- THCudaCheck(cudaSetDevice(dstDev));
- THCudaCheck(cudaStreamWaitEvent(NULL, srcReady, 0));
- THCudaCheck(cudaEventDestroy(srcReady));
-
- // We are now on dstDev (right above). Restore prior device from dst
- if (dstDev != oldDev) {
- THCudaCheck(cudaSetDevice(oldDev));
- }
- } else {
- // We are still on srcDev. Restore prior device from src
- if (srcDev != oldDev) {
- THCudaCheck(cudaSetDevice(oldDev));
- }
- }
+ THC_copyTensor<THCTensor, THCTensor>(state, dst, src);
+}
- cudaError errcode = cudaGetLastError();
- if (errcode != cudaSuccess) {
- THError(cudaGetErrorString(errcode));
- }
+THC_API void
+THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) {
+  // Called when we are copying into an overlapping index `dst`, but
+  // we don't care which writer wins. Hacky but it works.
+  // This is itself invoked by pointwiseApply2 / THCTensor_copy when
+  // there are write overlaps.
+  // FIXME: really, overlapping writes should be illegal/an error in Torch
+ THC_pointwiseApply2(
+ state, dst, src,
+ CopyOp<typename TensorUtils<THCTensor>::DataType,
+ typename TensorUtils<THCTensor>::DataType>(),
+ ReadOnly, /* ignore overwrites */
+ ReadOnly);
}
-// conversions are mediated by the CPU
-// yes, this is slow; feel free to write CUDA kernels for this
-#ifndef THC_REAL_IS_HALF
-#define THC_CUDA_TENSOR_IMPLEMENT_COPY(TYPEC,TYPECUDA) \
- void THCTensor_(copyCuda##TYPEC)(THCState *state, THCTensor *self, struct THCuda##TYPECUDA##Tensor *src) \
- { \
- if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \
- THCTensor_(copy)(state, self, (THCTensor*) src); /* cast just removes compiler warning */ \
- } else { \
- THArgCheck(THCTensor_(nElement)(state, self) == THCuda##TYPECUDA##Tensor_nElement(state, src), 2, "size does not match"); \
- THLongStorage *size = THCuda##TYPECUDA##Tensor_newSizeOf(state, src); \
- TH##TYPEC##Tensor *buffer1 = TH##TYPEC##Tensor_newWithSize(size, NULL); \
- THTensor *buffer2 = THTensor_(newWithSize)(size, NULL); \
- TH##TYPEC##Tensor_copyCuda(state, buffer1, src); \
- THTensor_(copy##TYPEC)(buffer2, buffer1); \
- THCTensor_(copyCPU)(state, self, buffer2); \
- THLongStorage_free(size); \
- TH##TYPEC##Tensor_free(buffer1); \
- THTensor_(free)(buffer2); \
- } \
- }
-#else
-#define THC_CUDA_TENSOR_IMPLEMENT_COPY(TYPEC,TYPECUDA) \
- void THCTensor_(copyCuda##TYPEC)(THCState *state, THCTensor *self, struct THCuda##TYPECUDA##Tensor *src) \
- { \
- THArgCheck(THCTensor_(nElement)(state, self) == THCuda##TYPECUDA##Tensor_nElement(state, src), 2, "size does not match"); \
- if (THCTypeIdx_(TYPEC) == THCTypeIdxFloat) { \
- THCudaTensor *csrc = THCudaTensor_newContiguous(state, (THCudaTensor*) src); /* cast removes compiler error */ \
- THCFloat2Half(state, \
- THCTensor_(data)(state, self), \
- THCudaTensor_data(state, csrc), \
- THCudaTensor_nElement(state, csrc)); \
- THCudaTensor_free(state, csrc); \
- } else { \
- THLongStorage *size = THCuda##TYPECUDA##Tensor_newSizeOf(state, src); \
- THCudaTensor *buffer = THCudaTensor_newWithSize(state, size, NULL); \
- THCudaTensor_copyCuda##TYPEC(state, buffer, src); \
- THCFloat2Half(state, \
- THCTensor_(data)(state, self), \
- THCudaTensor_data(state, buffer), \
- THCudaTensor_nElement(state, buffer)); \
- THCudaTensor_free(state, buffer); \
- THLongStorage_free(size); \
- } \
+#define IMPLEMENT_THC_CUDA_TENSOR_COPY(TYPEC, TYPECUDA) \
+ THC_API void \
+ THCTensor_(copyCuda##TYPEC)(THCState *state, \
+ THCTensor *self, \
+ THCuda##TYPECUDA##Tensor *src) { \
+ THC_copyTensor<THCTensor, THCuda##TYPECUDA##Tensor>(state, self, src); \
}
-#endif
-
-THC_CUDA_TENSOR_IMPLEMENT_COPY(Byte,Byte)
-THC_CUDA_TENSOR_IMPLEMENT_COPY(Char,Char)
-THC_CUDA_TENSOR_IMPLEMENT_COPY(Short,Short)
-THC_CUDA_TENSOR_IMPLEMENT_COPY(Int,Int)
-THC_CUDA_TENSOR_IMPLEMENT_COPY(Long,Long)
-THC_CUDA_TENSOR_IMPLEMENT_COPY(Float,) // i.e. float
-THC_CUDA_TENSOR_IMPLEMENT_COPY(Double,Double)
-#if CUDA_VERSION >= 7050
-#define FLOAT_COPY(TYPE) TH_CONCAT_3(TH, CReal, Tensor_copyCudaFloat)
-void THCTensor_(copyCudaHalf)(THCState *state, THCTensor *self, struct THCudaHalfTensor *src)
-{
- if(THCTypeIdx_(Real) == THCTypeIdxHalf) {
- THCTensor_(copy)(state, self, (THCTensor*) src); /* cast removes compiler error */
- } else {
- THArgCheck(THCTensor_(nElement)(state, self) == THCudaHalfTensor_nElement(state, src), 2, "size does not match");
- src = THCudaHalfTensor_newContiguous(state, src);
- THLongStorage *size = THCudaHalfTensor_newSizeOf(state, src);
- THCudaTensor *buffer = THCudaTensor_newWithSize(state, size, NULL);
- THCHalf2Float(state, THCudaTensor_data(state, buffer), THCudaHalfTensor_data(state, src), THCudaHalfTensor_nElement(state, src));
- FLOAT_COPY(Real)(state, self, buffer);
- THCudaTensor_free(state, buffer);
- THCudaHalfTensor_free(state, src);
- THLongStorage_free(size);
- }
-}
-#undef FLOAT_COPY
-#endif // CUDA_VERSION >= 7050
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Byte, Byte)
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Char, Char)
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Short, Short)
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Int, Int)
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Long, Long)
+// THCudaTensor aka the non-existent THCudaFloatTensor
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Float, )
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Double, Double)
+#ifdef CUDA_HALF_TENSOR
+IMPLEMENT_THC_CUDA_TENSOR_COPY(Half, Half)
+#endif
-#undef THC_CUDA_TENSOR_IMPLEMENT_COPY
+#undef IMPLEMENT_THC_CUDA_TENSOR_COPY
#endif
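
IMPLEMENT_THC_CUDA_TENSOR_COPY above routes every copy, including cross-type ones, through THC_copyTensor (for Int it expands to THC_copyTensor<THCTensor, THCudaIntTensor>(state, self, src)), replacing the CPU-mediated conversions that were removed. A standalone sketch of the underlying idea, an element-wise converting copy on the device; the kernel below is a stand-in, not THC_copyTensor:

#include <cuda_runtime.h>
#include <cstdio>

// Element-wise converting copy: the basic idea behind routing all
// tensor-to-tensor copies (including cross-type ones) through a single
// device-side path. This stand-in kernel is NOT THC_copyTensor.
__global__ void convertCopy(int *dst, const float *src, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) dst[i] = (int) src[i];
}

int main() {
  const int n = 4;
  float hsrc[n] = {1.5f, 2.5f, 3.5f, 4.5f};
  int hdst[n];

  float *dsrc = NULL; int *ddst = NULL;
  cudaMalloc((void**)&dsrc, n * sizeof(float));
  cudaMalloc((void**)&ddst, n * sizeof(int));
  cudaMemcpy(dsrc, hsrc, n * sizeof(float), cudaMemcpyHostToDevice);

  convertCopy<<<1, 32>>>(ddst, dsrc, n);

  cudaMemcpy(hdst, ddst, n * sizeof(int), cudaMemcpyDeviceToHost);
  printf("%d %d %d %d\n", hdst[0], hdst[1], hdst[2], hdst[3]);  // 1 2 3 4
  cudaFree(dsrc); cudaFree(ddst);
  return 0;
}
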
diff --git a/lib/THC/generic/THCTensorCopy.h b/lib/THC/generic/THCTensorCopy.h
index 8a2837f..3c7649b 100644
--- a/lib/THC/generic/THCTensorCopy.h
+++ b/lib/THC/generic/THCTensorCopy.h
@@ -3,6 +3,7 @@
#else
THC_API void THCTensor_(copy)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(copyIgnoringOverlaps)(THCState *state, THCTensor *self, THCTensor *src);
THC_API void THCTensor_(copyByte)(THCState *state, THCTensor *self, THByteTensor *src);
THC_API void THCTensor_(copyChar)(THCState *state, THCTensor *self, THCharTensor *src);
THC_API void THCTensor_(copyShort)(THCState *state, THCTensor *self, THShortTensor *src);
@@ -18,7 +19,7 @@ THC_API void THCTensor_(copyCudaInt)(THCState *state, THCTensor *storage, struct
THC_API void THCTensor_(copyCudaLong)(THCState *state, THCTensor *storage, struct THCudaLongTensor *src);
THC_API void THCTensor_(copyCudaFloat)(THCState *state, THCTensor *storage, struct THCudaTensor *src);
THC_API void THCTensor_(copyCudaDouble)(THCState *state, THCTensor *storage, struct THCudaDoubleTensor *src);
-#if CUDA_VERSION >= 7050
+#ifdef CUDA_HALF_TENSOR
THC_API void THCTensor_(copyCudaHalf)(THCState *state, THCTensor *storage, struct THCudaHalfTensor *src);
#endif
diff --git a/lib/THC/generic/THCTensorMath.cu b/lib/THC/generic/THCTensorMath.cu
new file mode 100644
index 0000000..a0e550a
--- /dev/null
+++ b/lib/THC/generic/THCTensorMath.cu
@@ -0,0 +1,68 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMath.cu"
+#else
+
+THC_API void
+THCTensor_(fill)(THCState* state, THCTensor *self_, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 1, self_));
+
+ if (!THC_pointwiseApply1(
+ state, self_, TensorFillOp<real>(value))) {
+ THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(zero)(THCState *state, THCTensor *self_)
+{
+ THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ if (THCTensor_(isContiguous)(state, self_)) {
+ THCudaCheck(cudaMemsetAsync(THCTensor_(data)(state, self_),
+ 0,
+ sizeof(real) * THCTensor_(nElement)(state, self_),
+ THCState_getCurrentStream(state)));
+ } else {
+ if (!THC_pointwiseApply1(
+ state, self_,
+ TensorFillOp<real>(ScalarConvert<int, real>::to(0)))) {
+ THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(zeros)(THCState *state, THCTensor *r_, THLongStorage *size)
+{
+ THAssert(THCTensor_(checkGPU)(state, 1, r_));
+ THCTensor_(resize)(state, r_, size, NULL);
+ THCTensor_(zero)(state, r_);
+}
+
+THC_API void
+THCTensor_(ones)(THCState *state, THCTensor *r_, THLongStorage *size)
+{
+ THAssert(THCTensor_(checkGPU)(state, 1, r_));
+ THCTensor_(resize)(state, r_, size, NULL);
+ THCTensor_(fill)(state, r_, ScalarConvert<int, real>::to(1));
+}
+
+THC_API void
+THCTensor_(reshape)(THCState *state, THCTensor *r_, THCTensor *t, THLongStorage *size)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, r_, t));
+ THCTensor_(resize)(state, r_, size, NULL);
+ THCTensor_(copy)(state, r_, t);
+}
+
+long
+THCTensor_(numel)(THCState *state, THCTensor *t)
+{
+ return THCTensor_(nElement)(state, t);
+}
+
+#endif
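
A standalone sketch of the contiguous fast path in THCTensor_(zero) above: when the data is a single dense block, zeroing is one cudaMemsetAsync on the current stream (plain CUDA, no THC state):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const int n = 1 << 20;
  float *d_data = NULL;
  cudaMalloc((void**)&d_data, n * sizeof(float));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Contiguous case: one memset over the whole allocation, queued on a stream.
  cudaMemsetAsync(d_data, 0, n * sizeof(float), stream);
  cudaStreamSynchronize(stream);

  float first = -1.0f;
  cudaMemcpy(&first, d_data, sizeof(float), cudaMemcpyDeviceToHost);
  printf("first element after zero: %f\n", first);

  cudaStreamDestroy(stream);
  cudaFree(d_data);
  return 0;
}
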
diff --git a/lib/THC/generic/THCTensorMath.h b/lib/THC/generic/THCTensorMath.h
new file mode 100644
index 0000000..5c9e66d
--- /dev/null
+++ b/lib/THC/generic/THCTensorMath.h
@@ -0,0 +1,13 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMath.h"
+#else
+
+THC_API void THCTensor_(fill)(THCState *state, THCTensor *self, real value);
+THC_API void THCTensor_(zero)(THCState *state, THCTensor *self);
+
+THC_API void THCTensor_(zeros)(THCState *state, THCTensor *r_, THLongStorage *size);
+THC_API void THCTensor_(ones)(THCState *state, THCTensor *r_, THLongStorage *size);
+THC_API void THCTensor_(reshape)(THCState *state, THCTensor *r_, THCTensor *t, THLongStorage *size);
+THC_API long THCTensor_(numel)(THCState *state, THCTensor *t);
+
+#endif
diff --git a/lib/THC/generic/THCTensorMathPairwise.cu b/lib/THC/generic/THCTensorMathPairwise.cu
new file mode 100644
index 0000000..4a5c09d
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathPairwise.cu
@@ -0,0 +1,74 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathPairwise.cu"
+#else
+
+THC_API void
+THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ if (self_ == src_) {
+ if (!THC_pointwiseApply1(state, self_, TensorAddConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src_);
+
+ if (!THC_pointwiseApply2(state, self_, src_, TensorAddConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
+{
+ THCTensor_(add)(state, self_, src_, ScalarNegate<real>::to(value));
+}
+
+THC_API void
+THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ if (self_ == src_) {
+ if (!THC_pointwiseApply1(state, self_, TensorMulConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src_);
+
+ if (!THC_pointwiseApply2(state, self_, src_, TensorMulConstantOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src_));
+ THArgCheck(value != ScalarConvert<int, real>::to(0), 3, "divide by zero");
+
+ if (self_ == src_) {
+ if (!THC_pointwiseApply1(state, self_,
+ TensorMulConstantOp<real>(
+ ScalarInv<real>::to(value)))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src_);
+
+ if (!THC_pointwiseApply2(state, self_, src_,
+ TensorMulConstantOp<real>(
+ ScalarInv<real>::to(value)))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+#endif
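
THCTensor_(div) above never divides on the device: ScalarInv computes 1/value once on the host, and the pointwise op only multiplies. A minimal standalone illustration of that reciprocal-multiply trick; the kernel and launch are illustrative, not THC's apply machinery:

#include <cuda_runtime.h>
#include <cstdio>

__global__ void mulConstant(float *data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  const int n = 8;
  float host[n];
  for (int i = 0; i < n; ++i) host[i] = (float) i;

  float *d_data = NULL;
  cudaMalloc((void**)&d_data, n * sizeof(float));
  cudaMemcpy(d_data, host, n * sizeof(float), cudaMemcpyHostToDevice);

  // div by 4.0 == mul by (1 / 4.0), with the reciprocal computed once on the host.
  float value = 4.0f;
  mulConstant<<<1, 32>>>(d_data, 1.0f / value, n);

  cudaMemcpy(host, d_data, n * sizeof(float), cudaMemcpyDeviceToHost);
  printf("host[4] = %f\n", host[4]);  // 1.0
  cudaFree(d_data);
  return 0;
}
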
diff --git a/lib/THC/generic/THCTensorMathPairwise.h b/lib/THC/generic/THCTensorMathPairwise.h
new file mode 100644
index 0000000..9a83293
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathPairwise.h
@@ -0,0 +1,10 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathPairwise.h"
+#else
+
+THC_API void THCTensor_(add)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(sub)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(mul)(THCState *state, THCTensor *self, THCTensor *src, real value);
+THC_API void THCTensor_(div)(THCState *state, THCTensor *self, THCTensor *src, real value);
+
+#endif
diff --git a/lib/THC/generic/THCTensorMathPointwise.cu b/lib/THC/generic/THCTensorMathPointwise.cu
new file mode 100644
index 0000000..b6679cd
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathPointwise.cu
@@ -0,0 +1,157 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathPointwise.cu"
+#else
+
+THC_API void
+THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+ if (value == ScalarConvert<int, real>::to(1)) {
+ // self += src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorAddOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ // self += value * src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorCAddOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+ if (value == ScalarConvert<int, real>::to(1)) {
+ // self = src1 + src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ // self = src1 + value * src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorCAddOp<real>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+ if (value == ScalarConvert<int, real>::to(1)) {
+ // self -= src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorSubOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ // self += -value * src2
+ if (!THC_pointwiseApply2(state, self_, src2,
+ TensorCAddOp<real>(
+ ScalarNegate<real>::to(value)))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+ if (value == ScalarConvert<int, real>::to(1)) {
+ // self = src1 - src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorSubOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ // self = src1 - value * src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2,
+ TensorCAddOp<real>(
+ ScalarNegate<real>::to(value)))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+ // self *= src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorMulOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+ // self = src1 * src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorMulOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+ // self = pow(self, src2)
+ if (!THC_pointwiseApply2(state, self_, src2, TensorCPowOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+ // self = pow(src1, src2)
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorCPowOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THArgCheck(THCTensor_(nElement)(state, src1) ==
+ THCTensor_(nElement)(state, src2), 3, "sizes do not match");
+
+ if (self_ == src1) {
+    // self /= src2
+ if (!THC_pointwiseApply2(state, self_, src2, TensorDivOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ } else {
+ THCTensor_(resizeAs)(state, self_, src1);
+
+    // self = src1 / src2
+ if (!THC_pointwiseApply3(state, self_, src1, src2, TensorDivOp<real>())) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+#endif
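
For reference, a standalone sketch of the cadd semantics implemented above, self = src1 + value * src2, without the value == 1 fast path or THC's apply machinery; all names are illustrative:

#include <cuda_runtime.h>
#include <cstdio>

// out[i] = a[i] + value * b[i], i.e. the "self = src1 + value * src2" branch.
__global__ void caddKernel(float *out, const float *a, const float *b,
                           float value, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = a[i] + value * b[i];
}

int main() {
  const int n = 4;
  float ha[n] = {1, 2, 3, 4}, hb[n] = {10, 20, 30, 40}, hout[n];

  float *da = NULL, *db = NULL, *dout = NULL;
  cudaMalloc((void**)&da, n * sizeof(float));
  cudaMalloc((void**)&db, n * sizeof(float));
  cudaMalloc((void**)&dout, n * sizeof(float));
  cudaMemcpy(da, ha, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(db, hb, n * sizeof(float), cudaMemcpyHostToDevice);

  caddKernel<<<1, 32>>>(dout, da, db, 0.5f, n);

  cudaMemcpy(hout, dout, n * sizeof(float), cudaMemcpyDeviceToHost);
  printf("%f %f %f %f\n", hout[0], hout[1], hout[2], hout[3]);  // 6 12 18 24
  cudaFree(da); cudaFree(db); cudaFree(dout);
  return 0;
}
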
diff --git a/lib/THC/generic/THCTensorMathPointwise.h b/lib/THC/generic/THCTensorMathPointwise.h
new file mode 100644
index 0000000..cfb3b14
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathPointwise.h
@@ -0,0 +1,11 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathPointwise.h"
+#else
+
+THC_API void THCTensor_(cadd)(THCState *state, THCTensor *self, THCTensor *src1, real value, THCTensor *src2);
+THC_API void THCTensor_(csub)(THCState *state, THCTensor *self, THCTensor *src1, real value, THCTensor *src2);
+THC_API void THCTensor_(cmul)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(cpow)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(cdiv)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2);
+
+#endif