Diffstat (limited to 'lib/THC')
45 files changed, 2479 insertions, 1735 deletions
diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt index 3a035de..dfc7ec3 100644 --- a/lib/THC/CMakeLists.txt +++ b/lib/THC/CMakeLists.txt @@ -124,16 +124,21 @@ SET(src-cuda THCTensorConv.cu THCTensorRandom.cu THCTensorScatterGather.cu - THCApply.cu THCTensorSort.cu THCTensorTopK.cu + THCTensorTypeUtils.cu ) +MESSAGE(STATUS "got cuda version " ${CUDA_VERSION}) + IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) + MESSAGE(STATUS "Found CUDA with FP16 support, compiling with torch.CudaHalfTensor") LIST(APPEND src-cuda THCHalf.cu) LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1") SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}") -ENDIF() +ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) + MESSAGE(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor") +ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) CUDA_ADD_LIBRARY(THC SHARED ${src} ${src-cuda}) CUDA_ADD_CUBLAS_TO_TARGET(THC) @@ -176,6 +181,8 @@ INSTALL(FILES THCDeviceTensorUtils-inl.cuh THCGenerateAllTypes.h THCHalf.h + THCTensorInfo.cuh + THCTensorTypeUtils.cuh DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC") INSTALL(FILES @@ -191,4 +198,10 @@ INSTALL(FILES generic/THCTensorCopy.c generic/THCTensorCopy.cu generic/THCTensorCopy.h + generic/THCTensorMath.h + generic/THCTensorMath.cu + generic/THCTensorMathPairwise.h + generic/THCTensorMathPairwise.cu + generic/THCTensorMathPointwise.h + generic/THCTensorMathPointwise.cu DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC/generic") diff --git a/lib/THC/THCApply.cu b/lib/THC/THCApply.cu deleted file mode 100644 index 3ee9d51..0000000 --- a/lib/THC/THCApply.cu +++ /dev/null @@ -1,10 +0,0 @@ -#include "THCApply.cuh" - -// Implementation of copyIgnoringOverlaps, defined after pointwiseApply2. -void THCudaTensor_copyIgnoringOverlaps(THCState* state, - THCudaTensor* dst, - THCudaTensor* src) { - THCudaTensor_pointwiseApply2(state, dst, src, CopyOp<float>(), - ReadOnly, // ignore overwrites - ReadOnly); -} diff --git a/lib/THC/THCApply.cuh b/lib/THC/THCApply.cuh index 707e22f..dd6d32a 100644 --- a/lib/THC/THCApply.cuh +++ b/lib/THC/THCApply.cuh @@ -3,6 +3,7 @@ #include "THCTensorCopy.h" #include "THCReduceApplyUtils.cuh" +#include "THCTensorTypeUtils.cuh" // // This file contains pointwise operation functions and kernels that @@ -12,81 +13,85 @@ // // Threads per block for our apply kernel +// FIXME: use occupancy calculator instead #define THC_APPLY_THREADS_PER_BLOCK 32 * 16 -// Called when we are copying into an overlapping index `dst`, but -// we don't care which writer wins. Hacky but it works. 
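The copy-ignoring-overlaps helper deleted above (THCApply.cu) and whose declaration is removed just below loses its THCudaTensor-only form in this patch; the behaviour presumably survives through the templated apply path plus the CopyOp functor that reappears in THCTensorCopy.cu later in this diff. A per-type sketch of what that replacement looks like, under those assumptions rather than as a literal hunk from the patch:

    // Sketch: copy into a dst with overlapping indices, letting any writer win.
    // THC_pointwiseApply2 and CopyOp<TypeDst, TypeSrc> are the templated
    // versions introduced by this patch.
    void THCudaTensor_copyIgnoringOverlaps(THCState* state,
                                           THCudaTensor* dst,
                                           THCudaTensor* src) {
      // Passing dst as ReadOnly skips the overlap check (and the contiguous
      // temporary it would otherwise force), matching the old behaviour.
      THC_pointwiseApply2(state, dst, src, CopyOp<float, float>(),
                          ReadOnly, ReadOnly);
    }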
-THC_API void THCudaTensor_copyIgnoringOverlaps(THCState* state, - THCudaTensor* dst, - THCudaTensor* src); - -template <typename Op, typename IndexType, int ADims> +template <typename Op, + typename Ta, + typename IndexType, + int ADims> #if __CUDA_ARCH__ >= 350 __launch_bounds__(32 * 16, 4) #endif __global__ void -THCudaTensor_pointwiseApply1(TensorInfo<IndexType> a, - IndexType totalElements, - Op op) { +kernelPointwiseApply1(TensorInfo<Ta, IndexType> a, + IndexType totalElements, + Op op) { for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < totalElements; linearIndex += gridDim.x * blockDim.x) { // Convert `linearIndex` into an offset of `a` const IndexType aOffset = - IndexToOffset<IndexType, ADims>::get(linearIndex, a); + IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a); op(&a.data[aOffset]); } } -template <typename Op, typename IndexType, int ADims, int BDims> +template <typename Op, + typename Ta, typename Tb, + typename IndexType, + int ADims, int BDims> #if __CUDA_ARCH__ >= 350 __launch_bounds__(32 * 16, 4) #endif __global__ void -THCudaTensor_pointwiseApply2(TensorInfo<IndexType> a, - TensorInfo<IndexType> b, - IndexType totalElements, - Op op) { +kernelPointwiseApply2(TensorInfo<Ta, IndexType> a, + TensorInfo<Tb, IndexType> b, + IndexType totalElements, + Op op) { for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < totalElements; linearIndex += gridDim.x * blockDim.x) { // Convert `linearIndex` into an offset of `a` const IndexType aOffset = - IndexToOffset<IndexType, ADims>::get(linearIndex, a); + IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a); // Convert `linearIndex` into an offset of `b` const IndexType bOffset = - IndexToOffset<IndexType, BDims>::get(linearIndex, b); + IndexToOffset<Tb, IndexType, BDims>::get(linearIndex, b); op(&a.data[aOffset], &b.data[bOffset]); } } -template <typename Op, typename IndexType, int ADims, int BDims, int CDims> +template <typename Op, + typename Ta, typename Tb, typename Tc, + typename IndexType, + int ADims, int BDims, int CDims> #if __CUDA_ARCH__ >= 350 __launch_bounds__(32 * 16, 4) #endif __global__ void -THCudaTensor_pointwiseApply3(TensorInfo<IndexType> a, - TensorInfo<IndexType> b, - TensorInfo<IndexType> c, - IndexType totalElements, - Op op) { +kernelPointwiseApply3(TensorInfo<Ta, IndexType> a, + TensorInfo<Tb, IndexType> b, + TensorInfo<Tc, IndexType> c, + IndexType totalElements, + Op op) { for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < totalElements; linearIndex += gridDim.x * blockDim.x) { // Convert `linearIndex` into an offset of `a` const IndexType aOffset = - IndexToOffset<IndexType, ADims>::get(linearIndex, a); + IndexToOffset<Ta, IndexType, ADims>::get(linearIndex, a); // Convert `linearIndex` into an offset of `b` const IndexType bOffset = - IndexToOffset<IndexType, BDims>::get(linearIndex, b); + IndexToOffset<Tb, IndexType, BDims>::get(linearIndex, b); // Convert `linearIndex` into an offset of `c` const IndexType cOffset = - IndexToOffset<IndexType, CDims>::get(linearIndex, c); + IndexToOffset<Tc, IndexType, CDims>::get(linearIndex, c); op(&a.data[aOffset], &b.data[bOffset], &c.data[cOffset]); } @@ -116,18 +121,17 @@ inline bool getApplyGrid(THCState* state, long totalElements, dim3& grid) { return true; } -template <typename Op> -bool THCudaTensor_pointwiseApply1(THCState* state, - THCudaTensor* a, - const Op& op, - TensorArgType aType = ReadWrite) { - long totalElements = THCudaTensor_nElement(state, a); 
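A worked example of the linearIndex-to-offset translation these kernels perform through IndexToOffset (sizes/strides here are illustrative; the IndexToOffset definition is deleted from THCReduceApplyUtils.cuh further down in this diff, and its new home is presumably the added THCTensorInfo.cuh, which is not shown in this section):

    // Non-contiguous float tensor: sizes = {3, 4}, strides = {8, 2}
    // linearIndex = 7:
    //   dim 1: 7 % 4 = 3  ->  offset += 3 * 2 = 6,  linearIndex /= 4 -> 1
    //   dim 0: 1 % 3 = 1  ->  offset += 1 * 8 = 8,  total offset = 14
    // For a contiguous tensor (strides = {4, 1}) the walk returns 7 == linearIndex,
    // which is why the fully contiguous (-2) specialization skips it entirely.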
- - if (THCudaTensor_nDimension(state, a) > MAX_CUTORCH_DIMS) { +template <typename TensorTypeA, + typename Op> +bool THC_pointwiseApply1(THCState* state, + TensorTypeA* a, + const Op& op, + TensorArgType aType = ReadWrite) { + if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS) { return false; } - if (THCudaTensor_nDimension(state, a) == 0) { + if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) { // Zero-dim tensor; do nothing return true; } @@ -135,6 +139,8 @@ bool THCudaTensor_pointwiseApply1(THCState* state, const dim3 block = getApplyBlock(); dim3 grid; + long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a); + if (!getApplyGrid(state, totalElements, grid)) { return false; } @@ -148,12 +154,13 @@ bool THCudaTensor_pointwiseApply1(THCState* state, // indices of a tensor with overlapping indices should probably be // an error, since it is unclear which one should win), but we will // preserve this last-writer-wins (in arbitrary copy order) behavior. - THCudaTensor* oldA = NULL; + TensorTypeA* oldA = NULL; - if (aType == ReadWrite && THC_overlappingIndices(state, a)) { + if (aType == ReadWrite && + TensorUtils<TensorTypeA>::overlappingIndices(state, a)) { // Must perform in contiguous space oldA = a; - a = THCudaTensor_newContiguous(state, a); + a = TensorUtils<TensorTypeA>::newContiguous(state, a); } // It is possible that the tensor dimensions are able to be collapsed, @@ -164,55 +171,60 @@ bool THCudaTensor_pointwiseApply1(THCState* state, // (or vice versa), the contiguous tensor can be collapsed to one // dimension, and the loop to translate the linear index to the array // index can be similarly collapsed. That is what this unrolling is for. -#define HANDLE_CASE(TYPE, A) \ - THCudaTensor_pointwiseApply1<Op, TYPE, A> \ - <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ +#define HANDLE_CASE(TYPE, A) \ + kernelPointwiseApply1<Op, \ + typename TensorUtils<TensorTypeA>::DataType, \ + TYPE, A> \ + <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ aInfo, (TYPE) totalElements, op); -#define HANDLE_A_CASE(TYPE, A) \ - { \ - if (aInfo.isContiguous()) { \ - HANDLE_CASE(TYPE, -2); \ - } else { \ - switch (A) { \ - case 1: \ - HANDLE_CASE(TYPE, 1); \ - break; \ - case 2: \ - HANDLE_CASE(TYPE, 2); \ - break; \ - case 3: \ - HANDLE_CASE(TYPE, 3); \ - break; \ - default: \ - HANDLE_CASE(TYPE, -1); \ - break; \ - } \ - } \ +#define HANDLE_A_CASE(TYPE, A) \ + { \ + if (aInfo.isContiguous()) { \ + HANDLE_CASE(TYPE, -2); \ + } else { \ + switch (A) { \ + case 1: \ + HANDLE_CASE(TYPE, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, -1); \ + break; \ + } \ + } \ } // Can we use 32-bit integer math in the kernel (the linear ID for the copy // and the resulting non-linear offset is all computable using 32-bit math?) // We also use unsigned index math in the kernel, as signed div/mod has // additional overhead. 
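Concretely, "can use 32-bit index math" means the element count and the largest reachable storage offset both fit in an unsigned int; this mirrors the THC_canUse32BitIndexMath body that this patch deletes from THCReduceApplyUtils.cu further down (sketch only, not part of the patch):

    // Requires <climits> for UINT_MAX.
    bool canUse32Bit(long elements, const long* sizes, const long* strides, int dims) {
      if (elements >= UINT_MAX) return false;
      long offset = 0, linearId = elements - 1;          // worst-case element
      for (int i = dims - 1; i >= 0; --i) {
        offset += (linearId % sizes[i]) * strides[i];
        linearId /= sizes[i];
      }
      return offset < UINT_MAX;                          // fits in unsigned int
    }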
- if (THC_canUse32BitIndexMath(state, a)) { - TensorInfo<unsigned int> aInfo(state, a); + if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a)) { + TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo = + getTensorInfo<TensorTypeA, unsigned int>(state, a); aInfo.collapseDims(); HANDLE_A_CASE(unsigned int, aInfo.dims); } else { - TensorInfo<unsigned long> aInfo(state, a); + TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned long> aInfo = + getTensorInfo<TensorTypeA, unsigned long>(state, a); aInfo.collapseDims(); // For large tensors, we only compile the completely contiguous // version and the completely generic version, to reduce // compilation time. if (aInfo.isContiguous()) { - THCudaTensor_pointwiseApply1<Op, unsigned long, -2> + kernelPointwiseApply1<Op, + typename TensorUtils<TensorTypeA>::DataType, + unsigned long, -2> <<<grid, block, 0, THCState_getCurrentStream(state)>>>( aInfo, (unsigned long) totalElements, op); } else { - THCudaTensor_pointwiseApply1<Op, unsigned long, -1> + kernelPointwiseApply1<Op, + typename TensorUtils<TensorTypeA>::DataType, + unsigned long, -1> <<<grid, block, 0, THCState_getCurrentStream(state)>>>( aInfo, (unsigned long) totalElements, op); } @@ -221,36 +233,38 @@ bool THCudaTensor_pointwiseApply1(THCState* state, #undef HANDLE_A_CASE if (oldA) { - // Ignore overlaps when copying back; if we use THCudaTensor_copy + // Ignore overlaps when copying back; if we use THCTensor_copy // instead, it will recursively try and invoke ourselves to make // oldA contiguous. - THCudaTensor_copyIgnoringOverlaps(state, oldA, a); - THCudaTensor_free(state, a); + TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a); + TensorUtils<TensorTypeA>::free(state, a); a = oldA; } return true; } -template <typename Op> -bool THCudaTensor_pointwiseApply2(THCState* state, - THCudaTensor* a, - THCudaTensor* b, - const Op& op, - TensorArgType aType = ReadWrite, - TensorArgType bType = ReadOnly) { - long totalElements = THCudaTensor_nElement(state, a); - - if (totalElements != THCudaTensor_nElement(state, b)) { +template <typename TensorTypeA, + typename TensorTypeB, + typename Op> +bool THC_pointwiseApply2(THCState* state, + TensorTypeA* a, + TensorTypeB* b, + const Op& op, + TensorArgType aType = ReadWrite, + TensorArgType bType = ReadOnly) { + long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a); + + if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b)) { return false; } - if (THCudaTensor_nDimension(state, a) > MAX_CUTORCH_DIMS || - THCudaTensor_nDimension(state, b) > MAX_CUTORCH_DIMS) { + if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS || + TensorUtils<TensorTypeB>::getDims(state, b) > MAX_CUTORCH_DIMS) { return false; } - if (THCudaTensor_nDimension(state, a) == 0) { + if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) { // Zero-dim tensor; do nothing return true; } @@ -271,18 +285,20 @@ bool THCudaTensor_pointwiseApply2(THCState* state, // indices of a tensor with overlapping indices should probably be // an error, since it is unclear which one should win), but we will // preserve this last-writer-wins (in arbitrary copy order) behavior. 
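For concreteness, the kind of tensor the ReadWrite overlap check guards against (values are illustrative; the detection rule comes from the THC_overlappingIndices implementation removed later in this diff):

    // sizes = {2, 2}, strides = {1, 1}
    // element (i, j) maps to storage offset i + j, so (0, 1) and (1, 0) both
    // address offset 1. Sorted-by-stride test: stride[outer] < size[inner] *
    // stride[inner], i.e. 1 < 2 * 1, hence overlapping. Such a ReadWrite
    // argument is snapshotted into a contiguous temporary before the kernel
    // runs, then copied back with copyIgnoringOverlaps.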
- THCudaTensor* oldA = NULL; - THCudaTensor* oldB = NULL; + TensorTypeA* oldA = NULL; + TensorTypeB* oldB = NULL; - if (aType == ReadWrite && THC_overlappingIndices(state, a)) { + if (aType == ReadWrite && + TensorUtils<TensorTypeA>::overlappingIndices(state, a)) { // Must perform in contiguous space oldA = a; - a = THCudaTensor_newContiguous(state, a); + a = TensorUtils<TensorTypeA>::newContiguous(state, a); } - if (bType == ReadWrite && THC_overlappingIndices(state, b)) { + if (bType == ReadWrite && + TensorUtils<TensorTypeB>::overlappingIndices(state, b)) { // Must perform in contiguous space oldB = b; - b = THCudaTensor_newContiguous(state, b); + b = TensorUtils<TensorTypeB>::newContiguous(state, b); } // It is possible that the tensor dimensions are able to be collapsed, @@ -293,80 +309,87 @@ bool THCudaTensor_pointwiseApply2(THCState* state, // (or vice versa), the contiguous tensor can be collapsed to one // dimension, and the loop to translate the linear index to the array // index can be similarly collapsed. That is what this unrolling is for. -#define HANDLE_CASE(TYPE, A, B) \ - THCudaTensor_pointwiseApply2<Op, TYPE, A, B> \ - <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ +#define HANDLE_CASE(TYPE, A, B) \ + kernelPointwiseApply2<Op, \ + typename TensorUtils<TensorTypeA>::DataType, \ + typename TensorUtils<TensorTypeB>::DataType, \ + TYPE, A, B> \ + <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ aInfo, bInfo, (TYPE) totalElements, op); -#define HANDLE_B_CASE(TYPE, A, B) \ - { \ - if (bInfo.isContiguous()) { \ - HANDLE_CASE(TYPE, A, -2); \ - } else { \ - switch (B) { \ - case 1: \ - HANDLE_CASE(TYPE, A, 1); \ - break; \ - case 2: \ - HANDLE_CASE(TYPE, A, 2); \ - break; \ - case 3: \ - HANDLE_CASE(TYPE, A, 3); \ - break; \ - default: \ - HANDLE_CASE(TYPE, A, -1); \ - break; \ - } \ - } \ - } - -#define HANDLE_A_CASE(TYPE, A, B) \ - { \ - if (aInfo.isContiguous()) { \ - HANDLE_B_CASE(TYPE, -2, B); \ - } else { \ - switch (A) { \ - case 1: \ - HANDLE_B_CASE(TYPE, 1, B); \ - break; \ - case 2: \ - HANDLE_B_CASE(TYPE, 2, B); \ - break; \ - case 3: \ - HANDLE_B_CASE(TYPE, 3, B); \ - break; \ - default: \ - HANDLE_B_CASE(TYPE, -1, B); \ - break; \ - } \ - } \ - } - - if (THC_canUse32BitIndexMath(state, a) && - THC_canUse32BitIndexMath(state, b)) { - TensorInfo<unsigned int> aInfo(state, a); +#define HANDLE_B_CASE(TYPE, A, B) \ + { \ + if (bInfo.isContiguous()) { \ + HANDLE_CASE(TYPE, A, -2); \ + } else { \ + switch (B) { \ + case 1: \ + HANDLE_CASE(TYPE, A, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, -1); \ + break; \ + } \ + } \ + } + +#define HANDLE_A_CASE(TYPE, A, B) \ + { \ + if (aInfo.isContiguous()) { \ + HANDLE_B_CASE(TYPE, -2, B); \ + } else { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B); \ + break; \ + } \ + } \ + } + + if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a) && + TensorUtils<TensorTypeB>::canUse32BitIndexMath(state, b)) { + TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo = + getTensorInfo<TensorTypeA, unsigned int>(state, a); aInfo.collapseDims(); - TensorInfo<unsigned int> bInfo(state, b); + TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned int> bInfo = + getTensorInfo<TensorTypeB, unsigned int>(state, b); bInfo.collapseDims(); HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims); } else { - TensorInfo<unsigned 
long> aInfo(state, a); + TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned long> aInfo = + getTensorInfo<TensorTypeA, unsigned long>(state, a); aInfo.collapseDims(); - TensorInfo<unsigned long> bInfo(state, b); + TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned long> bInfo = + getTensorInfo<TensorTypeB, unsigned long>(state, b); bInfo.collapseDims(); // For large tensors, we only compile the completely contiguous // version and the completely generic version, to reduce // compilation time. if (aInfo.isContiguous() && bInfo.isContiguous()) { - THCudaTensor_pointwiseApply2<Op, unsigned long, -2, -2> + kernelPointwiseApply2<Op, + typename TensorUtils<TensorTypeA>::DataType, + typename TensorUtils<TensorTypeB>::DataType, + unsigned long, -2, -2> <<<grid, block, 0, THCState_getCurrentStream(state)>>>( aInfo, bInfo, (unsigned long) totalElements, op); } else { - THCudaTensor_pointwiseApply2<Op, unsigned long, -1, -1> + kernelPointwiseApply2<Op, + typename TensorUtils<TensorTypeA>::DataType, + typename TensorUtils<TensorTypeB>::DataType, + unsigned long, -1, -1> <<<grid, block, 0, THCState_getCurrentStream(state)>>>( aInfo, bInfo, (unsigned long) totalElements, op); } @@ -376,49 +399,52 @@ bool THCudaTensor_pointwiseApply2(THCState* state, #undef HANDLE_A_CASE if (oldA) { - // Ignore overlaps when copying back; if we use THCudaTensor_copy + // Ignore overlaps when copying back; if we use THCTensor_copy // instead, it will recursively try and invoke ourselves to make // oldA contiguous. - THCudaTensor_copyIgnoringOverlaps(state, oldA, a); - THCudaTensor_free(state, a); + TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a); + TensorUtils<TensorTypeA>::free(state, a); a = oldA; } if (oldB) { - // Ignore overlaps when copying back; if we use THCudaTensor_copy + // Ignore overlaps when copying back; if we use THCTensor_copy // instead, it will recursively try and invoke ourselves to make // oldB contiguous. 
- THCudaTensor_copyIgnoringOverlaps(state, oldB, b); - THCudaTensor_free(state, b); + TensorUtils<TensorTypeB>::copyIgnoringOverlaps(state, oldB, b); + TensorUtils<TensorTypeB>::free(state, b); b = oldB; } return true; } -template <typename Op> -bool THCudaTensor_pointwiseApply3(THCState* state, - THCudaTensor* a, - THCudaTensor* b, - THCudaTensor* c, - const Op& op, - TensorArgType aType = ReadWrite, - TensorArgType bType = ReadOnly, - TensorArgType cType = ReadOnly) { - long totalElements = THCudaTensor_nElement(state, a); - - if (totalElements != THCudaTensor_nElement(state, b) || - totalElements != THCudaTensor_nElement(state, c)) { +template <typename TensorTypeA, + typename TensorTypeB, + typename TensorTypeC, + typename Op> +bool THC_pointwiseApply3(THCState* state, + TensorTypeA* a, + TensorTypeB* b, + TensorTypeC* c, + const Op& op, + TensorArgType aType = ReadWrite, + TensorArgType bType = ReadOnly, + TensorArgType cType = ReadOnly) { + long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a); + + if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b) || + totalElements != TensorUtils<TensorTypeC>::getNumElements(state, c)) { return false; } - if (THCudaTensor_nDimension(state, a) > MAX_CUTORCH_DIMS || - THCudaTensor_nDimension(state, b) > MAX_CUTORCH_DIMS || - THCudaTensor_nDimension(state, c) > MAX_CUTORCH_DIMS) { + if (TensorUtils<TensorTypeA>::getDims(state, a) > MAX_CUTORCH_DIMS || + TensorUtils<TensorTypeB>::getDims(state, b) > MAX_CUTORCH_DIMS || + TensorUtils<TensorTypeC>::getDims(state, c) > MAX_CUTORCH_DIMS) { return false; } - if (THCudaTensor_nDimension(state, a) == 0) { + if (TensorUtils<TensorTypeA>::getDims(state, a) == 0) { // Zero-dim tensor; do nothing return true; } @@ -439,131 +465,141 @@ bool THCudaTensor_pointwiseApply3(THCState* state, // indices of a tensor with overlapping indices should probably be // an error, since it is unclear which one should win), but we will // preserve this last-writer-wins (in arbitrary copy order) behavior. 
- THCudaTensor* oldA = NULL; - THCudaTensor* oldB = NULL; - THCudaTensor* oldC = NULL; + TensorTypeA* oldA = NULL; + TensorTypeB* oldB = NULL; + TensorTypeC* oldC = NULL; - if (aType == ReadWrite && THC_overlappingIndices(state, a)) { + if (aType == ReadWrite && + TensorUtils<TensorTypeA>::overlappingIndices(state, a)) { // Must perform in contiguous space oldA = a; - a = THCudaTensor_newContiguous(state, a); + a = TensorUtils<TensorTypeA>::newContiguous(state, a); } - - if (bType == ReadWrite && THC_overlappingIndices(state, b)) { + if (bType == ReadWrite && + TensorUtils<TensorTypeB>::overlappingIndices(state, b)) { // Must perform in contiguous space oldB = b; - b = THCudaTensor_newContiguous(state, b); + b = TensorUtils<TensorTypeB>::newContiguous(state, b); } - - if (cType == ReadWrite && THC_overlappingIndices(state, c)) { + if (cType == ReadWrite && + TensorUtils<TensorTypeC>::overlappingIndices(state, c)) { // Must perform in contiguous space oldC = c; - c = THCudaTensor_newContiguous(state, c); + c = TensorUtils<TensorTypeC>::newContiguous(state, c); } #define HANDLE_CASE(TYPE, A, B, C) \ - THCudaTensor_pointwiseApply3<Op, TYPE, A, B, C> \ + kernelPointwiseApply3<Op, \ + typename TensorUtils<TensorTypeA>::DataType, \ + typename TensorUtils<TensorTypeB>::DataType, \ + typename TensorUtils<TensorTypeC>::DataType, \ + TYPE, A, B, C> \ <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ aInfo, bInfo, cInfo, (TYPE) totalElements, op); -#define HANDLE_C_CASE(TYPE, A, B, C) \ - { \ - if (cInfo.isContiguous()) { \ - HANDLE_CASE(TYPE, A, B, -2); \ - } else { \ - switch (C) { \ - case 1: \ - HANDLE_CASE(TYPE, A, B, 1); \ - break; \ - case 2: \ - HANDLE_CASE(TYPE, A, B, 2); \ - break; \ - case 3: \ - HANDLE_CASE(TYPE, A, B, 3); \ - break; \ - default: \ - HANDLE_CASE(TYPE, A, B, -1); \ - break; \ - } \ - } \ - } - -#define HANDLE_B_CASE(TYPE, A, B, C) \ - { \ - if (bInfo.isContiguous()) { \ - HANDLE_C_CASE(TYPE, A, -2, C); \ - } else { \ - switch (B) { \ - case 1: \ - HANDLE_C_CASE(TYPE, A, 1, C); \ - break; \ - case 2: \ - HANDLE_C_CASE(TYPE, A, 2, C); \ - break; \ - case 3: \ - HANDLE_C_CASE(TYPE, A, 3, C); \ - break; \ - default: \ - HANDLE_C_CASE(TYPE, A, -1, C); \ - break; \ - } \ - } \ - } - -#define HANDLE_A_CASE(TYPE, A, B, C) \ - { \ - if (aInfo.isContiguous()) { \ - HANDLE_B_CASE(TYPE, -2, B, C); \ - } else { \ - switch (A) { \ - case 1: \ - HANDLE_B_CASE(TYPE, 1, B, C); \ - break; \ - case 2: \ - HANDLE_B_CASE(TYPE, 2, B, C); \ - break; \ - case 3: \ - HANDLE_B_CASE(TYPE, 3, B, C); \ - break; \ - default: \ - HANDLE_B_CASE(TYPE, -1, B, C); \ - break; \ - } \ - } \ - } - - if (THC_canUse32BitIndexMath(state, a) && - THC_canUse32BitIndexMath(state, b) && - THC_canUse32BitIndexMath(state, c)) { - TensorInfo<unsigned int> aInfo(state, a); +#define HANDLE_C_CASE(TYPE, A, B, C) \ + { \ + if (cInfo.isContiguous()) { \ + HANDLE_CASE(TYPE, A, B, -2); \ + } else { \ + switch (C) { \ + case 1: \ + HANDLE_CASE(TYPE, A, B, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, B, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, B, -1); \ + break; \ + } \ + } \ + } + +#define HANDLE_B_CASE(TYPE, A, B, C) \ + { \ + if (bInfo.isContiguous()) { \ + HANDLE_C_CASE(TYPE, A, -2, C); \ + } else { \ + switch (B) { \ + case 1: \ + HANDLE_C_CASE(TYPE, A, 1, C); \ + break; \ + case 2: \ + HANDLE_C_CASE(TYPE, A, 2, C); \ + break; \ + default: \ + HANDLE_C_CASE(TYPE, A, -1, C); \ + break; \ + } \ + } \ + } + +#define HANDLE_A_CASE(TYPE, A, B, C) \ + { \ + if (aInfo.isContiguous()) { \ + 
HANDLE_B_CASE(TYPE, -2, B, C); \ + } else { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B, C); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B, C); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B, C); \ + break; \ + } \ + } \ + } + + if (TensorUtils<TensorTypeA>::canUse32BitIndexMath(state, a) && + TensorUtils<TensorTypeB>::canUse32BitIndexMath(state, b) && + TensorUtils<TensorTypeC>::canUse32BitIndexMath(state, c)) { + TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned int> aInfo = + getTensorInfo<TensorTypeA, unsigned int>(state, a); aInfo.collapseDims(); - TensorInfo<unsigned int> bInfo(state, b); + TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned int> bInfo = + getTensorInfo<TensorTypeB, unsigned int>(state, b); bInfo.collapseDims(); - TensorInfo<unsigned int> cInfo(state, c); + TensorInfo<typename TensorUtils<TensorTypeC>::DataType, unsigned int> cInfo = + getTensorInfo<TensorTypeC, unsigned int>(state, c); cInfo.collapseDims(); HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims, cInfo.dims); } else { - TensorInfo<unsigned long> aInfo(state, a); + TensorInfo<typename TensorUtils<TensorTypeA>::DataType, unsigned long> aInfo = + getTensorInfo<TensorTypeA, unsigned long>(state, a); aInfo.collapseDims(); - TensorInfo<unsigned long> bInfo(state, b); + TensorInfo<typename TensorUtils<TensorTypeB>::DataType, unsigned long> bInfo = + getTensorInfo<TensorTypeB, unsigned long>(state, b); bInfo.collapseDims(); - TensorInfo<unsigned long> cInfo(state, c); + TensorInfo<typename TensorUtils<TensorTypeC>::DataType, unsigned long> cInfo = + getTensorInfo<TensorTypeC, unsigned long>(state, c); cInfo.collapseDims(); // For large tensors, we only compile the completely contiguous // version and the completely generic version, to reduce // compilation time. if (aInfo.isContiguous() && bInfo.isContiguous() && cInfo.isContiguous()) { - THCudaTensor_pointwiseApply3<Op, unsigned long, -2, -2, -2> + kernelPointwiseApply3<Op, + typename TensorUtils<TensorTypeA>::DataType, + typename TensorUtils<TensorTypeB>::DataType, + typename TensorUtils<TensorTypeC>::DataType, + unsigned long, -2, -2, -2> <<<grid, block, 0, THCState_getCurrentStream(state)>>>( aInfo, bInfo, cInfo, (unsigned long) totalElements, op); } else { - THCudaTensor_pointwiseApply3<Op, unsigned long, -1, -1, -1> + kernelPointwiseApply3<Op, + typename TensorUtils<TensorTypeA>::DataType, + typename TensorUtils<TensorTypeB>::DataType, + typename TensorUtils<TensorTypeC>::DataType, + unsigned long, -1, -1, -1> <<<grid, block, 0, THCState_getCurrentStream(state)>>>( aInfo, bInfo, cInfo, (unsigned long) totalElements, op); } @@ -574,29 +610,29 @@ bool THCudaTensor_pointwiseApply3(THCState* state, #undef HANDLE_A_CASE if (oldA) { - // Ignore overlaps when copying back; if we use THCudaTensor_copy + // Ignore overlaps when copying back; if we use THCTensor_copy // instead, it will recursively try and invoke ourselves to make // oldA contiguous. - THCudaTensor_copyIgnoringOverlaps(state, oldA, a); - THCudaTensor_free(state, a); + TensorUtils<TensorTypeA>::copyIgnoringOverlaps(state, oldA, a); + TensorUtils<TensorTypeA>::free(state, a); a = oldA; } if (oldB) { - // Ignore overlaps when copying back; if we use THCudaTensor_copy + // Ignore overlaps when copying back; if we use THCTensor_copy // instead, it will recursively try and invoke ourselves to make // oldB contiguous. 
- THCudaTensor_copyIgnoringOverlaps(state, oldB, b); - THCudaTensor_free(state, b); + TensorUtils<TensorTypeB>::copyIgnoringOverlaps(state, oldB, b); + TensorUtils<TensorTypeB>::free(state, b); b = oldB; } if (oldC) { - // Ignore overlaps when copying back; if we use THCudaTensor_copy + // Ignore overlaps when copying back; if we use THCTensor_copy // instead, it will recursively try and invoke ourselves to make // oldC contiguous. - THCudaTensor_copyIgnoringOverlaps(state, oldC, c); - THCudaTensor_free(state, c); + TensorUtils<TensorTypeC>::copyIgnoringOverlaps(state, oldC, c); + TensorUtils<TensorTypeC>::free(state, c); c = oldC; } diff --git a/lib/THC/THCGenerateAllTypes.h b/lib/THC/THCGenerateAllTypes.h index 044cbdf..28037bd 100644 --- a/lib/THC/THCGenerateAllTypes.h +++ b/lib/THC/THCGenerateAllTypes.h @@ -2,6 +2,8 @@ #error "You must define THC_GENERIC_FILE before including THGenerateAllTypes.h" #endif +#include "THCHalf.h" + #define THCTypeIdxByte 1 #define THCTypeIdxChar 2 #define THCTypeIdxShort 3 @@ -12,10 +14,6 @@ #define THCTypeIdxHalf 8 #define THCTypeIdx_(T) TH_CONCAT_2(THCTypeIdx,T) -#define hostreal real -#define hostrealToReal(x) (x) -#define realToHostreal(x) (x) - #define real unsigned char #define accreal long #define Real Byte @@ -108,14 +106,7 @@ #undef CReal #undef THC_REAL_IS_DOUBLE -#if CUDA_VERSION >= 7050 - -#undef hostreal -#undef hostrealToReal -#undef realToHostreal -#define hostreal float -#define hostrealToReal(x) THC_float2half(x); -#define realToHostreal(x) THC_half2float(x); +#ifdef CUDA_HALF_TENSOR #define real half #define accreal half @@ -130,11 +121,7 @@ #undef CReal #undef THC_REAL_IS_HALF -#endif // CUDA_VERSION >= 7050 - -#undef hostreal -#undef hostrealToReal -#undef realToHostreal +#endif // CUDA_HALF_TENSOR #undef THCTypeIdxByte #undef THCTypeIdxChar diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu index e368d2e..7847ef6 100644 --- a/lib/THC/THCHalf.cu +++ b/lib/THC/THCHalf.cu @@ -32,8 +32,25 @@ void THCHalf2Float(THCState *state, float *out, half *in, long len) { float THC_half2float(half a) { - THError("half2float not implemented yet"); - return 0; + unsigned int bits = a.x & 0x7fff; + unsigned int sign = a.x & 0x8000; + unsigned int exp = a.x & 0x7c00; + + bits <<= 13; + sign <<= 16; + + bits += 0x38000000U; + + // flush denormals to 0 + bits = (exp == 0 ? 
0 : bits) | sign; + + union { + float f; + unsigned int v; + } conv; + conv.v = bits; + + return conv.f; } /* diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index b20b05c..d87647b 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -3,7 +3,17 @@ #include "THCGeneral.h" +// We compile with CudaHalfTensor support if we have this: #if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 +#define CUDA_HALF_TENSOR 1 +#endif + +// Native fp16 ALU instructions are available if we have this: +#if defined(CUDA_HALF_TENSOR) && (__CUDA_ARCH__ >= 530) +#define CUDA_HALF_INSTRUCTIONS 1 +#endif + +#ifdef CUDA_HALF_TENSOR #include <cuda_fp16.h> #include <stdint.h> @@ -13,6 +23,6 @@ THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len); THC_EXTERNC half THC_float2half(float a); THC_EXTERNC float THC_half2float(half a); -#endif +#endif // CUDA_HALF_TENSOR #endif diff --git a/lib/THC/THCReduce.cuh b/lib/THC/THCReduce.cuh index 1060e89..89f599f 100644 --- a/lib/THC/THCReduce.cuh +++ b/lib/THC/THCReduce.cuh @@ -20,17 +20,21 @@ __device__ __forceinline__ IndexType getReduceNoncontigDimSliceIndex() { } // Kernel that handles an entire reduction of a slice of a tensor per each thread -template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims, int BDims> +template <typename ModifyOp, + typename ReduceOp, + typename T, + typename IndexType, + int ADims, int BDims> #if __CUDA_ARCH__ >= 350 __launch_bounds__(32 * 16, 4) #endif __global__ void -THCudaTensor_reduceNoncontigDim(TensorInfo<IndexType> out, - TensorInfo<IndexType> in, +THCudaTensor_reduceNoncontigDim(TensorInfo<T, IndexType> out, + TensorInfo<T, IndexType> in, IndexType reductionStride, IndexType reductionSize, IndexType totalSlices, - float init, + T init, ModifyOp modifyOp, ReduceOp reduceOp) { const IndexType sliceIndex = getReduceNoncontigDimSliceIndex<IndexType>(); @@ -42,13 +46,13 @@ THCudaTensor_reduceNoncontigDim(TensorInfo<IndexType> out, // Each thread picks a point in `out` and `in` for which it is // producing the reduction const IndexType outOffset = - IndexToOffset<IndexType, ADims>::get(sliceIndex, out); + IndexToOffset<T, IndexType, ADims>::get(sliceIndex, out); const IndexType inBaseOffset = - IndexToOffset<IndexType, BDims>::get(sliceIndex, in); + IndexToOffset<T, IndexType, BDims>::get(sliceIndex, in); // For each point in reductionSize, reduce into `r` IndexType inOffset = inBaseOffset; - float r = init; + T r = init; for (IndexType i = 0; i < reductionSize; ++i) { r = reduceOp(r, modifyOp(in.data[inOffset])); @@ -67,13 +71,17 @@ __device__ __forceinline__ IndexType getReduceContigDimSliceIndex() { // Kernel that handles an entire reduction of a slice of a tensor per // each block -template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims, int BDims> +template <typename ModifyOp, + typename ReduceOp, + typename T, + typename IndexType, + int ADims, int BDims> __global__ void -THCudaTensor_reduceContigDim(TensorInfo<IndexType> out, - TensorInfo<IndexType> in, +THCudaTensor_reduceContigDim(TensorInfo<T, IndexType> out, + TensorInfo<T, IndexType> in, IndexType reductionSize, IndexType totalSlices, - float init, + T init, ModifyOp modifyOp, ReduceOp reduceOp) { const IndexType sliceIndex = getReduceContigDimSliceIndex<IndexType>(); @@ -84,23 +92,23 @@ THCudaTensor_reduceContigDim(TensorInfo<IndexType> out, // Get the offset in `out` for the reduction const IndexType outOffset = - IndexToOffset<IndexType, ADims>::get(sliceIndex, out); + IndexToOffset<T, IndexType, 
ADims>::get(sliceIndex, out); // Get the base offset in `in` for this block's reduction const IndexType inBaseOffset = - IndexToOffset<IndexType, BDims>::get(sliceIndex, in); + IndexToOffset<T, IndexType, BDims>::get(sliceIndex, in); // Each thread in the block will reduce some subset of elements in // the slice. The elements are guaranteed contiguous starting at // `inBaseOffset`. - float r = init; + T r = init; for (IndexType i = threadIdx.x; i < reductionSize; i += blockDim.x) { r = reduceOp(r, modifyOp(in.data[inBaseOffset + i])); } // Reduce within the block - extern __shared__ float smem[]; - r = reduceBlock<float, ReduceOp>(smem, blockDim.x, r, reduceOp, init); + extern __shared__ T smem[]; + r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init); if (threadIdx.x == 0) { // Write out reduced value @@ -208,86 +216,92 @@ bool THCudaTensor_reduceDim(THCState* state, // (or vice versa), the contiguous tensor can be collapsed to one // dimension, and the loop to translate the linear index to the array // index can be similarly collapsed. That is what this unrolling is for. -#define HANDLE_CASE(TYPE, OUT, IN) \ +#define HANDLE_CASE(T, TYPE, OUT, IN) \ if (contigReduction) { \ - THCudaTensor_reduceContigDim<ModifyOp, ReduceOp, TYPE, OUT, IN> \ + THCudaTensor_reduceContigDim<ModifyOp, ReduceOp, T, TYPE, OUT, IN> \ <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>( \ outInfo, inInfo, reductionSize, \ (TYPE) outElements, init, modifyOp, reduceOp); \ } else { \ - THCudaTensor_reduceNoncontigDim<ModifyOp, ReduceOp, TYPE, OUT, IN> \ + THCudaTensor_reduceNoncontigDim<ModifyOp, ReduceOp, T, TYPE, OUT, IN> \ <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ outInfo, inInfo, reductionStride, reductionSize, \ (TYPE) outElements, init, modifyOp, reduceOp); \ } \ -#define HANDLE_IN_CASE(TYPE, OUT, IN) \ - { \ - if (inInfo.isContiguous()) { \ - HANDLE_CASE(TYPE, OUT, -2); \ - } else { \ - switch (IN) { \ - case 1: \ - HANDLE_CASE(TYPE, OUT, 1); \ - break; \ - case 2: \ - HANDLE_CASE(TYPE, OUT, 2); \ - break; \ - case 3: \ - HANDLE_CASE(TYPE, OUT, 3); \ - break; \ - default: \ - HANDLE_CASE(TYPE, OUT, -1); \ - break; \ - } \ - } \ +#define HANDLE_IN_CASE(T, TYPE, OUT, IN) \ + { \ + if (inInfo.isContiguous()) { \ + HANDLE_CASE(T, TYPE, OUT, -2); \ + } else { \ + switch (IN) { \ + case 1: \ + HANDLE_CASE(T, TYPE, OUT, 1); \ + break; \ + case 2: \ + HANDLE_CASE(T, TYPE, OUT, 2); \ + break; \ + default: \ + HANDLE_CASE(T, TYPE, OUT, -1); \ + break; \ + } \ + } \ } -#define HANDLE_OUT_CASE(TYPE, OUT, IN) \ - { \ - if (outInfo.isContiguous()) { \ - HANDLE_IN_CASE(TYPE, -2, IN); \ - } else { \ - switch (OUT) { \ - case 1: \ - HANDLE_IN_CASE(TYPE, 1, IN); \ - break; \ - case 2: \ - HANDLE_IN_CASE(TYPE, 2, IN); \ - break; \ - case 3: \ - HANDLE_IN_CASE(TYPE, 3, IN); \ - break; \ - default: \ - HANDLE_IN_CASE(TYPE, -1, IN); \ - break; \ - } \ - } \ +#define HANDLE_OUT_CASE(T, TYPE, OUT, IN) \ + { \ + if (outInfo.isContiguous()) { \ + HANDLE_IN_CASE(T, TYPE, -2, IN); \ + } else { \ + switch (OUT) { \ + case 1: \ + HANDLE_IN_CASE(T, TYPE, 1, IN); \ + break; \ + case 2: \ + HANDLE_IN_CASE(T, TYPE, 2, IN); \ + break; \ + case 3: \ + HANDLE_IN_CASE(T, TYPE, 3, IN); \ + break; \ + default: \ + HANDLE_IN_CASE(T, TYPE, -1, IN); \ + break; \ + } \ + } \ } - if (THC_canUse32BitIndexMath(state, out) && - THC_canUse32BitIndexMath(state, in)) { - TensorInfo<unsigned int> outInfo(state, out); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, out) && + 
TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, in)) { + TensorInfo<float, unsigned int> outInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, out); outInfo.collapseDims(); - TensorInfo<unsigned int> inInfo(state, in, dim); + TensorInfo<float, unsigned int> inInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, in); + inInfo.reduceDim(dim); inInfo.collapseDims(); - HANDLE_OUT_CASE(unsigned int, outInfo.dims, inInfo.dims); + HANDLE_OUT_CASE(typename TensorUtils<THCudaTensor>::DataType, + unsigned int, outInfo.dims, inInfo.dims); } else { - TensorInfo<unsigned long> outInfo(state, out); + TensorInfo<float, unsigned long> outInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, out); outInfo.collapseDims(); - TensorInfo<unsigned long> inInfo(state, in, dim); + TensorInfo<float, unsigned long> inInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, in); + inInfo.reduceDim(dim); inInfo.collapseDims(); // For large tensors, we only compile the completely contiguous // version and the completely generic version, to reduce // compilation time. if (outInfo.isContiguous() && inInfo.isContiguous()) { - HANDLE_CASE(unsigned long, -2, -2); + HANDLE_CASE(typename TensorUtils<THCudaTensor>::DataType, + unsigned long, -2, -2); } else { - HANDLE_CASE(unsigned long, -1, -1); + HANDLE_CASE(typename TensorUtils<THCudaTensor>::DataType, + unsigned long, -1, -1); } } #undef HANDLE_CASE diff --git a/lib/THC/THCReduceAll.cuh b/lib/THC/THCReduceAll.cuh index cfe40fd..3fe81a9 100644 --- a/lib/THC/THCReduceAll.cuh +++ b/lib/THC/THCReduceAll.cuh @@ -18,24 +18,28 @@ #define THC_TWO_PASS_REDUCTION_SIZE 2048L // Kernel that handles an entire reduction of a tensor in one pass -template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims> +template <typename ModifyOp, + typename ReduceOp, + typename T, + typename IndexType, + int ADims> __global__ void -THCudaTensor_reduceAll(TensorInfo<IndexType> in, +THCudaTensor_reduceAll(TensorInfo<T, IndexType> in, IndexType totalElements, - float init, + T init, ModifyOp modifyOp, ReduceOp reduceOp, - float* out) { + T* out) { // With a block-wide stride, have each thread perform its own reduction. 
- float r = init; + T r = init; for (IndexType i = threadIdx.x; i < totalElements; i += blockDim.x) { - const IndexType inOffset = IndexToOffset<IndexType, ADims>::get(i, in); + const IndexType inOffset = IndexToOffset<T, IndexType, ADims>::get(i, in); r = reduceOp(r, modifyOp(in.data[inOffset])); } // Reduce within the block - extern __shared__ float smem[]; - r = reduceBlock<float, ReduceOp>(smem, blockDim.x, r, reduceOp, init); + extern __shared__ T smem[]; + r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init); if (threadIdx.x == 0) { // Write out reduced value @@ -56,27 +60,31 @@ __device__ __forceinline__ IndexType getEndIndex(IndexType totalSize) { } // Kernel that handles an entire reduction of a tensor in two passes -template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims> +template <typename ModifyOp, + typename ReduceOp, + typename T, + typename IndexType, + int ADims> __global__ void -THCudaTensor_reduceAllPass1(TensorInfo<IndexType> in, +THCudaTensor_reduceAllPass1(TensorInfo<T, IndexType> in, IndexType totalElements, - float init, + T init, ModifyOp modifyOp, ReduceOp reduceOp, - float* scratchSpace) { + T* scratchSpace) { const IndexType startIndex = getStartIndex<IndexType>(totalElements); const IndexType endIndex = getEndIndex<IndexType>(totalElements); // With a block-wide stride, have each thread perform its own reduction. - float r = init; + T r = init; for (IndexType i = startIndex + threadIdx.x; i < endIndex; i += blockDim.x) { - const IndexType inOffset = IndexToOffset<IndexType, ADims>::get(i, in); + const IndexType inOffset = IndexToOffset<T, IndexType, ADims>::get(i, in); r = reduceOp(r, modifyOp(in.data[inOffset])); } // Reduce within the block - extern __shared__ float smem[]; - r = reduceBlock<float, ReduceOp>(smem, blockDim.x, r, reduceOp, init); + extern __shared__ T smem[]; + r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init); if (threadIdx.x == 0) { // Write out block-wide reduced value @@ -84,21 +92,21 @@ THCudaTensor_reduceAllPass1(TensorInfo<IndexType> in, } } -template <typename ReduceOp, typename IndexType> +template <typename ReduceOp, typename T, typename IndexType> __global__ void THCudaTensor_reduceAllPass2(int numPass1Blocks, - float init, + T init, ReduceOp reduceOp, - float* scratchSpace, - float* out) { - float r = init; + T* scratchSpace, + T* out) { + T r = init; if (threadIdx.x < numPass1Blocks) { r = scratchSpace[threadIdx.x]; } // Reduce within the block - extern __shared__ float smem[]; - r = reduceBlock<float, ReduceOp>(smem, numPass1Blocks, r, reduceOp, init); + extern __shared__ T smem[]; + r = reduceBlock<T, ReduceOp>(smem, numPass1Blocks, r, reduceOp, init); if (threadIdx.x == 0) { *out = r; @@ -111,12 +119,13 @@ inline bool isTwoPassReductionSize(long elements) { return (elements > THC_TWO_PASS_REDUCTION_SIZE); } +template <typename T> inline long getTwoPassBlocks(THCState* state, long elements) { long numBlocks = THCCeilDiv(elements, THC_REDUCE_ALL_BLOCK_SIZE); // We can only have as many blocks as there is scratch space long scratchSpace = - THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(float); + THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(T); THAssert(scratchSpace > 0); if (numBlocks > scratchSpace) { @@ -127,60 +136,67 @@ inline long getTwoPassBlocks(THCState* state, long elements) { } // Get the block/grid size that we want +template <typename T> inline void getPass1ReduceBlockGrid(THCState* state, long elements, dim3& grid, dim3& block) { - grid = 
dim3(getTwoPassBlocks(state, elements)); + grid = dim3(getTwoPassBlocks<T>(state, elements)); block = dim3(THC_REDUCE_ALL_BLOCK_SIZE); } +template <typename T> inline void getPass2ReduceBlockGrid(THCState* state, long elements, dim3& grid, dim3& block) { grid = dim3(1); // We only need as many threads as there were blocks originally - block = dim3(getTwoPassBlocks(state, elements)); + block = dim3(getTwoPassBlocks<T>(state, elements)); } +template <typename T> inline void getSinglePassReduceBlockGrid(long elements, dim3& grid, dim3& block) { grid = dim3(1); block = dim3(THC_REDUCE_ALL_BLOCK_SIZE); } -template <typename ModifyOp, typename ReduceOp, typename IndexType, int ADims> +template <typename ModifyOp, + typename ReduceOp, + typename T, + typename IndexType, + int ADims> void callReduceAll(THCState* state, - const TensorInfo<IndexType>& in, + const TensorInfo<T, IndexType>& in, long totalElements, - float init, + T init, const ModifyOp& modifyOp, const ReduceOp& reduceOp, - float* devOut) { + T* devOut) { dim3 grid; dim3 block; if (isTwoPassReductionSize(totalElements)) { - getPass1ReduceBlockGrid(state, totalElements, grid, block); - size_t smemSize = block.x * sizeof(float); + getPass1ReduceBlockGrid<T>(state, totalElements, grid, block); + size_t smemSize = block.x * sizeof(T); - THCudaTensor_reduceAllPass1<ModifyOp, ReduceOp, IndexType, ADims> + THCudaTensor_reduceAllPass1<ModifyOp, ReduceOp, T, IndexType, ADims> <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>( in, (IndexType) totalElements, init, modifyOp, reduceOp, - (float*) THCState_getCurrentDeviceScratchSpace(state)); + (T*) THCState_getCurrentDeviceScratchSpace(state)); int numPass1Blocks = grid.x; - getPass2ReduceBlockGrid(state, totalElements, grid, block); - smemSize = block.x * sizeof(float); + getPass2ReduceBlockGrid<T>(state, totalElements, grid, block); + smemSize = block.x * sizeof(T); - THCudaTensor_reduceAllPass2<ReduceOp, IndexType> + THCudaTensor_reduceAllPass2<ReduceOp, T, IndexType> <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>( numPass1Blocks, init, reduceOp, - (float*) THCState_getCurrentDeviceScratchSpace(state), + (T*) THCState_getCurrentDeviceScratchSpace(state), devOut); } else { - getSinglePassReduceBlockGrid(totalElements, grid, block); - size_t smemSize = block.x * sizeof(float); + getSinglePassReduceBlockGrid<T>(totalElements, grid, block); + size_t smemSize = block.x * sizeof(T); - THCudaTensor_reduceAll<ModifyOp, ReduceOp, IndexType, ADims> + THCudaTensor_reduceAll<ModifyOp, ReduceOp, T, IndexType, ADims> <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>( in, (IndexType) totalElements, init, modifyOp, reduceOp, devOut); } @@ -224,8 +240,10 @@ bool THCudaTensor_reduceAll(THCState* state, // dimension, and the loop to translate the linear index to the array // index can be similarly collapsed. That is what this unrolling is for. 
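The two-pass structure used by callReduceAll above, summarized as a sketch (the exact block-size constant and scratch-buffer size come from the build and are not quoted here):

    // Pass 1: numBlocks blocks each reduce their slice of the input, writing
    //         one partial of type T into the per-device scratch buffer.
    // Pass 2: a single block of numBlocks threads reduces those partials into *devOut.
    long numBlocks = THCCeilDiv(elements, THC_REDUCE_ALL_BLOCK_SIZE);
    long maxBlocks = THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(T);
    if (numBlocks > maxBlocks) numBlocks = maxBlocks;    // one partial per block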
#define HANDLE_CASE(TYPE, IN) \ - callReduceAll<ModifyOp, ReduceOp, TYPE, IN>( \ - state, inInfo, inElements, init, modifyOp, reduceOp, devOut); + callReduceAll<ModifyOp, ReduceOp, \ + typename TensorUtils<THCudaTensor>::DataType, \ + TYPE, IN>( \ + state, inInfo, inElements, init, modifyOp, reduceOp, devOut); #define HANDLE_IN_CASE(TYPE, IN) \ { \ @@ -249,13 +267,15 @@ bool THCudaTensor_reduceAll(THCState* state, } \ } - if (THC_canUse32BitIndexMath(state, in)) { - TensorInfo<unsigned int> inInfo(state, in); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, in)) { + TensorInfo<float, unsigned int> inInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, in); inInfo.collapseDims(); HANDLE_IN_CASE(unsigned int, inInfo.dims); } else { - TensorInfo<unsigned long long> inInfo(state, in); + TensorInfo<float, unsigned long long> inInfo = + getTensorInfo<THCudaTensor, unsigned long long>(state, in); inInfo.collapseDims(); // For large tensors, we only compile the completely contiguous diff --git a/lib/THC/THCReduceApplyUtils.cu b/lib/THC/THCReduceApplyUtils.cu index 10d5d89..f92bb99 100644 --- a/lib/THC/THCReduceApplyUtils.cu +++ b/lib/THC/THCReduceApplyUtils.cu @@ -11,29 +11,6 @@ void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg) { THArgCheck(dims <= MAX_CUTORCH_DIMS, arg, CUTORCH_DIM_WARNING); } -bool THC_canUse32BitIndexMath(THCState* state, THCudaTensor* t) { - long elements = THCudaTensor_nElement(state, t); - if (elements >= UINT_MAX) { - return false; - } - - long offset = 0; - long linearId = elements - 1; - - for (int i = THCudaTensor_nDimension(state, t) - 1; i >= 0; --i) { - long curDimIndex = linearId % THCudaTensor_size(state, t, i); - long curDimOffset = curDimIndex * THCudaTensor_stride(state, t, i); - offset += curDimOffset; - linearId /= THCudaTensor_size(state, t, i); - } - - if (offset >= UINT_MAX) { - return false; - } - - return true; -} - bool THC_getGridFromTiles(long gridTiles, dim3& grid) { if (gridTiles > MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE) { return false; @@ -56,73 +33,3 @@ bool THC_getGridFromTiles(long gridTiles, dim3& grid) { grid = dim3(gridX, gridY, gridZ); return true; } - -namespace { - -struct SizeAndStride { - long size; - long stride; -}; - -int compareSizeAndStride(const void* a, const void* b) { - const SizeAndStride* aS = (const SizeAndStride*) a; - const SizeAndStride* bS = (const SizeAndStride*) b; - - return aS->stride < bS->stride; -} - -} - -bool THC_overlappingIndices(THCState* state, THCudaTensor* t) { - // In this function, we don't care about permutations of the - // size/stride arrays (transpositions). - // We order the size/stride arrays by stride, skipping dimensions of - // size 1. Strides of dimensions of size 1 don't matter, since there - // is only one addressing point in them. - // In this reordered view, the tensor is contiguous if - // stride[dim] == size[dim + 1] * stride[dim + 1] for all `dim`. - // The tensor has holes if - // stride[dim] > size[dim + 1] * stride[dim + 1] for one or more - // `dim`. - // The tensor has overlaps if - // stride[dim] < size[dim + 1] * stride[dim + 1] for one or more - // `dim`, or the innermost stride is 0. - - // Extract size/stride arrays; only consider size >1 dims. 
- SizeAndStride info[MAX_CUTORCH_DIMS]; - - int dims = THCudaTensor_nDimension(state, t); - int nonSize1Dims = 0; - for (int i = 0; i < dims; ++i) { - long size = THCudaTensor_size(state, t, i); - if (size > 1) { - info[nonSize1Dims].size = size; - info[nonSize1Dims].stride = THCudaTensor_stride(state, t, i); - ++nonSize1Dims; - } - } - - if (nonSize1Dims == 0) { - // no overlap - return false; - } - - // Ascending order (innermost dimension in sorted view is at [0]) - qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); - - // Base case: innermost dimension must have stride >= 1 - if (info[nonSize1Dims - 1].stride < 1) { - return true; - } - - // Subsequent dimensions, if any - for (int i = nonSize1Dims - 2; i >= 0; --i) { - if (info[i].stride < info[i + 1].size * info[i + 1].stride) { - // There are overlaps - return true; - } - } - - // Tensor has holes or is contiguous - return false; -} diff --git a/lib/THC/THCReduceApplyUtils.cuh b/lib/THC/THCReduceApplyUtils.cuh index d5ac2b5..06f969f 100644 --- a/lib/THC/THCReduceApplyUtils.cuh +++ b/lib/THC/THCReduceApplyUtils.cuh @@ -6,284 +6,12 @@ #include "THCGeneral.h" #include "THCTensor.h" #include "THCDeviceUtils.cuh" - -// Maximum number of dimensions allowed for cutorch -#define MAX_CUTORCH_DIMS 25 - -// Warning string for tensor arguments that are too large or have too -// many dimensions -#define CUTORCH_STR(X) #X -#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \ - CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions" +#include "THCTensorInfo.cuh" // Enum that indicates whether tensor arguments are read/write or // read-only enum TensorArgType { ReadWrite, ReadOnly }; -// Copy operator for the pointwise apply kernel -template <typename T> -struct CopyOp { - __device__ __forceinline__ void operator()(T* dst, T* src) { -#if __CUDA_ARCH__ >= 350 - *dst = __ldg(src); -#else - *dst = *src; -#endif - } -}; - -// CUDA kernel argument that defines tensor layout -template <typename IndexType> -struct TensorInfo { - // Extracts size/stride information for the kernel. - // The optional `reduceDim` indicates a reduction dimension for the - // given tensor, so that the resulting size for this dimension will be 1. - TensorInfo(THCState* state, THCudaTensor* t, int reduceDim = -1); - - // Collapses all runs of successive dimensions if the size/strides - // match up within the run and there are no holes between the - // dimensions. - // If excludeDim is set (not -1), then excludeDim will not be - // collapsed with any other dimension. - // Function returns the new dimension index that excludeDim maps to, - // since the collapsed dimensions are <= the input dimensions. 
- int collapseDims(int excludeDim = -1); - - // Contiguous tensors of more than one dimension are collapsed down - // to one tensor - __host__ __device__ inline bool isContiguous() const { - return (dims == 1 && strides[0] == 1); - } - - float* data; - IndexType sizes[MAX_CUTORCH_DIMS]; - IndexType strides[MAX_CUTORCH_DIMS]; - int dims; -}; - -template <typename IndexType> -TensorInfo<IndexType>::TensorInfo(THCState* state, - THCudaTensor* t, - int reduceDim) { - data = THCudaTensor_data(state, t); - dims = THCudaTensor_nDimension(state, t); - assert(dims <= MAX_CUTORCH_DIMS); - - for (int i = 0; i < dims; ++i) { - sizes[i] = THCudaTensor_size(state, t, i); - strides[i] = THCudaTensor_stride(state, t, i); - } - - assert(reduceDim == -1 || (reduceDim < dims && reduceDim >= 0)); - - if (reduceDim != -1) { - sizes[reduceDim] = 1; - } -} - -template <typename IndexType> -int -TensorInfo<IndexType>::collapseDims(int excludeDim) { - // Find the innermost dimension not of size 1, since dimensions of size 1 are - // collapsible. - int firstNonOneDim = -1; - - for (int i = dims - 1; i >= 0; --i) { - if (i == excludeDim) { - // We cannot collapse this dimension, even if it is size 1 - firstNonOneDim = i; - break; - } - - if (sizes[i] != 1) { - firstNonOneDim = i; - break; - } - } - - // Special case: if all dimensions are of size 1, then this is a - // single-point tensor that we still have to operate on. Reduce to a - // single point. - if (firstNonOneDim == -1) { - assert(excludeDim == -1); - - dims = 1; - sizes[0] = 1; - strides[0] = 1; - - // Everything effectively got collapsed into this dimension - return 0; - } - - // Count the number of successive dimensions that can be collapsed, from - // innermost to outermost. - int numCollapsed = 0; - - // Skip the leading size 1 dims - numCollapsed += dims - 1 - firstNonOneDim; - - // We perform one pass through to determine how many dimensions we - // can collapse, before calculating the actual size of the collapsed - // dimensions. - // size/strideInner are the size/strides of the previous inner - // non-collapsible dim we encounter. - long sizeInner = sizes[firstNonOneDim]; - long strideInner = strides[firstNonOneDim]; - - for (int i = firstNonOneDim - 1; i >= 0; --i) { - long sizeOuter = sizes[i]; - long strideOuter = strides[i]; - - // Don't collapse this dimension if we want to exclude it from - // collapsing. - // Since this code is attempting to collapse a subsequent - // dimension (i) with the preceding dimension (i + 1), we can only - // perform collapsing if the preceding dimension can be collapsed - // (i.e., not excludeDim) - if ((excludeDim != i) && (excludeDim != i + 1)) { - // The next outermost dimension can be skipped if size 1 - if (sizeOuter == 1) { - ++numCollapsed; - continue; - } - - // If the next outermost dimension is contiguous with the - // previous non-collapsed one, collapse it - if (strideOuter == strideInner * sizeInner) { - ++numCollapsed; - - // This is the run of collapsed dimensions' size - sizeInner = sizeInner * sizeOuter; - continue; - } - } - - // Otherwise, this new outer dimension at `i` cannot be collapsed - // because it is excluded from collapsing, or it is not contiguous - // with the previous inner dimension. - sizeInner = sizeOuter; - strideInner = strideOuter; - } - - // This will be our new size/stride and dimension. 
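A worked example of the dimension collapsing performed by collapseDims (the counting pass is above, the rebuilding pass continues below; sizes/strides here are illustrative):

    // sizes = {4, 1, 3, 2}, strides = {6, 6, 2, 1}
    // dim 1 has size 1 and is skipped; dim 2 is contiguous with dim 3
    // (stride 2 == 2 * 1) and dim 0 is contiguous with that {3, 2} run
    // (stride 6 == 3 * 2), so the whole tensor collapses to
    // sizes = {24}, strides = {1}, dims = 1  ->  isContiguous() is true,
    // and the apply/reduce kernels take the cheap -2 code path.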
- IndexType newSizes[MAX_CUTORCH_DIMS]; - IndexType newStrides[MAX_CUTORCH_DIMS]; - - assert(numCollapsed < dims); - int newDims = dims - numCollapsed; - - // We return the index of the excluded dimension that is excluded - // from being collapsed here. - int returnDim = -1; - - // We perform a second pass through the dimensions to actually - // calculate the size of the collapsed dimensions. - int collapsedIndex = dims - numCollapsed - 1; - newSizes[collapsedIndex] = sizes[firstNonOneDim]; - newStrides[collapsedIndex] = strides[firstNonOneDim]; - - if (firstNonOneDim == excludeDim) { - returnDim = collapsedIndex; - } - - for (int i = firstNonOneDim - 1; i >= 0; --i) { - IndexType sizeOuter = sizes[i]; - IndexType strideOuter = strides[i]; - - if ((excludeDim != i) && (excludeDim != i + 1)) { - if (sizeOuter == 1) { - // skip - continue; - } - - if (strideOuter == newSizes[collapsedIndex] * newStrides[collapsedIndex]) { - // collapse - newSizes[collapsedIndex] *= sizeOuter; - continue; - } - } - - // Otherwise, strides don't match, or dim `i` is excluded from - // collapsing. - --collapsedIndex; - assert(collapsedIndex >= 0); - assert(collapsedIndex < newDims); - newSizes[collapsedIndex] = sizeOuter; - newStrides[collapsedIndex] = strideOuter; - - if (excludeDim == i) { - returnDim = collapsedIndex; - } - } - - // We must have filled all the dimensions we're looking for - assert(collapsedIndex == 0); - assert((excludeDim == -1) || (returnDim != -1)); - - dims = newDims; - - for (int i = 0; i < dims; ++i) { - sizes[i] = newSizes[i]; - strides[i] = newStrides[i]; - } - - // After collapsing, the original `excludeDim` may have been - // renumbered to this new `returnDim`, since some dimensions could - // have been collapsed. - return returnDim; -} - -// Translate a linear index for the apply to a float* offset; -// specialized on `Dims` to reduce nvcc compilation time -template <typename IndexType, int Dims> -struct IndexToOffset { - static __host__ __device__ IndexType get( - IndexType linearId, - const TensorInfo<IndexType>& info) { - IndexType offset = 0; - - // Use static dims - for (int i = Dims - 1; i >= 0; --i) { - IndexType curDimIndex = linearId % info.sizes[i]; - IndexType curDimOffset = curDimIndex * info.strides[i]; - offset += curDimOffset; - - if (i > 0) { - linearId /= info.sizes[i]; - } - } - - return offset; - } -}; - -template <typename IndexType> -struct IndexToOffset<IndexType, -2> { - static __forceinline__ __host__ __device__ IndexType - get(IndexType linearId, const TensorInfo<IndexType>& info) { - return linearId; - } -}; - -template <typename IndexType> -struct IndexToOffset<IndexType, -1> { - static __forceinline__ __host__ __device__ IndexType - get(IndexType linearId, const TensorInfo<IndexType>& info) { - IndexType offset = 0; - - // Use dynamic dims - for (int i = info.dims - 1; i >= 0; --i) { - IndexType curDimIndex = linearId % info.sizes[i]; - IndexType curDimOffset = curDimIndex * info.strides[i]; - offset += curDimOffset; - - linearId /= info.sizes[i]; - } - - return offset; - } -}; - template <typename IndexType> __device__ __forceinline__ IndexType getLinearBlockId() { return blockIdx.z * gridDim.y * gridDim.x + @@ -347,16 +75,7 @@ __device__ T reduceBlock(T* smem, // Make sure the given tensor doesn't have too many dimensions void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg); -// Returns true if all linear ID -> offset math can be performed using 32 bit -// unsigned math, which is faster than 64 bit math -THC_API bool 
THC_canUse32BitIndexMath(THCState* state, THCudaTensor* t); - // Produces a grid with at least one point per tile THC_API bool THC_getGridFromTiles(long gridTiles, dim3& grid); -// Determines if the given tensor has overlapping data points (i.e., -// is there more than one index into the tensor that references the -// same piece of data)? -THC_API bool THC_overlappingIndices(THCState* state, THCudaTensor* t); - #endif // THC_REDUCE_APPLY_UTILS_INC diff --git a/lib/THC/THCSortUtils.cuh b/lib/THC/THCSortUtils.cuh index 42a8018..baf4a8e 100644 --- a/lib/THC/THCSortUtils.cuh +++ b/lib/THC/THCSortUtils.cuh @@ -2,6 +2,7 @@ #define THC_SORT_UTILS_INC #include "THCReduceApplyUtils.cuh" +#include "THCTensorTypeUtils.cuh" // Collection of kernel sort routines template <typename T> @@ -91,11 +92,11 @@ template <typename K, typename V, int KeyDims, int ValueDims, typename Comparator, typename IndexType, int Power2SortSize> __global__ void -bitonicSortKVInPlace(TensorInfo<IndexType> keys, +bitonicSortKVInPlace(TensorInfo<K, IndexType> keys, IndexType keySlices, IndexType keySliceSize, IndexType keySliceStride, - TensorInfo<IndexType> values, + TensorInfo<V, IndexType> values, IndexType valueSliceStride, const Comparator& comp) { // Find the slice of the tensor that we are sorting @@ -111,9 +112,9 @@ bitonicSortKVInPlace(TensorInfo<IndexType> keys, __shared__ bool sharedValid[Power2SortSize]; const IndexType keyStartOffset = - IndexToOffset<IndexType, KeyDims>::get(linearIndex, keys); + IndexToOffset<K, IndexType, KeyDims>::get(linearIndex, keys); const IndexType valueStartOffset = - IndexToOffset<IndexType, ValueDims>::get(linearIndex, values); + IndexToOffset<V, IndexType, ValueDims>::get(linearIndex, values); // If the sort size is 1, the data is already sorted if (Power2SortSize == 1) { diff --git a/lib/THC/THCStorageCopy.h b/lib/THC/THCStorageCopy.h index ec8011d..837056f 100644 --- a/lib/THC/THCStorageCopy.h +++ b/lib/THC/THCStorageCopy.h @@ -3,6 +3,7 @@ #include "THCStorage.h" #include "THCGeneral.h" +#include "THCHalf.h" #include "generic/THCStorageCopy.h" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorCopy.cu b/lib/THC/THCTensorCopy.cu index f6545a6..ea38d2e 100644 --- a/lib/THC/THCTensorCopy.cu +++ b/lib/THC/THCTensorCopy.cu @@ -7,5 +7,226 @@ inline int curGPU() { return curDev; } +// Copy operator for the pointwise apply kernel +template <typename TypeDst, typename TypeSrc> +struct CopyOp { + __device__ __forceinline__ void operator()(TypeDst* dst, TypeSrc* src) { +#if __CUDA_ARCH__ >= 350 + *dst = ScalarConvert<TypeSrc, TypeDst>::to(__ldg(src)); +#else + *dst = ScalarConvert<TypeSrc, TypeDst>::to(*src); +#endif + } +}; + +// Copy for the same type to the same type +template <typename TensorTypeDst, typename TensorTypeSrc> +void +THC_copyTensor(THCState* state, TensorTypeDst* dst, TensorTypeSrc* src) { + long totalElements = TensorUtils<TensorTypeDst>::getNumElements(state, dst); + + THArgCheck(totalElements == + TensorUtils<TensorTypeSrc>::getNumElements(state, src), + 2, "sizes do not match"); + + if (TensorUtils<TensorTypeDst>::getDims(state, dst) == 0) { + // Zero-dim tensor; copy nothing + return; + } + + // We can memcpy the memory if: + // -both tensors are contiguous; or, + // -there is only one element to copy; or, + // -FIXME: if both tensors have matching size and stride arrays, and no + // holes within (in other words, there is some permutation that can be applied + // to the size/strides such that the resulting tensor is + // contiguous). 
+ // -AND: both tensors have the same type. + bool sameType = isSameType<TensorTypeSrc, TensorTypeDst>(); + bool srcContig = TensorUtils<TensorTypeSrc>::isContiguous(state, src); + bool dstContig = TensorUtils<TensorTypeDst>::isContiguous(state, dst); + bool memcpyEligible = + ((srcContig && dstContig) || (totalElements == 1)) && sameType; + + + int srcDev = TensorUtils<TensorTypeSrc>::getDevice(state, src); + int dstDev = TensorUtils<TensorTypeDst>::getDevice(state, dst); + int oldDev = curGPU(); + + // We always perform the copy on the source device, using the + // current stream on the source device. + // If the copy is on the default stream, then we fully synchronize + // both src and dst's default streams for completion of the + // copy. We have to explicitly do this for non-contig copies. + // This mimics the behavior of cross-device cudaMemcpyAsync on + // the default stream. + // If the copy is not on the default stream, then it is up to the + // user to add needed synchronization on the dst device, since the + // stream on the dst device that wishes to synchronize may not be + // the same index as the one on the src device. + int copyStreamIndex = + THCState_getCurrentStreamIndex(state); + cudaStream_t copyStream = + THCState_getDeviceStream(state, srcDev, copyStreamIndex); + + if (srcDev != dstDev && copyStreamIndex == 0) { + // This is a cross-device copy on the default stream. We perform a + // two-way barrier between both devices' default streams before + // the copy. This ensures that any write-after-write and + // write-after-read dependencies on the destination side are + // handled, so that no one is operating on the dst memory when + // we perform the copy. + // src waits on dst barrier (src already waits on src) + cudaEvent_t dstReady; + THCudaCheck(cudaSetDevice(dstDev)); + THCudaCheck(cudaEventCreateWithFlags(&dstReady, cudaEventDisableTiming)); + THCudaCheck(cudaEventRecord(dstReady, NULL)); + + THCudaCheck(cudaSetDevice(srcDev)); + THCudaCheck(cudaStreamWaitEvent(NULL, dstReady, 0)); + THCudaCheck(cudaEventDestroy(dstReady)); + } else if (srcDev != oldDev) { + THCudaCheck(cudaSetDevice(srcDev)); + } + + // We are now on srcDev + if (memcpyEligible) { + // Perform the copy + THCudaCheck(cudaMemcpyAsync( + TensorUtils<TensorTypeDst>::getData(state, dst), + TensorUtils<TensorTypeSrc>::getData(state, src), + totalElements * + sizeof(typename TensorUtils<TensorTypeDst>::DataType), + cudaMemcpyDeviceToDevice, + copyStream)); + } else { + // Non-contiguous copy or a type-conversion copy + + // We avoid creating temporary memory copies if possible. + // If both src and dst are on the same device, or if they are on + // different devices and p2p access is enabled, perform the copy + // by a pointwise copy kernel. + // Otherwise, we'll have to make contiguous (which will in fact + // invoke copy() again), and then perform the copy. + // FIXME: might want to consider only running the pointwise kernel + // if both src and dst innermost dimensions are contiguous. If + // they are not, then taking the hit of the memory allocation/free + // might be worth it to avoid non-coalesced reads or writes. 
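The default-stream handshake described in the comments above amounts to recording an event on one device's NULL stream and making the other device's NULL stream wait on it. A minimal standalone sketch of that pattern (the helper name and argument order are placeholders, not part of the library):

// Make `waiterDev`'s default stream wait until all work previously
// queued on `signalDev`'s default stream has completed.
static void defaultStreamBarrier(int signalDev, int waiterDev) {
  cudaEvent_t ready;
  THCudaCheck(cudaSetDevice(signalDev));
  THCudaCheck(cudaEventCreateWithFlags(&ready, cudaEventDisableTiming));
  THCudaCheck(cudaEventRecord(ready, NULL));    // NULL == default stream
  THCudaCheck(cudaSetDevice(waiterDev));
  THCudaCheck(cudaStreamWaitEvent(NULL, ready, 0));
  THCudaCheck(cudaEventDestroy(ready));
}

THC_copyTensor applies this twice: dst signals and src waits before the copy, then src signals and dst waits once the copy has been enqueued.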
+ + // A device always has access to itself, so this also handles the + // case srcDev == dstDev + if (THCState_getPeerToPeerAccess(state, srcDev, dstDev)) { + // Make sure we have the current stream set in THCState, since + // pointwise uses that + if (srcDev != oldDev) { + THCState_setStream(state, srcDev, copyStreamIndex); + } + + bool succ = + THC_pointwiseApply2( + state, dst, src, + CopyOp<typename TensorUtils<TensorTypeDst>::DataType, + typename TensorUtils<TensorTypeSrc>::DataType>()); + + // Restore prior THCState stream + if (srcDev != oldDev) { + THCState_setStream(state, oldDev, copyStreamIndex); + } + + THArgCheck(succ, 2, CUTORCH_DIM_WARNING); + } else { + // GPUs can't access each other directly, but the tensors + // involved are non-contiguous and/or are different types. + + // Make sure the src is contiguous and in the same type as dst + THCudaCheck(cudaSetDevice(srcDev)); + TensorTypeDst* srcContig = NULL; + + if (sameType) { + srcContig = + (TensorTypeDst*) // this is actually the same type as src + TensorUtils<TensorTypeSrc>::newContiguous(state, src); + + } else { + // Types are different + // Copy into the new format, contiguous, on the source device + srcContig = TensorUtils<TensorTypeDst>::newTensor(state); + TensorUtils<TensorTypeDst>::resizeAs(state, srcContig, dst); + + if (srcDev != oldDev) { + THCState_setStream(state, srcDev, copyStreamIndex); + } + + bool succ = + THC_pointwiseApply2( + state, srcContig, src, + CopyOp<typename TensorUtils<TensorTypeDst>::DataType, + typename TensorUtils<TensorTypeSrc>::DataType>()); + + // Restore prior THCState stream + if (srcDev != oldDev) { + THCState_setStream(state, oldDev, copyStreamIndex); + } + + THArgCheck(succ, 2, CUTORCH_DIM_WARNING); + } + + // Make sure the dst is contiguous + THCudaCheck(cudaSetDevice(dstDev)); + TensorTypeDst* dstContig = + TensorUtils<TensorTypeDst>::newContiguous(state, dst); + + // Now, we are ready for a cross-device memcpy of contiguous + // data, of the same layout and type + THCudaCheck(cudaSetDevice(srcDev)); + + THCudaCheck(cudaMemcpyAsync( + TensorUtils<TensorTypeDst>::getData(state, dstContig), + TensorUtils<TensorTypeDst>::getData(state, srcContig), + totalElements * + sizeof(typename TensorUtils<TensorTypeDst>::DataType), + cudaMemcpyDeviceToDevice, + copyStream)); + + // We are done with the src + TensorUtils<TensorTypeDst>::free(state, srcContig); + + if (dst != dstContig) { + TensorUtils<TensorTypeDst>::freeCopyTo(state, dstContig, dst); + } else { + TensorUtils<TensorTypeDst>::free(state, dstContig); + } + + // We're still on srcDev at this point + } + } + + if (srcDev != dstDev && copyStreamIndex == 0) { + // dst waits on src barrier (dst already waits on dst). We cannot + // operate on dst's copy until the copy is complete. + + // Still on srcDev, record default stream event + cudaEvent_t srcReady; + THCudaCheck(cudaEventCreateWithFlags(&srcReady, cudaEventDisableTiming)); + THCudaCheck(cudaEventRecord(srcReady, NULL)); + + THCudaCheck(cudaSetDevice(dstDev)); + THCudaCheck(cudaStreamWaitEvent(NULL, srcReady, 0)); + THCudaCheck(cudaEventDestroy(srcReady)); + + // We are now on dstDev (right above). Restore prior device from dst + if (dstDev != oldDev) { + THCudaCheck(cudaSetDevice(oldDev)); + } + } else { + // We are still on srcDev. 
Restore prior device from src + if (srcDev != oldDev) { + THCudaCheck(cudaSetDevice(oldDev)); + } + } + + THCudaCheck(cudaGetLastError()); +} + #include "generic/THCTensorCopy.cu" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorCopy.h b/lib/THC/THCTensorCopy.h index fc206cb..e8bc4f4 100644 --- a/lib/THC/THCTensorCopy.h +++ b/lib/THC/THCTensorCopy.h @@ -3,6 +3,7 @@ #include "THCTensor.h" #include "THCGeneral.h" +#include "THCHalf.h" #include "generic/THCTensorCopy.h" #include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorIndex.cu b/lib/THC/THCTensorIndex.cu index a23daa9..e0d721f 100644 --- a/lib/THC/THCTensorIndex.cu +++ b/lib/THC/THCTensorIndex.cu @@ -15,13 +15,13 @@ // indexCopyLargeIndex kernel is a better choice to increase // parallelism. template <typename IndexType, int DstDim, int SrcDim, int IdxDim> -__global__ void indexCopySmallIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> src, - TensorInfo<IndexType> indices, - int dstCopyDim, - int srcCopyDim, - IndexType innerSize, - long dstCopyDimSize) { +__global__ void indexCopySmallIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> indices, + int dstCopyDim, + int srcCopyDim, + IndexType innerSize, + long dstCopyDimSize) { // In order to avoid reloading the index that we are copying, load // it once to handle all of the points that are being selected, so // it can be reused as much as possible. This kernel is chosen when @@ -30,7 +30,7 @@ __global__ void indexCopySmallIndex(TensorInfo<IndexType> dst, for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) { // Lua indices begin at 1 IndexType dstIndex = - indices.data[IndexToOffset<IndexType, IdxDim>::get(srcIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(srcIndex, indices)] - 1; if (dstIndex < dstCopyDimSize) { // We stride over the output ignoring the indexed dimension @@ -39,12 +39,12 @@ __global__ void indexCopySmallIndex(TensorInfo<IndexType> dst, linearIndex < innerSize; linearIndex += gridDim.x * blockDim.x) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(linearIndex, dst); + IndexToOffset<float, IndexType, DstDim>::get(linearIndex, dst); dstOffset += dstIndex * dst.strides[dstCopyDim]; IndexType srcOffset = - IndexToOffset<IndexType, SrcDim>::get(linearIndex, src); + IndexToOffset<float, IndexType, SrcDim>::get(linearIndex, src); srcOffset += srcIndex * src.strides[srcCopyDim]; dst.data[dstOffset] = src.data[srcOffset]; @@ -60,13 +60,13 @@ __global__ void indexCopySmallIndex(TensorInfo<IndexType> dst, // indexCopySmallIndex kernel is a better choice to reduce memory // accesses. 
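The large-index kernels that follow flatten the (index, element-within-slice) pair into a single grid-stride loop and recover the pair with one divide and one modulo. A small worked example with made-up numbers:

// innerSize = 5 (elements per slice), indices.sizes[0] = 3
// The loop covers linearIndex = 0 .. 14.
//   linearIndex = 7  ->  srcIndex       = 7 / 5 = 1
//                        elementInSlice = 7 % 5 = 2
// i.e. this iteration copies element 2 of the slice addressed by the
// second index.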
template <typename IndexType, int DstDim, int SrcDim, int IdxDim> -__global__ void indexCopyLargeIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> src, - TensorInfo<IndexType> indices, - int dstCopyDim, - int srcCopyDim, - IndexType innerSize, - long dstCopyDimSize) { +__global__ void indexCopyLargeIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> indices, + int dstCopyDim, + int srcCopyDim, + IndexType innerSize, + long dstCopyDimSize) { // We stride over the output including the indexed dimension // (totalSize), and calculate the destination index point based on that for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; @@ -77,15 +77,15 @@ __global__ void indexCopyLargeIndex(TensorInfo<IndexType> dst, // Lua indices begin at 1 IndexType dstIndex = - indices.data[IndexToOffset<IndexType, IdxDim>::get(srcIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(srcIndex, indices)] - 1; if (dstIndex < dstCopyDimSize) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(elementInSlice, dst); + IndexToOffset<float, IndexType, DstDim>::get(elementInSlice, dst); dstOffset += dstIndex * dst.strides[dstCopyDim]; IndexType srcOffset = - IndexToOffset<IndexType, SrcDim>::get(elementInSlice, src); + IndexToOffset<float, IndexType, SrcDim>::get(elementInSlice, src); srcOffset += srcIndex * src.strides[srcCopyDim]; dst.data[dstOffset] = src.data[srcOffset]; @@ -138,13 +138,13 @@ void THCudaTensor_indexCopy(THCState *state, THCudaTensor *dst, int dim, THCudaT int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; #define SMALL_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ - indexCopySmallIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ + indexCopySmallIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ <<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \ dstInfo, srcInfo, indicesInfo, \ dstCopyDim, srcCopyDim, sliceSize, dstCopyDimSize); #define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ - indexCopyLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ + indexCopyLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \ dstInfo, srcInfo, indicesInfo, \ dstCopyDim, srcCopyDim, sliceSize, dstCopyDimSize); @@ -155,18 +155,21 @@ void THCudaTensor_indexCopy(THCState *state, THCudaTensor *dst, int dim, THCudaT dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, 128L), (long)(mpc * 8))); dim3 largeIndexBlock(std::min(srcTotalSize, 128L)); - if (THC_canUse32BitIndexMath(state, dst) && - THC_canUse32BitIndexMath(state, src) && - THC_canUse32BitIndexMath(state, indices)) { - TensorInfo<unsigned int> dstInfo(state, dst); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, dst) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) { + TensorInfo<float, unsigned int> dstInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, dst); int dstCopyDim = dstInfo.collapseDims(dim); - dstInfo.sizes[dstCopyDim] = 1; + dstInfo.reduceDim(dstCopyDim); - TensorInfo<unsigned int> srcInfo(state, src); + TensorInfo<float, unsigned int> srcInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, src); int srcCopyDim = srcInfo.collapseDims(dim); - srcInfo.sizes[srcCopyDim] = 1; + srcInfo.reduceDim(srcCopyDim); - TensorInfo<unsigned int> indicesInfo(state, indices); + TensorInfo<float, unsigned int> indicesInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, indices); indicesInfo.collapseDims(); 
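The canUse32BitIndexMath checks gating this branch are, in essence, a test that no offset the kernels can compute overflows an unsigned int. A simplified sketch of that idea for THCudaTensor (an approximation for illustration, not the TensorUtils implementation; UINT_MAX comes from <climits>):

// Approximate test: the largest element offset reachable through the
// tensor's sizes and strides must fit in 32 bits.
static bool roughlyFits32BitIndexing(THCState* state, THCudaTensor* t) {
  long maxOffset = 0;
  for (int d = 0; d < THCudaTensor_nDimension(state, t); ++d) {
    maxOffset += (THCudaTensor_size(state, t, d) - 1) *
                 THCudaTensor_stride(state, t, d);
  }
  return maxOffset <= (long) UINT_MAX;
}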
// A reasonable choice for when to have each thread iterate over @@ -193,15 +196,18 @@ void THCudaTensor_indexCopy(THCState *state, THCudaTensor *dst, int dim, THCudaT } } } else { - TensorInfo<unsigned long> dstInfo(state, dst); + TensorInfo<float, unsigned long> dstInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, dst); int dstCopyDim = dstInfo.collapseDims(dim); - dstInfo.sizes[dstCopyDim] = 1; + dstInfo.reduceDim(dstCopyDim); - TensorInfo<unsigned long> srcInfo(state, src); + TensorInfo<float, unsigned long> srcInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, src); int srcCopyDim = srcInfo.collapseDims(dim); - srcInfo.sizes[srcCopyDim] = 1; + srcInfo.reduceDim(srcCopyDim); - TensorInfo<unsigned long> indicesInfo(state, indices); + TensorInfo<float, unsigned long> indicesInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, indices); indicesInfo.collapseDims(); LARGE_INDEX(unsigned long, -1, -1, -1); @@ -218,13 +224,13 @@ void THCudaTensor_indexCopy(THCState *state, THCudaTensor *dst, int dim, THCudaT // indexAddLargeIndex kernel is a better choice to increase // parallelism. template <typename IndexType, int DstDim, int SrcDim, int IdxDim> -__global__ void indexAddSmallIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> src, - TensorInfo<IndexType> indices, - int dstAddDim, - int srcAddDim, - IndexType innerSize, - long dstAddDimSize) { +__global__ void indexAddSmallIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> indices, + int dstAddDim, + int srcAddDim, + IndexType innerSize, + long dstAddDimSize) { // In order to avoid reloading the index that we are copying, load // it once to handle all of the points that are being selected, so // it can be reused as much as possible. This kernel is chosen when @@ -233,7 +239,7 @@ __global__ void indexAddSmallIndex(TensorInfo<IndexType> dst, for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) { // Lua indices begin at 1 IndexType dstIndex = - indices.data[IndexToOffset<IndexType, IdxDim>::get(srcIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(srcIndex, indices)] - 1; if (dstIndex < dstAddDimSize) { // We stride over the output ignoring the indexed dimension @@ -242,11 +248,11 @@ __global__ void indexAddSmallIndex(TensorInfo<IndexType> dst, linearIndex < innerSize; linearIndex += gridDim.x * blockDim.x) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(linearIndex, dst); + IndexToOffset<float, IndexType, DstDim>::get(linearIndex, dst); dstOffset += dstIndex * dst.strides[dstAddDim]; IndexType srcOffset = - IndexToOffset<IndexType, SrcDim>::get(linearIndex, src); + IndexToOffset<float, IndexType, SrcDim>::get(linearIndex, src); srcOffset += srcIndex * src.strides[srcAddDim]; atomicAdd(&dst.data[dstOffset], src.data[srcOffset]); @@ -262,32 +268,32 @@ __global__ void indexAddSmallIndex(TensorInfo<IndexType> dst, // indexAddSmallIndex kernel is a better choice to reduce memory // accesses. 
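Unlike the copy kernels, the add kernels that follow must tolerate the same destination index appearing more than once in the index tensor, which is why they accumulate with atomicAdd instead of a plain store:

// indexCopy-style kernels:  dst.data[dstOffset] = src.data[srcOffset];
// indexAdd-style kernels:   atomicAdd(&dst.data[dstOffset],
//                                     src.data[srcOffset]);
// With duplicate indices the atomic version sums every contribution,
// whereas a plain store would keep only the last writer's value.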
template <typename IndexType, int DstDim, int SrcDim, int IdxDim> -__global__ void indexAddLargeIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> src, - TensorInfo<IndexType> indices, - int dstAddDim, - int srcAddDim, - IndexType innerSize, - long dstAddDimSize) { +__global__ void indexAddLargeIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> indices, + int dstAddDim, + int srcAddDim, + IndexType innerSize, + long dstAddDimSize) { // We stride over the output including the indexed dimension // (totalSize), and calculate the destination index point based on that for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; - linearIndex < innerSize * indices.sizes[0]; + linearIndex < innerSize * indices.sizes[0]; linearIndex += gridDim.x * blockDim.x) { IndexType srcIndex = linearIndex / innerSize; IndexType elementInSlice = linearIndex % innerSize; // Lua indices begin at 1 IndexType dstIndex = - indices.data[IndexToOffset<IndexType, IdxDim>::get(srcIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(srcIndex, indices)] - 1; if (dstIndex < dstAddDimSize) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(elementInSlice, dst); + IndexToOffset<float, IndexType, DstDim>::get(elementInSlice, dst); dstOffset += dstIndex * dst.strides[dstAddDim]; IndexType srcOffset = - IndexToOffset<IndexType, SrcDim>::get(elementInSlice, src); + IndexToOffset<float, IndexType, SrcDim>::get(elementInSlice, src); srcOffset += srcIndex * src.strides[srcAddDim]; atomicAdd(&dst.data[dstOffset], src.data[srcOffset]); @@ -339,16 +345,16 @@ void THCudaTensor_indexAdd(THCState *state, THCudaTensor *dst, int dim, THCudaTe int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; -#define SMALL_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ - indexAddSmallIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ - <<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \ - dstInfo, srcInfo, indicesInfo, \ +#define SMALL_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ + indexAddSmallIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ + <<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \ + dstInfo, srcInfo, indicesInfo, \ dstAddDim, srcAddDim, sliceSize, dstAddDimSize); -#define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ - indexAddLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ - <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \ - dstInfo, srcInfo, indicesInfo, \ +#define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ + indexAddLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ + <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \ + dstInfo, srcInfo, indicesInfo, \ dstAddDim, srcAddDim, sliceSize, dstAddDimSize); dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8))); @@ -357,18 +363,21 @@ void THCudaTensor_indexAdd(THCState *state, THCudaTensor *dst, int dim, THCudaTe dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, 128L), (long)(mpc * 8))); dim3 largeIndexBlock(std::min(srcTotalSize, 128L)); - if (THC_canUse32BitIndexMath(state, dst) && - THC_canUse32BitIndexMath(state, src) && - THC_canUse32BitIndexMath(state, indices)) { - TensorInfo<unsigned int> dstInfo(state, dst); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, dst) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) { + TensorInfo<float, unsigned int> dstInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, dst); int dstAddDim = dstInfo.collapseDims(dim); - 
dstInfo.sizes[dstAddDim] = 1; + dstInfo.reduceDim(dstAddDim); - TensorInfo<unsigned int> srcInfo(state, src); + TensorInfo<float, unsigned int> srcInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, src); int srcAddDim = srcInfo.collapseDims(dim); - srcInfo.sizes[srcAddDim] = 1; + srcInfo.reduceDim(srcAddDim); - TensorInfo<unsigned int> indicesInfo(state, indices); + TensorInfo<float, unsigned int> indicesInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, indices); indicesInfo.collapseDims(); // A reasonable choice for when to have each thread iterate over @@ -395,15 +404,18 @@ void THCudaTensor_indexAdd(THCState *state, THCudaTensor *dst, int dim, THCudaTe } } } else { - TensorInfo<unsigned long> dstInfo(state, dst); + TensorInfo<float, unsigned long> dstInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, dst); int dstAddDim = dstInfo.collapseDims(dim); - dstInfo.sizes[dstAddDim] = 1; + dstInfo.reduceDim(dstAddDim); - TensorInfo<unsigned long> srcInfo(state, src); + TensorInfo<float, unsigned long> srcInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, src); int srcAddDim = srcInfo.collapseDims(dim); - srcInfo.sizes[srcAddDim] = 1; + srcInfo.reduceDim(srcAddDim); - TensorInfo<unsigned long> indicesInfo(state, indices); + TensorInfo<float, unsigned long> indicesInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, indices); indicesInfo.collapseDims(); LARGE_INDEX(unsigned long, -1, -1, -1); @@ -420,12 +432,12 @@ void THCudaTensor_indexAdd(THCState *state, THCudaTensor *dst, int dim, THCudaTe // indexFillLargeIndex kernel is a better choice to increase // parallelism. template <typename IndexType, int DstDim, int IdxDim> -__global__ void indexFillSmallIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> indices, - int dstFillDim, - IndexType innerSize, - long dstFillDimSize, - float val) { +__global__ void indexFillSmallIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> indices, + int dstFillDim, + IndexType innerSize, + long dstFillDimSize, + float val) { // In order to avoid reloading the index that we are copying, load // it once to handle all of the points that are being selected, so // it can be reused as much as possible. This kernel is chosen when @@ -434,7 +446,7 @@ __global__ void indexFillSmallIndex(TensorInfo<IndexType> dst, for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) { // Lua indices begin at 1 IndexType dstIndex_ = - indices.data[IndexToOffset<IndexType, IdxDim>::get(dstIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(dstIndex, indices)] - 1; if (dstIndex < dstFillDimSize) { // We stride over the output ignoring the indexed dimension @@ -443,7 +455,7 @@ __global__ void indexFillSmallIndex(TensorInfo<IndexType> dst, linearIndex < innerSize; linearIndex += gridDim.x * blockDim.x) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(linearIndex, dst); + IndexToOffset<float, IndexType, DstDim>::get(linearIndex, dst); dstOffset += dstIndex_ * dst.strides[dstFillDim]; dst.data[dstOffset] = val; @@ -459,27 +471,27 @@ __global__ void indexFillSmallIndex(TensorInfo<IndexType> dst, // indexFillSmallIndex kernel is a better choice to reduce memory // accesses. 
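As the `- 1` adjustments above show, the index tensors arrive with Lua's 1-based convention and are shifted to 0-based offsets on the device, and the `< dstFillDimSize` guard silently skips anything out of range. For example (values made up):

// indices = {3, 1}  -- as passed from Lua (1-based)
// the kernel computes 3 - 1 = 2 and 1 - 1 = 0, so slices 2 and 0 of
// `dst` along the fill dimension are written; an index larger than the
// dimension is simply skipped.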
template <typename IndexType, int DstDim, int IdxDim> -__global__ void indexFillLargeIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> indices, - int dstFillDim, - IndexType innerSize, - long dstFillDimSize, - float val) { +__global__ void indexFillLargeIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> indices, + int dstFillDim, + IndexType innerSize, + long dstFillDimSize, + float val) { // We stride over the output including the indexed dimension // (totalSize), and calculate the destination index point based on that for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; - linearIndex < innerSize * indices.sizes[0]; + linearIndex < innerSize * indices.sizes[0]; linearIndex += gridDim.x * blockDim.x) { IndexType dstIndex = linearIndex / innerSize; IndexType elementInSlice = linearIndex % innerSize; // Lua indices begin at 1 IndexType dstIndex_ = - indices.data[IndexToOffset<IndexType, IdxDim>::get(dstIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(dstIndex, indices)] - 1; if (dstIndex_ < dstFillDimSize) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(elementInSlice, dst); + IndexToOffset<float, IndexType, DstDim>::get(elementInSlice, dst); dstOffset += dstIndex_ * dst.strides[dstFillDim]; dst.data[dstOffset] = val; @@ -528,16 +540,16 @@ void THCudaTensor_indexFill(THCState *state, THCudaTensor *dst, int dim, THCudaT int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; -#define SMALL_INDEX(TYPE, DST_DIM, IDX_DIM) \ - indexFillSmallIndex<TYPE, DST_DIM, IDX_DIM> \ - <<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \ - dstInfo, indicesInfo, \ +#define SMALL_INDEX(TYPE, DST_DIM, IDX_DIM) \ + indexFillSmallIndex<TYPE, DST_DIM, IDX_DIM> \ + <<<smallIndexGrid, smallIndexBlock, 0, stream>>>( \ + dstInfo, indicesInfo, \ dstFillDim, sliceSize, dstFillDimSize, val); -#define LARGE_INDEX(TYPE, DST_DIM, IDX_DIM) \ - indexFillLargeIndex<TYPE, DST_DIM, IDX_DIM> \ - <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \ - dstInfo, indicesInfo, \ +#define LARGE_INDEX(TYPE, DST_DIM, IDX_DIM) \ + indexFillLargeIndex<TYPE, DST_DIM, IDX_DIM> \ + <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \ + dstInfo, indicesInfo, \ dstFillDim, sliceSize, dstFillDimSize, val); dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8))); @@ -546,13 +558,15 @@ void THCudaTensor_indexFill(THCState *state, THCudaTensor *dst, int dim, THCudaT dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, 128L), (long)(mpc * 8))); dim3 largeIndexBlock(std::min(dstTotalSize, 128L)); - if (THC_canUse32BitIndexMath(state, dst) && - THC_canUse32BitIndexMath(state, indices)) { - TensorInfo<unsigned int> dstInfo(state, dst); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, dst) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) { + TensorInfo<float, unsigned int> dstInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, dst); int dstFillDim = dstInfo.collapseDims(dim); - dstInfo.sizes[dstFillDim] = 1; + dstInfo.reduceDim(dstFillDim); - TensorInfo<unsigned int> indicesInfo(state, indices); + TensorInfo<float, unsigned int> indicesInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, indices); indicesInfo.collapseDims(); // A reasonable choice for when to have each thread iterate over @@ -579,11 +593,13 @@ void THCudaTensor_indexFill(THCState *state, THCudaTensor *dst, int dim, THCudaT } } } else { - TensorInfo<unsigned long> dstInfo(state, dst); + TensorInfo<float, unsigned long> 
dstInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, dst); int dstFillDim = dstInfo.collapseDims(dim); - dstInfo.sizes[dstFillDim] = 1; + dstInfo.reduceDim(dstFillDim); - TensorInfo<unsigned long> indicesInfo(state, indices); + TensorInfo<float, unsigned long> indicesInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, indices); indicesInfo.collapseDims(); LARGE_INDEX(unsigned long, -1, -1); @@ -600,9 +616,9 @@ void THCudaTensor_indexFill(THCState *state, THCudaTensor *dst, int dim, THCudaT // indexSelectLargeIndex kernel is a better choice to increase // parallelism. template <typename IndexType, int DstDim, int SrcDim, int IdxDim> -__global__ void indexSelectSmallIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> src, - TensorInfo<IndexType> indices, +__global__ void indexSelectSmallIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> indices, int dstSelectDim, int srcSelectDim, IndexType innerSize, @@ -615,7 +631,7 @@ __global__ void indexSelectSmallIndex(TensorInfo<IndexType> dst, for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) { // Lua indices begin at 1 IndexType srcIndex = - indices.data[IndexToOffset<IndexType, IdxDim>::get(dstIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(dstIndex, indices)] - 1; if (srcIndex < srcSelectDimSize) { // We stride over the output ignoring the indexed dimension @@ -624,11 +640,11 @@ __global__ void indexSelectSmallIndex(TensorInfo<IndexType> dst, linearIndex < innerSize; linearIndex += gridDim.x * blockDim.x) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(linearIndex, dst); + IndexToOffset<float, IndexType, DstDim>::get(linearIndex, dst); dstOffset += dstIndex * dst.strides[dstSelectDim]; IndexType srcOffset = - IndexToOffset<IndexType, SrcDim>::get(linearIndex, src); + IndexToOffset<float, IndexType, SrcDim>::get(linearIndex, src); srcOffset += srcIndex * src.strides[srcSelectDim]; dst.data[dstOffset] = src.data[srcOffset]; @@ -644,9 +660,9 @@ __global__ void indexSelectSmallIndex(TensorInfo<IndexType> dst, // indexSelectSmallIndex kernel is a better choice to reduce memory // accesses. 
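All of these kernels lean on IndexToOffset, which walks sizes and strides from the innermost dimension outward. A worked example with made-up values shows the arithmetic for the dynamic-dimension (-1) specialization:

// TensorInfo: dims = 2, sizes = {3, 4}, strides = {8, 2}
// (a non-contiguous 3x4 view)
// IndexToOffset<float, IndexType, -1>::get(7, info):
//   i = 1: 7 % 4 = 3  ->  offset += 3 * 2 = 6;  linearId = 7 / 4 = 1
//   i = 0: 1 % 3 = 1  ->  offset += 1 * 8 = 8
//   result: offset = 14   (row 1, column 3 of the view)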
template <typename IndexType, int DstDim, int SrcDim, int IdxDim> -__global__ void indexSelectLargeIndex(TensorInfo<IndexType> dst, - TensorInfo<IndexType> src, - TensorInfo<IndexType> indices, +__global__ void indexSelectLargeIndex(TensorInfo<float, IndexType> dst, + TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> indices, int dstSelectDim, int srcSelectDim, IndexType totalSize, @@ -662,15 +678,15 @@ __global__ void indexSelectLargeIndex(TensorInfo<IndexType> dst, // Lua indices begin at 1 IndexType srcIndex = - indices.data[IndexToOffset<IndexType, IdxDim>::get(dstIndex, indices)] - 1; + indices.data[IndexToOffset<float, IndexType, IdxDim>::get(dstIndex, indices)] - 1; if (srcIndex < srcSelectDimSize) { IndexType dstOffset = - IndexToOffset<IndexType, DstDim>::get(elementInSlice, dst); + IndexToOffset<float, IndexType, DstDim>::get(elementInSlice, dst); dstOffset += dstIndex * dst.strides[dstSelectDim]; IndexType srcOffset = - IndexToOffset<IndexType, SrcDim>::get(elementInSlice, src); + IndexToOffset<float, IndexType, SrcDim>::get(elementInSlice, src); srcOffset += srcIndex * src.strides[srcSelectDim]; dst.data[dstOffset] = src.data[srcOffset]; @@ -734,10 +750,10 @@ void THCudaTensor_indexSelect(THCState *state, THCudaTensor *dst, THCudaTensor * dstInfo, srcInfo, indicesInfo, \ dstSelectDim, srcSelectDim, sliceSize, srcSelectDimSize); -#define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ - indexSelectLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ - <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \ - dstInfo, srcInfo, indicesInfo, \ +#define LARGE_INDEX(TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ + indexSelectLargeIndex<TYPE, DST_DIM, SRC_DIM, IDX_DIM> \ + <<<largeIndexGrid, largeIndexBlock, 0, stream>>>( \ + dstInfo, srcInfo, indicesInfo, \ dstSelectDim, srcSelectDim, dstTotalSize, sliceSize, srcSelectDimSize); dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8))); @@ -746,18 +762,21 @@ void THCudaTensor_indexSelect(THCState *state, THCudaTensor *dst, THCudaTensor * dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, 128L), (long)(mpc * 8))); dim3 largeIndexBlock(std::min(dstTotalSize, 128L)); - if (THC_canUse32BitIndexMath(state, dst) && - THC_canUse32BitIndexMath(state, src) && - THC_canUse32BitIndexMath(state, indices)) { - TensorInfo<unsigned int> dstInfo(state, dst); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, dst) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) { + TensorInfo<float, unsigned int> dstInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, dst); int dstSelectDim = dstInfo.collapseDims(dim); - dstInfo.sizes[dstSelectDim] = 1; + dstInfo.reduceDim(dstSelectDim); - TensorInfo<unsigned int> srcInfo(state, src); + TensorInfo<float, unsigned int> srcInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, src); int srcSelectDim = srcInfo.collapseDims(dim); - srcInfo.sizes[srcSelectDim] = 1; + srcInfo.reduceDim(srcSelectDim); - TensorInfo<unsigned int> indicesInfo(state, indices); + TensorInfo<float, unsigned int> indicesInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, indices); indicesInfo.collapseDims(); // A reasonable choice for when to have each thread iterate over @@ -784,15 +803,18 @@ void THCudaTensor_indexSelect(THCState *state, THCudaTensor *dst, THCudaTensor * } } } else { - TensorInfo<unsigned long> dstInfo(state, dst); + TensorInfo<float, unsigned long> dstInfo = + getTensorInfo<THCudaTensor, unsigned 
long>(state, dst); int dstSelectDim = dstInfo.collapseDims(dim); - dstInfo.sizes[dstSelectDim] = 1; + dstInfo.reduceDim(dstSelectDim); - TensorInfo<unsigned long> srcInfo(state, src); + TensorInfo<float, unsigned long> srcInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, src); int srcSelectDim = srcInfo.collapseDims(dim); - srcInfo.sizes[srcSelectDim] = 1; + srcInfo.reduceDim(srcSelectDim); - TensorInfo<unsigned long> indicesInfo(state, indices); + TensorInfo<float, unsigned long> indicesInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, indices); indicesInfo.collapseDims(); LARGE_INDEX(unsigned long, -1, -1, -1); diff --git a/lib/THC/THCTensorInfo.cuh b/lib/THC/THCTensorInfo.cuh new file mode 100644 index 0000000..5347116 --- /dev/null +++ b/lib/THC/THCTensorInfo.cuh @@ -0,0 +1,279 @@ +#ifndef THC_TENSOR_INFO_INC +#define THC_TENSOR_INFO_INC + +#include <cuda.h> +#include <assert.h> +#include "THCGeneral.h" +#include "THCTensor.h" + +// Maximum number of dimensions allowed for cutorch +#define MAX_CUTORCH_DIMS 25 + +// Warning string for tensor arguments that are too large or have too +// many dimensions +#define CUTORCH_STR(X) #X +#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \ + CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions" + +// CUDA kernel argument that defines tensor layout +template <typename T, typename IndexType> +struct TensorInfo { + TensorInfo(T* p, + int dim, + IndexType sz[MAX_CUTORCH_DIMS], + IndexType st[MAX_CUTORCH_DIMS]); + + // Set the size of the given dimension to 1, as if it were a + // reduction dim (allows you to calculate offsets of the reduction + // slice) + void reduceDim(int dim); + + // Collapses all runs of successive dimensions if the size/strides + // match up within the run and there are no holes between the + // dimensions. + // If excludeDim is set (not -1), then excludeDim will not be + // collapsed with any other dimension. + // Function returns the new dimension index that excludeDim maps to, + // since the collapsed dimensions are <= the input dimensions. + int collapseDims(int excludeDim = -1); + + // Contiguous tensors of more than one dimension are collapsed down + // to one tensor + __host__ __device__ inline bool isContiguous() const { + return (dims == 1 && strides[0] == 1); + } + + T* data; + IndexType sizes[MAX_CUTORCH_DIMS]; + IndexType strides[MAX_CUTORCH_DIMS]; + int dims; +}; + +template <typename T, typename IndexType> +TensorInfo<T, IndexType>::TensorInfo(T* p, + int dim, + IndexType sz[MAX_CUTORCH_DIMS], + IndexType st[MAX_CUTORCH_DIMS]) { + data = p; + dims = dim; + assert(dims > 0 && dims < MAX_CUTORCH_DIMS); + + for (int i = 0; i < dim; ++i) { + sizes[i] = sz[i]; + strides[i] = st[i]; + } +} + +template <typename T, typename IndexType> +void +TensorInfo<T, IndexType>::reduceDim(int dim) { + assert(dim < dims && dim >= 0); + sizes[dim] = 1; +} + +template <typename T, typename IndexType> +int +TensorInfo<T, IndexType>::collapseDims(int excludeDim) { + // Find the innermost dimension not of size 1, since dimensions of size 1 are + // collapsible. + int firstNonOneDim = -1; + + for (int i = dims - 1; i >= 0; --i) { + if (i == excludeDim) { + // We cannot collapse this dimension, even if it is size 1 + firstNonOneDim = i; + break; + } + + if (sizes[i] != 1) { + firstNonOneDim = i; + break; + } + } + + // Special case: if all dimensions are of size 1, then this is a + // single-point tensor that we still have to operate on. Reduce to a + // single point. 
+ if (firstNonOneDim == -1) { + assert(excludeDim == -1); + + dims = 1; + sizes[0] = 1; + strides[0] = 1; + + // Everything effectively got collapsed into this dimension + return 0; + } + + // Count the number of successive dimensions that can be collapsed, from + // innermost to outermost. + int numCollapsed = 0; + + // Skip the leading size 1 dims + numCollapsed += dims - 1 - firstNonOneDim; + + // We perform one pass through to determine how many dimensions we + // can collapse, before calculating the actual size of the collapsed + // dimensions. + // size/strideInner are the size/strides of the previous inner + // non-collapsible dim we encounter. + long sizeInner = sizes[firstNonOneDim]; + long strideInner = strides[firstNonOneDim]; + + for (int i = firstNonOneDim - 1; i >= 0; --i) { + long sizeOuter = sizes[i]; + long strideOuter = strides[i]; + + // Don't collapse this dimension if we want to exclude it from + // collapsing. + // Since this code is attempting to collapse a subsequent + // dimension (i) with the preceding dimension (i + 1), we can only + // perform collapsing if the preceding dimension can be collapsed + // (i.e., not excludeDim) + if ((excludeDim != i) && (excludeDim != i + 1)) { + // The next outermost dimension can be skipped if size 1 + if (sizeOuter == 1) { + ++numCollapsed; + continue; + } + + // If the next outermost dimension is contiguous with the + // previous non-collapsed one, collapse it + if (strideOuter == strideInner * sizeInner) { + ++numCollapsed; + + // This is the run of collapsed dimensions' size + sizeInner = sizeInner * sizeOuter; + continue; + } + } + + // Otherwise, this new outer dimension at `i` cannot be collapsed + // because it is excluded from collapsing, or it is not contiguous + // with the previous inner dimension. + sizeInner = sizeOuter; + strideInner = strideOuter; + } + + // This will be our new size/stride and dimension. + IndexType newSizes[MAX_CUTORCH_DIMS]; + IndexType newStrides[MAX_CUTORCH_DIMS]; + + assert(numCollapsed < dims); + int newDims = dims - numCollapsed; + + // We return the index of the excluded dimension that is excluded + // from being collapsed here. + int returnDim = -1; + + // We perform a second pass through the dimensions to actually + // calculate the size of the collapsed dimensions. + int collapsedIndex = dims - numCollapsed - 1; + newSizes[collapsedIndex] = sizes[firstNonOneDim]; + newStrides[collapsedIndex] = strides[firstNonOneDim]; + + if (firstNonOneDim == excludeDim) { + returnDim = collapsedIndex; + } + + for (int i = firstNonOneDim - 1; i >= 0; --i) { + IndexType sizeOuter = sizes[i]; + IndexType strideOuter = strides[i]; + + if ((excludeDim != i) && (excludeDim != i + 1)) { + if (sizeOuter == 1) { + // skip + continue; + } + + if (strideOuter == newSizes[collapsedIndex] * newStrides[collapsedIndex]) { + // collapse + newSizes[collapsedIndex] *= sizeOuter; + continue; + } + } + + // Otherwise, strides don't match, or dim `i` is excluded from + // collapsing. 
+ --collapsedIndex; + assert(collapsedIndex >= 0); + assert(collapsedIndex < newDims); + newSizes[collapsedIndex] = sizeOuter; + newStrides[collapsedIndex] = strideOuter; + + if (excludeDim == i) { + returnDim = collapsedIndex; + } + } + + // We must have filled all the dimensions we're looking for + assert(collapsedIndex == 0); + assert((excludeDim == -1) || (returnDim != -1)); + + dims = newDims; + + for (int i = 0; i < dims; ++i) { + sizes[i] = newSizes[i]; + strides[i] = newStrides[i]; + } + + // After collapsing, the original `excludeDim` may have been + // renumbered to this new `returnDim`, since some dimensions could + // have been collapsed. + return returnDim; +} + +// Translate a linear index for the apply to a T* offset; +// specialized on `Dims` to reduce nvcc compilation time +template <typename T, typename IndexType, int Dims> +struct IndexToOffset { + static __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo<T, IndexType>& info) { + IndexType offset = 0; + + // Use static dims + for (int i = Dims - 1; i >= 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + + if (i > 0) { + linearId /= info.sizes[i]; + } + } + + return offset; + } +}; + +template <typename T, typename IndexType> +struct IndexToOffset<T, IndexType, -2> { + static inline __host__ __device__ IndexType + get(IndexType linearId, const TensorInfo<T, IndexType>& info) { + return linearId; + } +}; + +template <typename T, typename IndexType> +struct IndexToOffset<T, IndexType, -1> { + static inline __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo<T, IndexType>& info) { + + IndexType offset = 0; + + // Use dynamic dims + for (int i = info.dims - 1; i >= 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + + linearId /= info.sizes[i]; + } + + return offset; + } +}; + +#endif // THC_TENSOR_INFO_INC diff --git a/lib/THC/THCTensorMasked.cu b/lib/THC/THCTensorMasked.cu index ba69425..6ee5bad 100644 --- a/lib/THC/THCTensorMasked.cu +++ b/lib/THC/THCTensorMasked.cu @@ -33,7 +33,7 @@ void THCudaTensor_maskedFill(THCState* state, THCudaTensor_nElement(state, mask), 2, "sizes do not match"); - if (!THCudaTensor_pointwiseApply2(state, tensor, mask, TensorMaskedFillOp(value))) { + if (!THC_pointwiseApply2(state, tensor, mask, TensorMaskedFillOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } @@ -106,7 +106,7 @@ void THCudaTensor_maskedCopy(THCState* state, // update `tensor` where `mask` == 1 but pull from `src` at // maskPrefixSum - bool status = THCudaTensor_pointwiseApply3( + bool status = THC_pointwiseApply3( state, tensor, contigMask, maskPrefixSum, TensorMaskedCopyOp(THCudaTensor_data(state, contigSrc))); @@ -169,7 +169,7 @@ void THCudaTensor_maskedSelect(THCState* state, maskPrefixSumData); // Then copy over the masked elements at their desired output index - bool status = THCudaTensor_pointwiseApply3( + bool status = THC_pointwiseApply3( state, contigMask, maskPrefixSum, src, TensorMaskedSelectOp(THCudaTensor_data(state, tensor))); diff --git a/lib/THC/THCTensorMath.cu b/lib/THC/THCTensorMath.cu index 0690edf..5e140ef 100644 --- a/lib/THC/THCTensorMath.cu +++ b/lib/THC/THCTensorMath.cu @@ -9,66 +9,6 @@ #include <thrust/functional.h> #include <cfloat> -struct TensorFillOp { - TensorFillOp(float v) : val(v) {} - __device__ __forceinline__ void operator()(float* v) { *v = val; 
} - - const float val; -}; - -void THCudaTensor_fill(THCState* state, THCudaTensor *self_, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 1, self_)); - if (!THCudaTensor_pointwiseApply1(state, self_, TensorFillOp(value))) { - THArgCheck(false, 1, CUTORCH_DIM_WARNING); - } - - THCudaCheck(cudaGetLastError()); -} - -void THCudaTensor_zero(THCState *state, THCudaTensor *self_) -{ - THAssert(THCudaTensor_checkGPU(state, 1, self_)); - if (THCudaTensor_isContiguous(state, self_)) { - THCudaCheck(cudaMemsetAsync(THCudaTensor_data(state, self_), - 0, - sizeof(float) * THCudaTensor_nElement(state, self_), - THCState_getCurrentStream(state))); - } else { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorFillOp(0))) { - THArgCheck(false, 1, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -} - -void THCudaTensor_zeros(THCState *state, THCudaTensor *r_, THLongStorage *size) -{ - THAssert(THCudaTensor_checkGPU(state, 1, r_)); - THCudaTensor_resize(state, r_, size, NULL); - THCudaTensor_zero(state, r_); -} - -void THCudaTensor_ones(THCState *state, THCudaTensor *r_, THLongStorage *size) -{ - THAssert(THCudaTensor_checkGPU(state, 1, r_)); - THCudaTensor_resize(state, r_, size, NULL); - THCudaTensor_fill(state, r_, 1); -} - -void THCudaTensor_reshape(THCState *state, THCudaTensor *r_, THCudaTensor *t, THLongStorage *size) -{ - THAssert(THCudaTensor_checkGPU(state, 2, r_, t)); - THCudaTensor_resize(state, r_, size, NULL); - THCudaTensor_copy(state, r_, t); -} - -long THCudaTensor_numel(THCState *state, THCudaTensor *t) -{ - return THCudaTensor_nElement(state, t); -} - void THCudaTensor_cat(THCState *state, THCudaTensor *result, THCudaTensor *ta, THCudaTensor *tb, int dimension) { THCudaTensor* inputs[2]; @@ -138,74 +78,6 @@ void THCudaTensor_catArray(THCState *state, THCudaTensor *result, THCudaTensor * } } -struct TensorCPowOp { - __device__ __forceinline__ void operator()(float* out, float* in) { - *out = powf(*out, *in); - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = powf(*in1, *in2); - } -}; - -void THCudaTensor_cpow(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THArgCheck(THCudaTensor_nElement(state, src1) == - THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - // self = pow(self, src2) - if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorCPowOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCudaTensor_resizeAs(state, self_, src1); - - // self = pow(src1, src2) - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorCPowOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -} - -struct TensorDivOp { - __device__ __forceinline__ void - operator()(float* out, float* in) { - *out /= *in; - } - - __device__ __forceinline__ void - operator()(float* out, float* in1, float* in2) { - *out = *in1 / *in2; - } -}; - -void THCudaTensor_cdiv(THCState* state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THArgCheck(THCudaTensor_nElement(state, src1) == - THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - // self *= src2 - if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorDivOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCudaTensor_resizeAs(state, self_, 
src1); - - // self = src1 * src2 - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorDivOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -} - struct TensorAddCMulOp { TensorAddCMulOp(float v) : val(v) {} @@ -234,7 +106,7 @@ void THCudaTensor_addcmul(THCState *state, THCudaTensor *self_, THCudaTensor *t, THArgCheck(THCudaTensor_nElement(state, src1) == THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorAddCMulOp(value))) { + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddCMulOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } @@ -268,7 +140,7 @@ void THCudaTensor_addcdiv(THCState *state, THCudaTensor *self_, THCudaTensor *t, THArgCheck(THCudaTensor_nElement(state, src1) == THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorAddCDivOp(value))) { + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddCDivOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } @@ -400,3 +272,14 @@ int THCudaTensor_logicalany(THCState *state, THCudaTensor *self) { return (int) result; } + +template <typename T> +struct TensorFillOp { + TensorFillOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* v) { *v = val; } + + const T val; +}; + +#include "generic/THCTensorMath.cu" +#include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h index e31cb2a..6593da2 100644 --- a/lib/THC/THCTensorMath.h +++ b/lib/THC/THCTensorMath.h @@ -4,28 +4,19 @@ #include "THCTensor.h" #include "THCGeneral.h" -THC_API void THCudaTensor_fill(THCState *state, THCudaTensor *self, float value); -THC_API void THCudaTensor_zero(THCState *state, THCudaTensor *self); +#include "generic/THCTensorMath.h" +#include "THCGenerateAllTypes.h" -THC_API void THCudaTensor_zeros(THCState *state, THCudaTensor *r_, THLongStorage *size); -THC_API void THCudaTensor_ones(THCState *state, THCudaTensor *r_, THLongStorage *size); -THC_API void THCudaTensor_reshape(THCState *state, THCudaTensor *r_, THCudaTensor *t, THLongStorage *size); -THC_API long THCudaTensor_numel(THCState *state, THCudaTensor *t); +#include "generic/THCTensorMathPairwise.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathPointwise.h" +#include "THCGenerateAllTypes.h" -THC_API void THCudaTensor_add(THCState *state, THCudaTensor *self, THCudaTensor *src, float value); -THC_API void THCudaTensor_sub(THCState *state, THCudaTensor *self, THCudaTensor *src, float value); -THC_API void THCudaTensor_mul(THCState *state, THCudaTensor *self, THCudaTensor *src, float value); -THC_API void THCudaTensor_div(THCState *state, THCudaTensor *self, THCudaTensor *src, float value); THC_API void THCudaTensor_tril(THCState *state, THCudaTensor *self, THCudaTensor *src, long k); THC_API void THCudaTensor_triu(THCState *state, THCudaTensor *self, THCudaTensor *src, long k); -THC_API void THCudaTensor_cadd(THCState *state, THCudaTensor *self, THCudaTensor *src1, float value, THCudaTensor *src2); -THC_API void THCudaTensor_csub(THCState *state, THCudaTensor *self, THCudaTensor *src1, float value, THCudaTensor *src2); -THC_API void THCudaTensor_cmul(THCState *state, THCudaTensor *self, THCudaTensor *src1, THCudaTensor *src2); -THC_API void THCudaTensor_cpow(THCState *state, THCudaTensor *self, THCudaTensor *src1, THCudaTensor *src2); -THC_API void THCudaTensor_cdiv(THCState *state, THCudaTensor 
*self, THCudaTensor *src1, THCudaTensor *src2); - THC_API void THCudaTensor_addcmul(THCState *state, THCudaTensor *self, THCudaTensor* t, float value, THCudaTensor *src1, THCudaTensor *src2); THC_API void THCudaTensor_addcdiv(THCState *state, THCudaTensor *self, THCudaTensor* t, float value, THCudaTensor *src1, THCudaTensor *src2); diff --git a/lib/THC/THCTensorMath2.cu b/lib/THC/THCTensorMath2.cu index 2cc2adb..0d6bb52 100644 --- a/lib/THC/THCTensorMath2.cu +++ b/lib/THC/THCTensorMath2.cu @@ -31,13 +31,13 @@ void THCudaTensor_pow(THCState *state, THCudaTensor *self_, THCudaTensor *src, f { THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); if (self_ == src) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorPowOp(value))) { + if (!THC_pointwiseApply1(state, self_, TensorPowOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorPowOp(value))) { + if (!THC_pointwiseApply2(state, self_, src, TensorPowOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -63,13 +63,13 @@ void THCudaTensor_tpow(THCState *state, THCudaTensor *self_, float value, THCuda { THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); if (self_ == src) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorTPowOp(value))) { + if (!THC_pointwiseApply1(state, self_, TensorTPowOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorTPowOp(value))) { + if (!THC_pointwiseApply2(state, self_, src, TensorTPowOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -90,7 +90,7 @@ void THCudaTensor_atan2(THCState *state, THCudaTensor *self_, THCudaTensor *tx, THCudaTensor_nElement(state, ty), 3, "sizes do not match"); THCudaTensor_resizeAs(state, self_, tx); - if (!THCudaTensor_pointwiseApply3(state, self_, tx, ty, TensorATan2Op())) { + if (!THC_pointwiseApply3(state, self_, tx, ty, TensorATan2Op())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } @@ -116,13 +116,13 @@ void THCudaTensor_clamp(THCState *state, THCudaTensor *self_, THCudaTensor *src, { THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); if (self_ == src) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorClampOp(min_value, max_value))) { + if (!THC_pointwiseApply1(state, self_, TensorClampOp(min_value, max_value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorClampOp(min_value, max_value))) { + if (!THC_pointwiseApply2(state, self_, src, TensorClampOp(min_value, max_value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -146,13 +146,13 @@ void THCudaTensor_sign(THCState *state, THCudaTensor *self_, THCudaTensor *src) { THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); if (self_ == src) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorSignOp())) { + if (!THC_pointwiseApply1(state, self_, TensorSignOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorSignOp())) { + if (!THC_pointwiseApply2(state, self_, src, TensorSignOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -192,7 +192,7 @@ void THCudaTensor_lerp(THCState *state, THCudaTensor *result, THCudaTensor *a, T THCudaTensor_nElement(state, b), 3, "sizes do not match"); THCudaTensor_resizeAs(state, 
result, a); - if (!THCudaTensor_pointwiseApply3(state, result, a, b, TensorLerpOp(w))) { + if (!THC_pointwiseApply3(state, result, a, b, TensorLerpOp(w))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } @@ -699,7 +699,7 @@ THC_API void THCudaTensor_cross(THCState *state, THCudaTensor *self, THCudaTenso THCudaTensor *nx = THCudaTensor_newNarrow(state, x, dimension, 0, 1); THCudaTensor *ny = THCudaTensor_newNarrow(state, y, dimension, 0, 1); THCudaTensor *nself = THCudaTensor_newNarrow(state, self, dimension, 0, 1); - if (!THCudaTensor_pointwiseApply3(state, nself, nx, ny, TensorCrossOp(sx, sy, so))) { + if (!THC_pointwiseApply3(state, nself, nx, ny, TensorCrossOp(sx, sy, so))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } THCudaTensor_free(state, nx); diff --git a/lib/THC/THCTensorMathCompare.cu b/lib/THC/THCTensorMathCompare.cu index 0f7d979..71cfdd5 100644 --- a/lib/THC/THCTensorMathCompare.cu +++ b/lib/THC/THCTensorMathCompare.cu @@ -11,7 +11,7 @@ void THCudaTensor_logicalValue(THCState *state, THCudaTensor *self_, THCudaTenso { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, op)) { + if (!THC_pointwiseApply2(state, self_, src, op)) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } diff --git a/lib/THC/THCTensorMathCompareT.cu b/lib/THC/THCTensorMathCompareT.cu index b88f153..0567a89 100644 --- a/lib/THC/THCTensorMathCompareT.cu +++ b/lib/THC/THCTensorMathCompareT.cu @@ -11,7 +11,7 @@ void THCudaTensor_logicalTensor(THCState *state, THCudaTensor *self_, THCudaTens THCudaTensor_resizeAs(state, self_, src1); THArgCheck(THCudaTensor_nElement(state, src1) == THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, op)) { + if (!THC_pointwiseApply3(state, self_, src1, src2, op)) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu index 383344b..2c081d1 100644 --- a/lib/THC/THCTensorMathPairwise.cu +++ b/lib/THC/THCTensorMathPairwise.cu @@ -1,96 +1,98 @@ #include "THCTensorMath.h" #include "THCGeneral.h" #include "THCBlas.h" +#include "THCHalf.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCReduce.cuh" +template <typename T> struct TensorAddConstantOp { - TensorAddConstantOp(float v) : val(v) {} - __device__ __forceinline__ void operator()(float* out, float* in) { + TensorAddConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { *out = *in + val; } - __device__ __forceinline__ void operator()(float* v) { + __device__ __forceinline__ void operator()(T* v) { *v += val; } - const float val; + const T val; }; -void THCudaTensor_add(THCState *state, THCudaTensor *self_, THCudaTensor *src_, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self_, src_)); - if (self_ == src_) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorAddConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCudaTensor_resizeAs(state, self_, src_); - - if (!THCudaTensor_pointwiseApply2(state, self_, src_, TensorAddConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorAddConstantOp<half> { + TensorAddConstantOp(half v) : val(v) {} + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*in, val); +#else + float fin = __half2float(*in); + float fval = __half2float(val); + float fout = fin + fval; + *out = 
__float2half(fout); +#endif } - THCudaCheck(cudaGetLastError()); -} + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + *v = __hadd(*v, val); +#else + float fv = __half2float(*v); + float fval = __half2float(val); + fv += fval; + *v = __float2half(fv); +#endif + } -void THCudaTensor_sub(THCState *state, THCudaTensor *self_, THCudaTensor *src_, float value) -{ - THCudaTensor_add(state, self_, src_, -value); -} + const half val; +}; +#endif // CUDA_HALF_TENSOR +template <typename T> struct TensorMulConstantOp { - TensorMulConstantOp(float v) : val(v) {} - __device__ __forceinline__ void operator()(float* out, float* in) { + TensorMulConstantOp(T v) : val(v) {} + __device__ __forceinline__ void operator()(T* out, T* in) { *out = *in * val; } - __device__ __forceinline__ void operator()(float* v) { + __device__ __forceinline__ void operator()(T* v) { *v *= val; } - const float val; + const T val; }; -void THCudaTensor_mul(THCState *state, THCudaTensor *self_, THCudaTensor *src_, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self_, src_)); - if (self_ == src_) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorMulConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCudaTensor_resizeAs(state, self_, src_); - - if (!THCudaTensor_pointwiseApply2(state, self_, src_, TensorMulConstantOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorMulConstantOp<half> { + TensorMulConstantOp(half v) : val(v) {} + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hmul(*in, val); +#else + float fin = __half2float(*in); + float fval = __half2float(val); + float fout = fin * fval; + *out = __float2half(fout); +#endif } - THCudaCheck(cudaGetLastError()); -} - -void THCudaTensor_div(THCState* state, THCudaTensor *self_, THCudaTensor *src_, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self_, src_)); - THArgCheck(value != 0.0f, 3, "divide by zero"); - - if (self_ == src_) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorMulConstantOp(1.0f / value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCudaTensor_resizeAs(state, self_, src_); - - if (!THCudaTensor_pointwiseApply2(state, self_, src_, TensorMulConstantOp(1.0f / value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } + __device__ __forceinline__ void operator()(half* v) { +#ifdef CUDA_HALF_INSTRUCTIONS + *v = __hmul(*v, val); +#else + float fv = __half2float(*v); + float fval = __half2float(val); + fv *= fval; + *v = __float2half(fv); +#endif } - THCudaCheck(cudaGetLastError()); -} + const half val; +}; +#endif // CUDA_HALF_TENSOR template <int Upper> struct TensorTriOp { @@ -143,13 +145,13 @@ void THCudaTensor_tril(THCState *state, THCudaTensor *self_, THCudaTensor *src_, TensorTriOp<0> op(start, stride0, stride1, k); if (self_ == src_) { - if (!THCudaTensor_pointwiseApply1(state, src, op)) { + if (!THC_pointwiseApply1(state, src, op)) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, op)) { + if (!THC_pointwiseApply2(state, self_, src, op)) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -176,13 +178,13 @@ void THCudaTensor_triu(THCState *state, THCudaTensor *self_, THCudaTensor *src_, TensorTriOp<1> op(start, stride0, stride1, k); if (self_ == src_) { - if (!THCudaTensor_pointwiseApply1(state, src, op)) 
{ + if (!THC_pointwiseApply1(state, src, op)) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, op)) { + if (!THC_pointwiseApply2(state, self_, src, op)) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -192,3 +194,6 @@ void THCudaTensor_triu(THCState *state, THCudaTensor *self_, THCudaTensor *src_, THCudaCheck(cudaGetLastError()); } + +#include "generic/THCTensorMathPairwise.cu" +#include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorMathPointwise.cu b/lib/THC/THCTensorMathPointwise.cu index 80b1ead..72b16e8 100644 --- a/lib/THC/THCTensorMathPointwise.cu +++ b/lib/THC/THCTensorMathPointwise.cu @@ -1,6 +1,7 @@ #include "THCTensorMath.h" #include "THCGeneral.h" #include "THCBlas.h" +#include "THCHalf.h" #include "THCTensorCopy.h" #include "THCApply.cuh" #include "THCReduce.cuh" @@ -19,13 +20,13 @@ void THCudaTensor_##NAME(THCState* state, THCudaTensor* self_, THCudaTensor* src) { \ THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); \ if (self_ == src) { \ - if (!THCudaTensor_pointwiseApply1(state, self_, Tensor##NAME##Op())) { \ + if (!THC_pointwiseApply1(state, self_, Tensor##NAME##Op())) { \ THArgCheck(false, 2, CUTORCH_DIM_WARNING); \ } \ } else { \ THCudaTensor_resizeAs(state, self_, src); \ \ - if (!THCudaTensor_pointwiseApply2(state, self_, src, Tensor##NAME##Op())) { \ + if (!THC_pointwiseApply2(state, self_, src, Tensor##NAME##Op())) { \ THArgCheck(false, 2, CUTORCH_DIM_WARNING); \ } \ } \ @@ -76,171 +77,13 @@ struct TensorSigmoidOp { void THCudaTensor_sigmoid(THCState* state, THCudaTensor* self_, THCudaTensor* src) { THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); if (self_ == src) { - if (!THCudaTensor_pointwiseApply1(state, self_, TensorSigmoidOp())) { + if (!THC_pointwiseApply1(state, self_, TensorSigmoidOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self_, src); - if (!THCudaTensor_pointwiseApply2(state, self_, src, TensorSigmoidOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - - THCudaCheck(cudaGetLastError()); -} - -struct TensorAddOp { - __device__ __forceinline__ void operator()(float* out, float* in) { - *out += *in; - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = *in1 + *in2; - } -}; - -struct TensorCAddOp { - TensorCAddOp(float v) : val(v) {} - - __device__ __forceinline__ void operator()(float* out, float* in) { - *out += val * *in; - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = *in1 + val * *in2; - } - - float val; -}; - -void THCudaTensor_cadd(THCState *state, THCudaTensor *self_, THCudaTensor* src1, float value, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THArgCheck(THCudaTensor_nElement(state, src1) == - THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - if (value == 1.0f) { - // self += src2 - if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorAddOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - // self += value * src2 - if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorCAddOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - } else { - THCudaTensor_resizeAs(state, self_, src1); - - if (value == 1.0f) { - // self = src1 + src2 - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorAddOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); 
- } - } else { - // self = src1 + value * src2 - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorCAddOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - } - - THCudaCheck(cudaGetLastError()); -} - -struct TensorSubOp { - __device__ __forceinline__ void operator()(float* out, float* in) { - *out -= *in; - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = *in1 - *in2; - } -}; - - -struct TensorCSubOp { - TensorCSubOp(float v) : val(v) {} - - __device__ __forceinline__ void operator()(float* out, float* in) { - *out -= val * *in; - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = *in1 - val * *in2; - } - - float val; -}; - - -void THCudaTensor_csub(THCState *state, THCudaTensor *self_, THCudaTensor* src1, float value, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THArgCheck(THCudaTensor_nElement(state, src1) == - THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - if (value == 1.0f) { - // self -= src2 - if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorSubOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - // self += -value * src2 - if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorCAddOp(-value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - } else { - THCudaTensor_resizeAs(state, self_, src1); - - if (value == 1.0f) { - // self = src1 - src2 - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorSubOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - // self = src1 - value * src2 - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorCAddOp(-value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } - } - - THCudaCheck(cudaGetLastError()); -} - - -struct TensorMulOp { - __device__ __forceinline__ void operator()(float* out, float* in) { - *out *= *in; - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = *in1 * *in2; - } -}; - -void THCudaTensor_cmul(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THArgCheck(THCudaTensor_nElement(state, src1) == - THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - - if (self_ == src1) { - // self *= src2 - if (!THCudaTensor_pointwiseApply2(state, self_, src2, TensorMulOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCudaTensor_resizeAs(state, self_, src1); - - // self = src1 * src2 - if (!THCudaTensor_pointwiseApply3(state, self_, src1, src2, TensorMulOp())) { + if (!THC_pointwiseApply2(state, self_, src, TensorSigmoidOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -265,12 +108,12 @@ void THCudaTensor_cmax(THCState *state, THCudaTensor *self, THCudaTensor *src1, THCudaTensor_nElement(state, src2), 2, "sizes do not match"); if (self == src1) { - if (!THCudaTensor_pointwiseApply2(state, self, src2, TensorMaxOp())) { + if (!THC_pointwiseApply2(state, self, src2, TensorMaxOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self, src1); - if (!THCudaTensor_pointwiseApply3(state, self, src1, src2, TensorMaxOp())) { + if (!THC_pointwiseApply3(state, self, src1, src2, TensorMaxOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -293,12 +136,12 @@ void THCudaTensor_cmin(THCState *state, THCudaTensor *self, THCudaTensor *src1, 
THCudaTensor_nElement(state, src2), 2, "sizes do not match"); if (self == src1) { - if (!THCudaTensor_pointwiseApply2(state, self, src2, TensorMinOp())) { + if (!THC_pointwiseApply2(state, self, src2, TensorMinOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self, src1); - if (!THCudaTensor_pointwiseApply3(state, self, src1, src2, TensorMinOp())) { + if (!THC_pointwiseApply3(state, self, src1, src2, TensorMinOp())) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -323,12 +166,12 @@ void THCudaTensor_cmaxValue(THCState *state, THCudaTensor *self, THCudaTensor *s THAssert(THCudaTensor_checkGPU(state, 2, self, src)); if (self == src) { - if (!THCudaTensor_pointwiseApply1(state, self, TensorMaxValueOp(value))) { + if (!THC_pointwiseApply1(state, self, TensorMaxValueOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self, src); - if (!THCudaTensor_pointwiseApply2(state, self, src, TensorMaxValueOp(value))) { + if (!THC_pointwiseApply2(state, self, src, TensorMaxValueOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } @@ -353,13 +196,259 @@ void THCudaTensor_cminValue(THCState *state, THCudaTensor *self, THCudaTensor *s THAssert(THCudaTensor_checkGPU(state, 2, self, src)); if (self == src) { - if (!THCudaTensor_pointwiseApply1(state, self, TensorMinValueOp(value))) { + if (!THC_pointwiseApply1(state, self, TensorMinValueOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } else { THCudaTensor_resizeAs(state, self, src); - if (!THCudaTensor_pointwiseApply2(state, self, src, TensorMinValueOp(value))) { + if (!THC_pointwiseApply2(state, self, src, TensorMinValueOp(value))) { THArgCheck(false, 2, CUTORCH_DIM_WARNING); } } } + +template <typename T> +struct TensorAddOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out += *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 + *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorAddOp<half> { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*out, *in); +#else + float fout = __half2float(*out); + float fin = __half2float(*in); + fout += fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*in1, *in2); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 + fin2; + *out = __float2half(fout); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +template <typename T> +struct TensorCAddOp { + TensorCAddOp(T v) : val(v) {} + + __device__ __forceinline__ void operator()(T* out, T* in) { + *out += val * *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 + val * *in2; + } + + T val; +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorCAddOp<half> { + TensorCAddOp(half v) : val(v) {} + + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*out, __hmul(val, *in)); +#else + float fout = __half2float(*out); + float fval = __half2float(val); + float fin = __half2float(*in); + + fout += fval * fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hadd(*in1, __hmul(val, *in2)); +#else + float fin1 = __half2float(*in1); + float fin2 = 
__half2float(*in2); + float fval = __half2float(val); + + float fout = fin1 + fval * fin2; + *out = __float2half(fout); +#endif + } + + half val; +}; +#endif // CUDA_HALF_TENSOR + +template <typename T> +struct TensorSubOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out -= *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 - *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorSubOp<half> { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hsub(*out, *in); +#else + float fout = __half2float(*out); + float fin = __half2float(*in); + fout -= fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hsub(*in1, *in2); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 - fin2; + *out = __float2half(fout); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +template <typename T> +struct TensorMulOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out *= *in; + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = *in1 * *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorMulOp<half> { + __device__ __forceinline__ void operator()(half* out, half* in) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hmul(*out, *in); +#else + float fout = __half2float(*out); + float fin = __half2float(*in); + fout *= fin; + *out = __float2half(fout); +#endif + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { +#ifdef CUDA_HALF_INSTRUCTIONS + *out = __hmul(*in1, *in2); +#else + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 * fin2; + *out = __float2half(fout); +#endif + } +}; +#endif // CUDA_HALF_TENSOR + +template <typename T> +struct TensorCPowOp { + __device__ __forceinline__ void operator()(T* out, T* in) { + *out = powf((float) *out, (float) *in); + } + + __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { + *out = powf((float) *in1, (float) *in2); + } +}; + +template <> +struct TensorCPowOp<double> { + __device__ __forceinline__ void operator()(double* out, double* in) { + *out = pow(*out, *in); + } + + __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) { + *out = pow(*in1, *in2); + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorCPowOp<half> { + __device__ __forceinline__ void operator()(half* out, half* in) { + // No fp16 pow function yet + float fout = __half2float(*out); + float fin = __half2float(*in); + fout = powf(fout, fin); + *out = __float2half(fout); + } + + __device__ __forceinline__ void operator()(half* out, half* in1, half* in2) { + // No fp16 pow function yet + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = powf(fin1, fin2); + *out = __float2half(fout); + } +}; +#endif // CUDA_HALF_TENSOR + +template <typename T> +struct TensorDivOp { + __device__ __forceinline__ void + operator()(T* out, T* in) { + *out /= *in; + } + + __device__ __forceinline__ void + operator()(T* out, T* in1, T* in2) { + *out = *in1 / *in2; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct TensorDivOp<half> { + __device__ __forceinline__ void + operator()(half* out, half* in) { + // No fp16 div instruction yet + float fout = __half2float(*out); + float fin = __half2float(*in); + fout /= fin; + *out = 
__float2half(fout); + } + + __device__ __forceinline__ void + operator()(half* out, half* in1, half* in2) { + // No fp16 div instruction yet + float fin1 = __half2float(*in1); + float fin2 = __half2float(*in2); + float fout = fin1 / fin2; + *out = __float2half(fout); + } +}; +#endif // CUDA_HALF_TENSOR + +#include "generic/THCTensorMathPointwise.cu" +#include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorScatterGather.cu b/lib/THC/THCTensorScatterGather.cu index 876f32a..1059b03 100644 --- a/lib/THC/THCTensorScatterGather.cu +++ b/lib/THC/THCTensorScatterGather.cu @@ -2,7 +2,6 @@ #include "THCGeneral.h" #include "THCApply.cuh" - // Compute the offsets into the given tensors for a linear index. For the 't2' // tensor, dimension 'dim' is skipped. The tensors are assumed to have the same // size (with the exception of 't2' in dimension 'dim'). @@ -11,9 +10,9 @@ template <typename IndexType, int Dims> struct IndexToScatterGatherOffsets { static __device__ void compute( IndexType linearId, const int dim, - const TensorInfo<IndexType>& index, IndexType* indexOffset, - const TensorInfo<IndexType>& t1, IndexType* t1Offset, - const TensorInfo<IndexType>& t2, IndexType* t2Offset) { + const TensorInfo<float, IndexType>& index, IndexType* indexOffset, + const TensorInfo<float, IndexType>& t1, IndexType* t1Offset, + const TensorInfo<float, IndexType>& t2, IndexType* t2Offset) { for (int d = Dims - 1; d >= 0; d--) { IndexType curDimIndex = linearId % index.sizes[d]; *indexOffset += curDimIndex * index.strides[d]; @@ -27,8 +26,8 @@ struct IndexToScatterGatherOffsets { static __device__ void compute( IndexType linearId, const int dim, - const TensorInfo<IndexType>& index, IndexType* indexOffset, - const TensorInfo<IndexType>& t2, IndexType* t2Offset) { + const TensorInfo<float, IndexType>& index, IndexType* indexOffset, + const TensorInfo<float, IndexType>& t2, IndexType* t2Offset) { for (int d = Dims - 1; d >= 0; d--) { IndexType curDimIndex = linearId % index.sizes[d]; *indexOffset += curDimIndex * index.strides[d]; @@ -45,9 +44,9 @@ template <typename IndexType> struct IndexToScatterGatherOffsets<IndexType, -1> { static __device__ void compute( IndexType linearId, const int dim, - const TensorInfo<IndexType>& index, IndexType* indexOffset, - const TensorInfo<IndexType>& t1, IndexType* t1Offset, - const TensorInfo<IndexType>& t2, IndexType* t2Offset) { + const TensorInfo<float, IndexType>& index, IndexType* indexOffset, + const TensorInfo<float, IndexType>& t1, IndexType* t1Offset, + const TensorInfo<float, IndexType>& t2, IndexType* t2Offset) { for (int d = index.dims - 1; d >= 0; d--) { IndexType curDimIndex = linearId % index.sizes[d]; *indexOffset += curDimIndex * index.strides[d]; @@ -61,8 +60,8 @@ struct IndexToScatterGatherOffsets<IndexType, -1> { static __device__ void compute( IndexType linearId, const int dim, - const TensorInfo<IndexType>& index, IndexType* indexOffset, - const TensorInfo<IndexType>& t2, IndexType* t2Offset) { + const TensorInfo<float, IndexType>& index, IndexType* indexOffset, + const TensorInfo<float, IndexType>& t2, IndexType* t2Offset) { for (int d = index.dims - 1; d >= 0; d--) { IndexType curDimIndex = linearId % index.sizes[d]; *indexOffset += curDimIndex * index.strides[d]; @@ -77,9 +76,9 @@ struct IndexToScatterGatherOffsets<IndexType, -1> { template <typename IndexType, int Dims> __global__ void THCudaTensor_gatherKernel( - TensorInfo<IndexType> tensor, - TensorInfo<IndexType> src, - TensorInfo<IndexType> index, + TensorInfo<float, IndexType> tensor, + 
TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> index, const int dim, const IndexType totalElements) { for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; @@ -135,17 +134,20 @@ void THCudaTensor_gather(THCState* state, THCudaTensor *tensor, THCudaTensor *sr THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); THCudaTensor* oldTensor = NULL; - if (THC_overlappingIndices(state, tensor)) { + if (TensorUtils<THCudaTensor>::overlappingIndices(state, tensor)) { oldTensor = tensor; tensor = THCudaTensor_newContiguous(state, tensor); } - if (THC_canUse32BitIndexMath(state, tensor) && - THC_canUse32BitIndexMath(state, src) && - THC_canUse32BitIndexMath(state, index)) { - TensorInfo<unsigned int> tensorInfo(state, tensor); - TensorInfo<unsigned int> srcInfo(state, src); - TensorInfo<unsigned int> indexInfo(state, index); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, tensor) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, index)) { + TensorInfo<float, unsigned int> tensorInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, tensor); + TensorInfo<float, unsigned int> srcInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, src); + TensorInfo<float, unsigned int> indexInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, index); // Specialize for a small number of dimensions. switch (indexInfo.dims) { @@ -163,15 +165,18 @@ void THCudaTensor_gather(THCState* state, THCudaTensor *tensor, THCudaTensor *sr break; } } else { - TensorInfo<unsigned long> tensorInfo(state, tensor); - TensorInfo<unsigned long> srcInfo(state, src); - TensorInfo<unsigned long> indexInfo(state, index); + TensorInfo<float, unsigned long> tensorInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, tensor); + TensorInfo<float, unsigned long> srcInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, src); + TensorInfo<float, unsigned long> indexInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, index); RUN(unsigned long, -1) } if (oldTensor) { - THCudaTensor_copyIgnoringOverlaps(state, oldTensor, tensor); + TensorUtils<THCudaTensor>::copyIgnoringOverlaps(state, oldTensor, tensor); THCudaTensor_free(state, tensor); tensor = oldTensor; } @@ -182,9 +187,9 @@ void THCudaTensor_gather(THCState* state, THCudaTensor *tensor, THCudaTensor *sr template <typename IndexType, int Dims> __global__ void THCudaTensor_scatterKernel( - TensorInfo<IndexType> tensor, - TensorInfo<IndexType> src, - TensorInfo<IndexType> index, + TensorInfo<float, IndexType> tensor, + TensorInfo<float, IndexType> src, + TensorInfo<float, IndexType> index, const int dim, const IndexType totalElements) { for (IndexType linearId = blockIdx.x * blockDim.x + threadIdx.x; @@ -230,7 +235,7 @@ void THCudaTensor_scatter(THCState* state, THCudaTensor *tensor, int dim, THCuda } } - THArgCheck(THCudaTensor_nDimension(state, tensor) <= MAX_CUTORCH_DIMS, + THArgCheck(THCudaTensor_nDimension(state, tensor) <= MAX_CUTORCH_DIMS, 1, CUTORCH_DIM_WARNING); const long totalElements = THCudaTensor_nElement(state, index); @@ -239,17 +244,20 @@ void THCudaTensor_scatter(THCState* state, THCudaTensor *tensor, int dim, THCuda THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); THCudaTensor* oldTensor = NULL; - if (THC_overlappingIndices(state, tensor)) { + if (TensorUtils<THCudaTensor>::overlappingIndices(state, tensor)) { oldTensor = tensor; tensor = THCudaTensor_newContiguous(state, tensor); } - 
if (THC_canUse32BitIndexMath(state, tensor) && - THC_canUse32BitIndexMath(state, src) && - THC_canUse32BitIndexMath(state, index)) { - TensorInfo<unsigned int> tensorInfo(state, tensor); - TensorInfo<unsigned int> srcInfo(state, src); - TensorInfo<unsigned int> indexInfo(state, index); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, tensor) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, src) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, index)) { + TensorInfo<float, unsigned int> tensorInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, tensor); + TensorInfo<float, unsigned int> srcInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, src); + TensorInfo<float, unsigned int> indexInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, index); // Specialize for a small number of dimensions. switch (indexInfo.dims) { @@ -267,15 +275,18 @@ void THCudaTensor_scatter(THCState* state, THCudaTensor *tensor, int dim, THCuda break; } } else { - TensorInfo<unsigned long> tensorInfo(state, tensor); - TensorInfo<unsigned long> srcInfo(state, src); - TensorInfo<unsigned long> indexInfo(state, index); + TensorInfo<float, unsigned long> tensorInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, tensor); + TensorInfo<float, unsigned long> srcInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, src); + TensorInfo<float, unsigned long> indexInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, index); RUN(unsigned long, -1) } if (oldTensor) { - THCudaTensor_copyIgnoringOverlaps(state, oldTensor, tensor); + TensorUtils<THCudaTensor>::copyIgnoringOverlaps(state, oldTensor, tensor); THCudaTensor_free(state, tensor); tensor = oldTensor; } @@ -286,8 +297,8 @@ void THCudaTensor_scatter(THCState* state, THCudaTensor *tensor, int dim, THCuda template <typename IndexType, int Dims> __global__ void THCudaTensor_scatterFillKernel( - TensorInfo<IndexType> tensor, - TensorInfo<IndexType> index, + TensorInfo<float, IndexType> tensor, + TensorInfo<float, IndexType> index, float value, const int dim, const IndexType totalElements) { @@ -313,17 +324,21 @@ __global__ void THCudaTensor_scatterFillKernel( <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ tensorInfo, indexInfo, value, dim, (TYPE)totalElements); -void THCudaTensor_scatterFill(THCState* state, THCudaTensor *tensor, int dim, THCudaTensor *index, float value) { +void +THCudaTensor_scatterFill(THCState* state, THCudaTensor *tensor, + int dim, THCudaTensor *index, float value) { THAssert(THCudaTensor_checkGPU(state, 2, tensor, index)); THArgCheck(dim >= 0 && dim < THCudaTensor_nDimension(state, tensor), 2, "Index dimension is out of bounds"); - THArgCheck(THCudaTensor_nDimension(state, index) == THCudaTensor_nDimension(state, tensor), 3, + THArgCheck(THCudaTensor_nDimension(state, index) == + THCudaTensor_nDimension(state, tensor), 3, "Index tensor must have same dimensions as output tensor"); for (int d = 0; d < THCudaTensor_nDimension(state, tensor); d++) { if (d != dim) { - THArgCheck(THCudaTensor_size(state, tensor, d) == THCudaTensor_size(state, index, d), 4, + THArgCheck(THCudaTensor_size(state, tensor, d) == + THCudaTensor_size(state, index, d), 4, "Index tensor must have same size as output tensor apart from the specified dimension"); } } @@ -337,15 +352,17 @@ void THCudaTensor_scatterFill(THCState* state, THCudaTensor *tensor, int dim, TH THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); THCudaTensor* oldTensor = NULL; - if (THC_overlappingIndices(state, 
tensor)) { + if (TensorUtils<THCudaTensor>::overlappingIndices(state, tensor)) { oldTensor = tensor; tensor = THCudaTensor_newContiguous(state, tensor); } - if (THC_canUse32BitIndexMath(state, tensor) && - THC_canUse32BitIndexMath(state, index)) { - TensorInfo<unsigned int> tensorInfo(state, tensor); - TensorInfo<unsigned int> indexInfo(state, index); + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, tensor) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, index)) { + TensorInfo<float, unsigned int> tensorInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, tensor); + TensorInfo<float, unsigned int> indexInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, index); // Specialize for a small number of dimensions. switch (indexInfo.dims) { @@ -363,14 +380,16 @@ void THCudaTensor_scatterFill(THCState* state, THCudaTensor *tensor, int dim, TH break; } } else { - TensorInfo<unsigned long> tensorInfo(state, tensor); - TensorInfo<unsigned long> indexInfo(state, index); + TensorInfo<float, unsigned long> tensorInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, tensor); + TensorInfo<float, unsigned long> indexInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, index); RUN(unsigned long, -1); } if (oldTensor) { - THCudaTensor_copyIgnoringOverlaps(state, oldTensor, tensor); + TensorUtils<THCudaTensor>::copyIgnoringOverlaps(state, oldTensor, tensor); THCudaTensor_free(state, tensor); tensor = oldTensor; } diff --git a/lib/THC/THCTensorSort.cu b/lib/THC/THCTensorSort.cu index c35a36a..3b9562d 100644 --- a/lib/THC/THCTensorSort.cu +++ b/lib/THC/THCTensorSort.cu @@ -1,6 +1,7 @@ #include "THCReduceApplyUtils.cuh" #include "THCSortUtils.cuh" #include "THCTensorCopy.h" +#include "THCTensorTypeUtils.cuh" #include <thrust/device_ptr.h> #include <thrust/sort.h> @@ -28,7 +29,7 @@ unsigned long nextHighestPowerOf2(unsigned long n) { // `sliceSize - 1`. 
template <typename IndexType, int Dim> __global__ void -fillSliceWithIndex(TensorInfo<IndexType> out, +fillSliceWithIndex(TensorInfo<float, IndexType> out, IndexType totalSlices, IndexType sliceSize, IndexType sliceStride) { @@ -39,7 +40,7 @@ fillSliceWithIndex(TensorInfo<IndexType> out, } const unsigned long offset = - IndexToOffset<IndexType, Dim>::get(slice, out); + IndexToOffset<float, IndexType, Dim>::get(slice, out); float* base = &out.data[offset]; for (long i = threadIdx.x; i < sliceSize; i += blockDim.x) { @@ -76,9 +77,10 @@ void THCudaTensor_fillSliceWithIndex(THCState* state, <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ info, numSlices, sliceSize, info.strides[collapseDim]) - if (THC_canUse32BitIndexMath(state, t)) { - TensorInfo<unsigned int> info(state, t, dim); - info.sizes[dim] = 1; + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, t)) { + TensorInfo<float, unsigned int> info = + getTensorInfo<THCudaTensor, unsigned int>(state, t); + info.reduceDim(dim); int collapseDim = info.collapseDims(dim); if (info.isContiguous()) { @@ -93,8 +95,9 @@ void THCudaTensor_fillSliceWithIndex(THCState* state, } } } else { - TensorInfo<unsigned long> info(state, t, dim); - info.sizes[dim] = 1; + TensorInfo<float, unsigned long> info = + getTensorInfo<THCudaTensor, unsigned long>(state, t); + info.reduceDim(dim); int collapseDim = info.collapseDims(dim); // catch-all implementation @@ -221,13 +224,15 @@ THC_API void THCudaTensor_sortKeyValueInplace(THCState* state, // The constructed key/value tensor info is used to select the slice // we are sorting on a per-block basis - if (THC_canUse32BitIndexMath(state, key)) { - TensorInfo<unsigned int> keyInfo(state, key); - keyInfo.sizes[dim] = 1; + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, key)) { + TensorInfo<float, unsigned int> keyInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, key); + keyInfo.reduceDim(dim); int collapseKeyDim = keyInfo.collapseDims(dim); - TensorInfo<unsigned int> valueInfo(state, value); - valueInfo.sizes[dim] = 1; + TensorInfo<float, unsigned int> valueInfo = + getTensorInfo<THCudaTensor, unsigned int>(state, value); + valueInfo.reduceDim(dim); int collapseValueDim = valueInfo.collapseDims(dim); if (keyInfo.isContiguous()) { @@ -246,12 +251,14 @@ THC_API void THCudaTensor_sortKeyValueInplace(THCState* state, } } } else { - TensorInfo<unsigned long> keyInfo(state, key); - keyInfo.sizes[dim] = 1; + TensorInfo<float, unsigned long> keyInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, key); + keyInfo.reduceDim(dim); int collapseKeyDim = keyInfo.collapseDims(dim); - TensorInfo<unsigned long> valueInfo(state, value); - valueInfo.sizes[dim] = 1; + TensorInfo<float, unsigned long> valueInfo = + getTensorInfo<THCudaTensor, unsigned long>(state, value); + valueInfo.reduceDim(dim); int collapseValueDim = valueInfo.collapseDims(dim); // long case is rare, just instantiate these versions diff --git a/lib/THC/THCTensorTopK.cu b/lib/THC/THCTensorTopK.cu index a8c189f..582143e 100644 --- a/lib/THC/THCTensorTopK.cu +++ b/lib/THC/THCTensorTopK.cu @@ -3,6 +3,7 @@ #include "THCTensorSort.h" #include "THCAsmUtils.cuh" #include "THCScanUtils.cuh" +#include "THCTensorTypeUtils.cuh" #include <algorithm> // for std::min #if CUDA_VERSION >= 7000 @@ -248,18 +249,18 @@ __device__ void radixSelect(const RadixConverter& conv, } template <typename IndexType, int Dim, bool Order> -__global__ void gatherTopK(TensorInfo<IndexType> input, +__global__ void gatherTopK(TensorInfo<float, IndexType> input, 
IndexType inputSliceSize, IndexType outputSliceSize, // aka `k` IndexType numInputSlices, IndexType inputWithinSliceStride, - TensorInfo<IndexType> topK, + TensorInfo<float, IndexType> topK, IndexType numTopKSlices, IndexType topKWithinSliceStride, - TensorInfo<IndexType> indices, + TensorInfo<float, IndexType> indices, IndexType indicesWithinSliceStride) { // Indices are limited to integer fp precision, so counts can fit in // int32, regardless of IndexType @@ -272,11 +273,11 @@ __global__ void gatherTopK(TensorInfo<IndexType> input, // Find the start offset for our slice IndexType sliceStartIndex = - IndexToOffset<IndexType, Dim>::get(slice, input); + IndexToOffset<float, IndexType, Dim>::get(slice, input); IndexType topKSliceStartIndex = - IndexToOffset<IndexType, Dim>::get(slice, topK); + IndexToOffset<float, IndexType, Dim>::get(slice, topK); IndexType indicesSliceStartIndex = - IndexToOffset<IndexType, Dim>::get(slice, indices); + IndexToOffset<float, IndexType, Dim>::get(slice, indices); float* inputSliceStart = &input.data[sliceStartIndex]; float* topKSliceStart = &topK.data[topKSliceStartIndex]; @@ -414,8 +415,8 @@ THC_API void THCudaTensor_topk(THCState* state, THCudaTensor_resize(state, indices, topKSize, NULL); THLongStorage_free(topKSize); -#define RUN_K(T, DIM, DIR) \ - gatherTopK<T, DIM, DIR> \ +#define RUN_K(INDEX_T, DIM, DIR) \ + gatherTopK<INDEX_T, DIM, DIR> \ <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ inputInfo, \ sliceSize, \ @@ -430,70 +431,73 @@ THC_API void THCudaTensor_topk(THCState* state, indicesInfo, \ indicesInfo.strides[collapseIndicesDim]) -#define RUN_DIR(T, DIM) \ +#define RUN_DIR(INDEX_T, DIM) \ if (dir) { \ - RUN_K(T, DIM, true); \ + RUN_K(INDEX_T, DIM, true); \ } else { \ - RUN_K(T, DIM, false); \ + RUN_K(INDEX_T, DIM, false); \ } -#define RUN_DIM(T) \ +#define RUN_DIM(INDEX_T) \ if (allDims == 1) { \ - RUN_DIR(T, 1); \ + RUN_DIR(INDEX_T, 1); \ } else if (allDims == 2) { \ - RUN_DIR(T, 2); \ + RUN_DIR(INDEX_T, 2); \ } else if (allDims == 3) { \ - RUN_DIR(T, 3); \ + RUN_DIR(INDEX_T, 3); \ } else { \ - RUN_DIR(T, -1); \ + RUN_DIR(INDEX_T, -1); \ } -#define RUN_T(T) \ - TensorInfo<T> inputInfo(state, input); \ - TensorInfo<T> topKInfo(state, topK); \ - TensorInfo<T> indicesInfo(state, indices); \ +#define RUN_T(INDEX_T) \ + TensorInfo<float, INDEX_T> inputInfo = \ + getTensorInfo<THCudaTensor, INDEX_T>(state, input); \ + TensorInfo<float, INDEX_T> topKInfo = \ + getTensorInfo<THCudaTensor, INDEX_T>(state, topK); \ + TensorInfo<float, INDEX_T> indicesInfo = \ + getTensorInfo<THCudaTensor, INDEX_T>(state, indices); \ \ - /* We use these structures solely to find the offset to */ \ - /* each slice we are operating on */ \ - inputInfo.sizes[dim] = 1; \ - topKInfo.sizes[dim] = 1; \ - indicesInfo.sizes[dim] = 1; \ + /* We use these structures solely to find the offset to */ \ + /* each slice we are operating on */ \ + inputInfo.sizes[dim] = 1; \ + topKInfo.sizes[dim] = 1; \ + indicesInfo.sizes[dim] = 1; \ \ - /* Collapse all other dims */ \ - int collapseInputDim = inputInfo.collapseDims(dim); \ - int collapseTopKDim = topKInfo.collapseDims(dim); \ - int collapseIndicesDim = indicesInfo.collapseDims(dim); \ + /* Collapse all other dims */ \ + int collapseInputDim = inputInfo.collapseDims(dim); \ + int collapseTopKDim = topKInfo.collapseDims(dim); \ + int collapseIndicesDim = indicesInfo.collapseDims(dim); \ \ - long inputSlices = 1; \ - long topKSlices = 1; \ - for (int i = 0; i < numDims; ++i) { \ - inputSlices *= inputInfo.sizes[i]; \ - topKSlices 
*= topKInfo.sizes[i]; \ - } \ + long inputSlices = 1; \ + long topKSlices = 1; \ + for (int i = 0; i < numDims; ++i) { \ + inputSlices *= inputInfo.sizes[i]; \ + topKSlices *= topKInfo.sizes[i]; \ + } \ \ - dim3 grid; \ - if (!THC_getGridFromTiles(inputSlices, grid)) { \ - THError("Slice to sort is too large"); \ - } \ + dim3 grid; \ + if (!THC_getGridFromTiles(inputSlices, grid)) { \ + THError("Slice to sort is too large"); \ + } \ \ - dim3 block(std::min(THCRoundUp(sliceSize, 32L), 1024L)); \ + dim3 block(std::min(THCRoundUp(sliceSize, 32L), 1024L)); \ \ - /* This is used as a template parameter to calculate indices. */ \ - /* We only specialize it if all collapsed dim sizes are the */ \ - /* same; otherwise, we use -1 which is the specialization */ \ - /* parameter for arbitrary dimensions */ \ - int allDims = inputInfo.dims; \ - if (topKInfo.dims != allDims || indicesInfo.dims != allDims) { \ - allDims = -1; \ - } \ + /* This is used as a template parameter to calculate indices. */ \ + /* We only specialize it if all collapsed dim sizes are the */ \ + /* same; otherwise, we use -1 which is the specialization */ \ + /* parameter for arbitrary dimensions */ \ + int allDims = inputInfo.dims; \ + if (topKInfo.dims != allDims || indicesInfo.dims != allDims) { \ + allDims = -1; \ + } \ \ - RUN_DIM(T); + RUN_DIM(INDEX_T); // Based on required index size, run the algorithm with the // appropriate index type - if (THC_canUse32BitIndexMath(state, input) && - THC_canUse32BitIndexMath(state, topK) && - THC_canUse32BitIndexMath(state, indices)) { + if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, topK) && + TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, indices)) { RUN_T(unsigned int); } else { RUN_T(unsigned long); diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu new file mode 100644 index 0000000..fc9bd60 --- /dev/null +++ b/lib/THC/THCTensorTypeUtils.cu @@ -0,0 +1,210 @@ +#include "THCTensorTypeUtils.cuh" +#include "THCTensor.h" +#include "THCTensorCopy.h" +#include "THCHalf.h" +#include <stdlib.h> + +namespace { + +struct SizeAndStride { + long size; + long stride; +}; + +int compareSizeAndStride(const void* a, const void* b) { + const SizeAndStride* aS = (const SizeAndStride*) a; + const SizeAndStride* bS = (const SizeAndStride*) b; + + return aS->stride < bS->stride; +} + +} + +#define IMPL_TENSOR_UTILS(TENSOR_TYPE, DATA_TYPE) \ + \ +TENSOR_TYPE* \ +TensorUtils<TENSOR_TYPE>::newTensor(THCState* state) { \ + return TENSOR_TYPE##_new(state); \ +} \ + \ +TENSOR_TYPE* \ +TensorUtils<TENSOR_TYPE>::newContiguous(THCState* state, \ + TENSOR_TYPE* t) { \ + return TENSOR_TYPE##_newContiguous(state, t); \ +} \ + \ +void \ +TensorUtils<TENSOR_TYPE>::retain(THCState* state, \ + TENSOR_TYPE* t) { \ + TENSOR_TYPE##_retain(state, t); \ +} \ + \ +void \ +TensorUtils<TENSOR_TYPE>::free(THCState* state, \ + TENSOR_TYPE* t) { \ + TENSOR_TYPE##_free(state, t); \ +} \ + \ +void \ +TensorUtils<TENSOR_TYPE>::freeCopyTo(THCState* state, \ + TENSOR_TYPE* src, \ + TENSOR_TYPE* dst) { \ + TENSOR_TYPE##_freeCopyTo(state, src, dst); \ +} \ + \ +void \ +TensorUtils<TENSOR_TYPE>::resizeAs(THCState* state, \ + TENSOR_TYPE* dst, \ + TENSOR_TYPE* src) { \ + TENSOR_TYPE##_resizeAs(state, dst, src); \ +} \ + \ +DATA_TYPE* \ +TensorUtils<TENSOR_TYPE>::getData(THCState* state, \ + TENSOR_TYPE* t) { \ + /* FIXME: no cast is required except for THCudaHalfTensor */ \ + return (DATA_TYPE*) TENSOR_TYPE##_data(state, t); \ +} 
\ + \ +long \ +TensorUtils<TENSOR_TYPE>::getNumElements(THCState* state, \ + TENSOR_TYPE* t) { \ + return TENSOR_TYPE##_nElement(state, t); \ +} \ + \ +long \ +TensorUtils<TENSOR_TYPE>::getSize(THCState* state, \ + TENSOR_TYPE* t, \ + int dim) { \ + return TENSOR_TYPE##_size(state, t, dim); \ +} \ + \ +long \ +TensorUtils<TENSOR_TYPE>::getStride(THCState* state, \ + TENSOR_TYPE* t, \ + int dim) { \ + return TENSOR_TYPE##_stride(state, t, dim); \ +} \ + \ +int \ +TensorUtils<TENSOR_TYPE>::getDims(THCState* state, \ + TENSOR_TYPE* t) { \ + return TENSOR_TYPE##_nDimension(state, t); \ +} \ + \ +bool \ +TensorUtils<TENSOR_TYPE>::isContiguous(THCState* state, \ + TENSOR_TYPE* t) { \ + return TENSOR_TYPE##_isContiguous(state, t); \ +} \ + \ +int \ +TensorUtils<TENSOR_TYPE>::getDevice(THCState* state, \ + TENSOR_TYPE* t) { \ + return TENSOR_TYPE##_getDevice(state, t); \ +} \ + \ +void \ +TensorUtils<TENSOR_TYPE>::copyIgnoringOverlaps(THCState* state, \ + TENSOR_TYPE* dst, \ + TENSOR_TYPE* src) { \ + return TENSOR_TYPE##_copyIgnoringOverlaps(state, dst, src); \ +} \ + \ +bool \ +TensorUtils<TENSOR_TYPE>::overlappingIndices(THCState* state, \ + TENSOR_TYPE* t) { \ + /* In this function, we don't care about permutations of the */ \ + /* size/stride arrays (transpositions). */ \ + /* We order the size/stride arrays by stride, skipping dimensions of */ \ + /* size 1. Strides of dimensions of size 1 don't matter, since there */ \ + /* is only one addressing point in them. */ \ + /* In this reordered view, the tensor is contiguous if */ \ + /* stride[dim] == size[dim + 1] * stride[dim + 1] for all `dim`. */ \ + /* The tensor has holes if */ \ + /* stride[dim] > size[dim + 1] * stride[dim + 1] for one or more */ \ + /* `dim`. */ \ + /* The tensor has overlaps if */ \ + /* stride[dim] < size[dim + 1] * stride[dim + 1] for one or more */ \ + /* `dim`, or the innermost stride is 0. */ \ + \ + /* Extract size/stride arrays; only consider size >1 dims. 
*/ \ + SizeAndStride info[MAX_CUTORCH_DIMS]; \ + \ + int dims = TensorUtils<TENSOR_TYPE>::getDims(state, t); \ + int nonSize1Dims = 0; \ + for (int i = 0; i < dims; ++i) { \ + long size = TensorUtils<TENSOR_TYPE>::getSize(state, t, i); \ + if (size > 1) { \ + info[nonSize1Dims].size = size; \ + info[nonSize1Dims].stride = \ + TensorUtils<TENSOR_TYPE>::getStride(state, t, i); \ + ++nonSize1Dims; \ + } \ + } \ + \ + if (nonSize1Dims == 0) { \ + /* no overlap */ \ + return false; \ + } \ + \ + /* Ascending order (innermost dimension in sorted view is at [0]) */ \ + qsort(info, nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); \ + \ + /* Base case: innermost dimension must have stride >= 1 */ \ + if (info[nonSize1Dims - 1].stride < 1) { \ + return true; \ + } \ + \ + /* Subsequent dimensions, if any */ \ + for (int i = nonSize1Dims - 2; i >= 0; --i) { \ + if (info[i].stride < info[i + 1].size * info[i + 1].stride) { \ + /* There are overlaps */ \ + return true; \ + } \ + } \ + \ + /* Tensor has holes or is contiguous */ \ + return false; \ +} \ + \ +bool \ +TensorUtils<TENSOR_TYPE>::canUse32BitIndexMath(THCState* state, \ + TENSOR_TYPE* t) { \ + long elements = TensorUtils<TENSOR_TYPE>::getNumElements(state, t); \ + if (elements >= UINT_MAX) { \ + return false; \ + } \ + \ + long offset = 0; \ + long linearId = elements - 1; \ + \ + for (int i = TensorUtils<TENSOR_TYPE>::getDims(state, t) - 1; i >= 0; --i) { \ + long curDimIndex = \ + linearId % TensorUtils<TENSOR_TYPE>::getSize(state, t, i); \ + long curDimOffset = curDimIndex * \ + TensorUtils<TENSOR_TYPE>::getStride(state, t, i); \ + offset += curDimOffset; \ + linearId /= TensorUtils<TENSOR_TYPE>::getSize(state, t, i); \ + } \ + \ + if (offset >= UINT_MAX) { \ + return false; \ + } \ + \ + return true; \ +} + +IMPL_TENSOR_UTILS(THCudaByteTensor, unsigned char) +IMPL_TENSOR_UTILS(THCudaCharTensor, char) +IMPL_TENSOR_UTILS(THCudaShortTensor, short) +IMPL_TENSOR_UTILS(THCudaIntTensor, int) +IMPL_TENSOR_UTILS(THCudaLongTensor, long) +IMPL_TENSOR_UTILS(THCudaTensor, float) +IMPL_TENSOR_UTILS(THCudaDoubleTensor, double) + +#ifdef CUDA_HALF_TENSOR +IMPL_TENSOR_UTILS(THCudaHalfTensor, half); +#endif + +#undef IMPL_TENSOR_UTILS diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh new file mode 100644 index 0000000..4456f47 --- /dev/null +++ b/lib/THC/THCTensorTypeUtils.cuh @@ -0,0 +1,184 @@ +#ifndef THC_TENSOR_TYPE_UTILS_INC +#define THC_TENSOR_TYPE_UTILS_INC + +#include <cuda.h> +#include <assert.h> +#include "THCGeneral.h" +#include "THCHalf.h" +#include "THCTensor.h" +#include "THCTensorInfo.cuh" + +/// A utility for accessing THCuda*Tensor types in a generic manner + +/// Equivalent to C++11's type_traits std::is_same; used for comparing +/// equality of types. 
Don't assume the existence of C++11 +template <typename T, typename U> +struct SameType { + static const bool same = false; +}; + +template <typename T> +struct SameType<T, T> { + static const bool same = true; +}; + +template <typename T, typename U> +bool isSameType() { + return SameType<T, U>::same; +} + +template <typename TensorType> +struct TensorUtils { +}; + +#define TENSOR_UTILS(TENSOR_TYPE, DATA_TYPE) \ + template <> \ + struct TensorUtils<TENSOR_TYPE> { \ + typedef DATA_TYPE DataType; \ + \ + static TENSOR_TYPE* newTensor(THCState* state); \ + static TENSOR_TYPE* newContiguous(THCState* state, TENSOR_TYPE* t); \ + static void retain(THCState* state, TENSOR_TYPE* t); \ + static void free(THCState* state, TENSOR_TYPE* t); \ + static void freeCopyTo(THCState* state, TENSOR_TYPE* src, \ + TENSOR_TYPE* dst); \ + static void resizeAs(THCState* state, TENSOR_TYPE* dst, \ + TENSOR_TYPE* src); \ + static DATA_TYPE* getData(THCState* state, TENSOR_TYPE* t); \ + static long getNumElements(THCState* state, TENSOR_TYPE* t); \ + static long getSize(THCState* state, TENSOR_TYPE* t, int dim); \ + static long getStride(THCState* state, TENSOR_TYPE* t, int dim); \ + static int getDims(THCState* state, TENSOR_TYPE* t); \ + static bool isContiguous(THCState* state, TENSOR_TYPE* t); \ + static int getDevice(THCState* state, TENSOR_TYPE* t); \ + static void copyIgnoringOverlaps(THCState* state, \ + TENSOR_TYPE* dst, TENSOR_TYPE* src); \ + /* Determines if the given tensor has overlapping data points (i.e., */ \ + /* is there more than one index into the tensor that references */ \ + /* the same piece of data)? */ \ + static bool overlappingIndices(THCState* state, TENSOR_TYPE* t); \ + /* Can we use 32 bit math for indexing? */ \ + static bool canUse32BitIndexMath(THCState* state, TENSOR_TYPE* t); \ + } + +TENSOR_UTILS(THCudaByteTensor, unsigned char); +TENSOR_UTILS(THCudaCharTensor, char); +TENSOR_UTILS(THCudaShortTensor, short); +TENSOR_UTILS(THCudaIntTensor, int); +TENSOR_UTILS(THCudaLongTensor, long); +TENSOR_UTILS(THCudaTensor, float); +TENSOR_UTILS(THCudaDoubleTensor, double); + +#ifdef CUDA_HALF_TENSOR +TENSOR_UTILS(THCudaHalfTensor, half); +#endif + +#undef TENSOR_UTILS + +template <typename TensorType, typename IndexType> +TensorInfo<typename TensorUtils<TensorType>::DataType, IndexType> +getTensorInfo(THCState* state, TensorType* t) { + IndexType sz[MAX_CUTORCH_DIMS]; + IndexType st[MAX_CUTORCH_DIMS]; + + int dims = TensorUtils<TensorType>::getDims(state, t); + for (int i = 0; i < dims; ++i) { + sz[i] = TensorUtils<TensorType>::getSize(state, t, i); + st[i] = TensorUtils<TensorType>::getStride(state, t, i); + } + + return TensorInfo<typename TensorUtils<TensorType>::DataType, IndexType>( + TensorUtils<TensorType>::getData(state, t), dims, sz, st); +} + +/// `half` has some type conversion issues associated with it, since it +/// is a struct without a constructor/implicit conversion constructor. +/// We use this to convert scalar values to the given type that the +/// tensor expects. 
+template <typename In, typename Out> +struct ScalarConvert { + static __host__ __device__ Out to(const In v) { return (Out) v; } +}; + +template <typename T> +struct ScalarNegate { + static __host__ __device__ T to(const T v) { return -v; } +}; + +template <typename T> +struct ScalarInv { + static __host__ __device__ T to(const T v) { return ((T) 1) / v; } +}; + +#ifdef CUDA_HALF_TENSOR + +template <typename Out> +struct ScalarConvert<half, Out> { + static __host__ __device__ Out to(const half v) { +#ifdef __CUDA_ARCH__ + return (Out) __half2float(v); +#else + return (Out) THC_half2float(v); +#endif + } +}; + +template <typename In> +struct ScalarConvert<In, half> { + static __host__ __device__ half to(const In v) { +#ifdef __CUDA_ARCH__ + return __float2half((float) v); +#else + return THC_float2half((float) v); +#endif + } +}; + +template <> +struct ScalarConvert<half, half> { + static __host__ __device__ half to(const half v) { + return v; + } +}; + +template <> +struct ScalarNegate<half> { + static __host__ __device__ half to(const half v) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hneg(v); +#else + return __float2half(-__half2float(v)); +#endif +#else + half out = v; + out.x ^= 0x8000; // toggle sign bit + return out; +#endif + } +}; + +template <> +struct ScalarInv<half> { + static __host__ __device__ half to(const half v) { +#ifdef __CUDA_ARCH__ + return __float2half(1.0f / __half2float(v)); +#else + float fv = THC_half2float(v); + fv = 1.0f / fv; + return THC_float2half(fv); +#endif + } +}; + +inline bool operator==(half a, half b) { + return a.x == b.x; +} + +inline bool operator!=(half a, half b) { + return a.x != b.x; +} + +#endif // CUDA_HALF_TENSOR + +#endif // THC_TENSOR_TYPE_UTILS_INC diff --git a/lib/THC/generic/THCStorage.c b/lib/THC/generic/THCStorage.c index d8fec77..61ba125 100644 --- a/lib/THC/generic/THCStorage.c +++ b/lib/THC/generic/THCStorage.c @@ -17,29 +17,18 @@ int THCStorage_(elementSize)(THCState *state) return sizeof(real); } -void THCStorage_(set)(THCState *state, THCStorage *self, long index, hostreal _value) +void THCStorage_(set)(THCState *state, THCStorage *self, long index, real value) { THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); - real value = hostrealToReal(_value); THCudaCheck(cudaMemcpy(self->data + index, &value, sizeof(real), cudaMemcpyHostToDevice)); } -hostreal THCStorage_(get)(THCState *state, const THCStorage *self, long index) +real THCStorage_(get)(THCState *state, const THCStorage *self, long index) { THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); -#ifndef THC_REAL_IS_HALF real value; THCudaCheck(cudaMemcpy(&value, self->data + index, sizeof(real), cudaMemcpyDeviceToHost)); - return realToHostreal(value); -#else - float *ret_d; - float ret; - THCudaCheck(THCudaMalloc(state, (void**)&ret_d, sizeof(float))); - THCHalf2Float(state, ret_d, self->data + index, 1); - THCudaCheck(cudaMemcpy(&ret, ret_d, sizeof(float), cudaMemcpyDeviceToHost)); - THCudaFree(state, ret_d); - return ret; -#endif + return value; } THCStorage* THCStorage_(new)(THCState *state) @@ -80,14 +69,14 @@ THCStorage* THCStorage_(newWithSize)(THCState *state, long size) } } -THCStorage* THCStorage_(newWithSize1)(THCState *state, hostreal data0) +THCStorage* THCStorage_(newWithSize1)(THCState *state, real data0) { THCStorage *self = THCStorage_(newWithSize)(state, 1); THCStorage_(set)(state, self, 0, data0); return self; } -THCStorage* THCStorage_(newWithSize2)(THCState *state, hostreal data0, 
hostreal data1) +THCStorage* THCStorage_(newWithSize2)(THCState *state, real data0, real data1) { THCStorage *self = THCStorage_(newWithSize)(state, 2); THCStorage_(set)(state, self, 0, data0); @@ -95,7 +84,7 @@ THCStorage* THCStorage_(newWithSize2)(THCState *state, hostreal data0, hostreal return self; } -THCStorage* THCStorage_(newWithSize3)(THCState *state, hostreal data0, hostreal data1, hostreal data2) +THCStorage* THCStorage_(newWithSize3)(THCState *state, real data0, real data1, real data2) { THCStorage *self = THCStorage_(newWithSize)(state, 3); THCStorage_(set)(state, self, 0, data0); @@ -104,7 +93,7 @@ THCStorage* THCStorage_(newWithSize3)(THCState *state, hostreal data0, hostreal return self; } -THCStorage* THCStorage_(newWithSize4)(THCState *state, hostreal data0, hostreal data1, hostreal data2, hostreal data3) +THCStorage* THCStorage_(newWithSize4)(THCState *state, real data0, real data1, real data2, real data3) { THCStorage *self = THCStorage_(newWithSize)(state, 4); THCStorage_(set)(state, self, 0, data0); diff --git a/lib/THC/generic/THCStorage.cu b/lib/THC/generic/THCStorage.cu index ff0d3c9..17924f5 100644 --- a/lib/THC/generic/THCStorage.cu +++ b/lib/THC/generic/THCStorage.cu @@ -2,10 +2,9 @@ #define THC_GENERIC_FILE "generic/THCStorage.cu" #else -void THCStorage_(fill)(THCState *state, THCStorage *self, hostreal _value) +void THCStorage_(fill)(THCState *state, THCStorage *self, real value) { thrust::device_ptr<real> self_data(self->data); - real value = hostrealToReal(_value); thrust::fill( #if CUDA_VERSION >= 7000 thrust::cuda::par.on(THCState_getCurrentStream(state)), diff --git a/lib/THC/generic/THCStorage.h b/lib/THC/generic/THCStorage.h index f161d5f..a8c5f5f 100644 --- a/lib/THC/generic/THCStorage.h +++ b/lib/THC/generic/THCStorage.h @@ -23,15 +23,15 @@ THC_API long THCStorage_(size)(THCState *state, const THCStorage*); THC_API int THCStorage_(elementSize)(THCState *state); /* slow access -- checks everything */ -THC_API void THCStorage_(set)(THCState *state, THCStorage*, long, hostreal); -THC_API hostreal THCStorage_(get)(THCState *state, const THCStorage*, long); +THC_API void THCStorage_(set)(THCState *state, THCStorage*, long, real); +THC_API real THCStorage_(get)(THCState *state, const THCStorage*, long); THC_API THCStorage* THCStorage_(new)(THCState *state); THC_API THCStorage* THCStorage_(newWithSize)(THCState *state, long size); -THC_API THCStorage* THCStorage_(newWithSize1)(THCState *state, hostreal); -THC_API THCStorage* THCStorage_(newWithSize2)(THCState *state, hostreal, hostreal); -THC_API THCStorage* THCStorage_(newWithSize3)(THCState *state, hostreal, hostreal, hostreal); -THC_API THCStorage* THCStorage_(newWithSize4)(THCState *state, hostreal, hostreal, hostreal, hostreal); +THC_API THCStorage* THCStorage_(newWithSize1)(THCState *state, real); +THC_API THCStorage* THCStorage_(newWithSize2)(THCState *state, real, real); +THC_API THCStorage* THCStorage_(newWithSize3)(THCState *state, real, real, real); +THC_API THCStorage* THCStorage_(newWithSize4)(THCState *state, real, real, real, real); THC_API THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *filename, long size, int shared); /* takes ownership of data */ @@ -49,6 +49,6 @@ THC_API void THCStorage_(retain)(THCState *state, THCStorage *storage); THC_API void THCStorage_(free)(THCState *state, THCStorage *storage); THC_API void THCStorage_(resize)(THCState *state, THCStorage *storage, long size); -THC_API void THCStorage_(fill)(THCState *state, THCStorage *storage, hostreal 
value); +THC_API void THCStorage_(fill)(THCState *state, THCStorage *storage, real value); #endif diff --git a/lib/THC/generic/THCStorageCopy.cu b/lib/THC/generic/THCStorageCopy.cu index b00c122..298f717 100644 --- a/lib/THC/generic/THCStorageCopy.cu +++ b/lib/THC/generic/THCStorageCopy.cu @@ -62,7 +62,7 @@ THC_CUDA_STORAGE_IMPLEMENT_COPY(Long,Long) THC_CUDA_STORAGE_IMPLEMENT_COPY(Float,) // i.e. float THC_CUDA_STORAGE_IMPLEMENT_COPY(Double,Double) -#if CUDA_VERSION >= 7050 +#ifdef CUDA_HALF_TENSOR #define FLOAT_COPY(TYPE) TH_CONCAT_3(TH, CReal, Storage_copyCudaFloat) void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *self, struct THCudaHalfStorage *src) { @@ -77,7 +77,7 @@ void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *self, struct THCudaH } } #undef FLOAT_COPY -#endif // CUDA_VERSION >= 7050 +#endif // CUDA_HALF_TENSOR #undef THC_CUDA_STORAGE_IMPLEMENT_COPY diff --git a/lib/THC/generic/THCStorageCopy.h b/lib/THC/generic/THCStorageCopy.h index 3313290..c3e5601 100644 --- a/lib/THC/generic/THCStorageCopy.h +++ b/lib/THC/generic/THCStorageCopy.h @@ -21,7 +21,7 @@ THC_API void THCStorage_(copyCudaInt)(THCState *state, THCStorage *storage, stru THC_API void THCStorage_(copyCudaLong)(THCState *state, THCStorage *storage, struct THCudaLongStorage *src); THC_API void THCStorage_(copyCudaFloat)(THCState *state, THCStorage *storage, struct THCudaStorage *src); THC_API void THCStorage_(copyCudaDouble)(THCState *state, THCStorage *storage, struct THCudaDoubleStorage *src); -#if CUDA_VERSION >= 7050 +#ifdef CUDA_HALF_TENSOR THC_API void THCStorage_(copyCudaHalf)(THCState *state, THCStorage *storage, struct THCudaHalfStorage *src); #endif diff --git a/lib/THC/generic/THCTensor.c b/lib/THC/generic/THCTensor.c index 2f87f1a..e18044d 100644 --- a/lib/THC/generic/THCTensor.c +++ b/lib/THC/generic/THCTensor.c @@ -730,56 +730,56 @@ void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, lon self->nDimension = 0; } -void THCTensor_(set1d)(THCState *state, THCTensor *tensor, long x0, hostreal value) +void THCTensor_(set1d)(THCState *state, THCTensor *tensor, long x0, real value) { THArgCheck(tensor->nDimension == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value); } -hostreal THCTensor_(get1d)(THCState *state, const THCTensor *tensor, long x0) +real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, long x0) { THArgCheck(tensor->nDimension == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]); } -void THCTensor_(set2d)(THCState *state, THCTensor *tensor, long x0, long x1, hostreal value) +void THCTensor_(set2d)(THCState *state, THCTensor *tensor, long x0, long x1, real value) { THArgCheck(tensor->nDimension == 2, 1, "tensor must have two dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value); } -hostreal THCTensor_(get2d)(THCState *state, const THCTensor *tensor, long x0, long x1) +real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, long x0, long x1) { THArgCheck(tensor->nDimension == 2, 1, "tensor must have two dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) 
&& (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]); } -void THCTensor_(set3d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, hostreal value) +void THCTensor_(set3d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, real value) { THArgCheck(tensor->nDimension == 3, 1, "tensor must have three dimensions"); THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value); } -hostreal THCTensor_(get3d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2) +real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2) { THArgCheck(tensor->nDimension == 3, 1, "tensor must have three dimensions"); THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); return THCStorage_(get)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]); } -void THCTensor_(set4d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, long x3, hostreal value) +void THCTensor_(set4d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, long x3, real value) { THArgCheck(tensor->nDimension == 4, 1, "tensor must have four dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); THCStorage_(set)(state, tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value); } -hostreal THCTensor_(get4d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2, long x3) +real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2, long x3) { THArgCheck(tensor->nDimension == 4, 1, "tensor must have four dimensions"); THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); diff --git a/lib/THC/generic/THCTensor.h b/lib/THC/generic/THCTensor.h index 8e4d1a4..175eaee 100644 --- a/lib/THC/generic/THCTensor.h +++ b/lib/THC/generic/THCTensor.h @@ -112,15 +112,15 @@ THC_API void THCTensor_(free)(THCState *state, THCTensor *self); THC_API void THCTensor_(freeCopyTo)(THCState *state, THCTensor *self, THCTensor *dst); /* Slow access methods [check everything] */ -THC_API void THCTensor_(set1d)(THCState *state, THCTensor *tensor, long x0, hostreal value); -THC_API void THCTensor_(set2d)(THCState *state, THCTensor *tensor, long x0, long x1, hostreal value); -THC_API void THCTensor_(set3d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, hostreal value); -THC_API void THCTensor_(set4d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, long x3, hostreal value); - -THC_API hostreal THCTensor_(get1d)(THCState *state, const THCTensor *tensor, long x0); -THC_API hostreal THCTensor_(get2d)(THCState *state, const THCTensor *tensor, long x0, long x1); -THC_API hostreal THCTensor_(get3d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2); -THC_API hostreal 
THCTensor_(get4d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2, long x3); +THC_API void THCTensor_(set1d)(THCState *state, THCTensor *tensor, long x0, real value); +THC_API void THCTensor_(set2d)(THCState *state, THCTensor *tensor, long x0, long x1, real value); +THC_API void THCTensor_(set3d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, real value); +THC_API void THCTensor_(set4d)(THCState *state, THCTensor *tensor, long x0, long x1, long x2, long x3, real value); + +THC_API real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, long x0); +THC_API real THCTensor_(get2d)(THCState *state, const THCTensor *tensor, long x0, long x1); +THC_API real THCTensor_(get3d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2); +THC_API real THCTensor_(get4d)(THCState *state, const THCTensor *tensor, long x0, long x1, long x2, long x3); /* CUDA-specific functions */ THC_API cudaTextureObject_t THCTensor_(getTextureObject)(THCState *state, THCTensor *self); diff --git a/lib/THC/generic/THCTensorCopy.c b/lib/THC/generic/THCTensorCopy.c index 68f57bf..e0bcadd 100644 --- a/lib/THC/generic/THCTensorCopy.c +++ b/lib/THC/generic/THCTensorCopy.c @@ -126,7 +126,6 @@ IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Long) IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Float) IMPLEMENT_TH_CUDA_TENSOR_COPY_TO(Double) -// FIXME: add within-CUDA conversions void THCTensor_(copyCuda)(THCState *state, THCTensor *self, THCTensor *src) { THCTensor_(copy)(state, self, src); diff --git a/lib/THC/generic/THCTensorCopy.cu b/lib/THC/generic/THCTensorCopy.cu index c5768e2..4198025 100644 --- a/lib/THC/generic/THCTensorCopy.cu +++ b/lib/THC/generic/THCTensorCopy.cu @@ -4,251 +4,44 @@ THC_API void THCTensor_(copy)(THCState* state, THCTensor* dst, THCTensor* src) { - long totalElements = THCTensor_(nElement)(state, dst); - - THArgCheck(totalElements == THCTensor_(nElement)(state, src), 2, - "sizes do not match"); - - if (THCTensor_(nDimension)(state, dst) == 0) { - // Zero-dim tensor; copy nothing - return; - } - - // We can memcpy the memory if: - // -both tensors are contiguous; or, - // -there is only one element to copy; or, - // -FIXME: if both tensors have matching size and stride arrays, and no - // holes within (in other words, there is some permutation that can be applied - // to the size/strides such that the resulting tensor is contiguous). - bool srcContig = THCTensor_(isContiguous)(state, src); - bool dstContig = THCTensor_(isContiguous)(state, dst); - bool memcpyEligible = (srcContig && dstContig) || (totalElements == 1); - - int srcDev = THCTensor_(getDevice)(state, src); - int dstDev = THCTensor_(getDevice)(state, dst); - int oldDev = curGPU(); - - // We always perform the copy on the source device, using the - // current stream on the source device. - // If the copy is on the default stream, then we fully synchronize - // both src and dst's default streams for completion of the - // copy. We have to explicitly do this for non-contig copies. - // This mimics the behavior of cross-device cudaMemcpyAsync on - // the default stream. - // If the copy is not on the default stream, then it is up to the - // user to add needed synchronization on the dst device, since the - // stream on the dst device that wishes to synchronize may not be - // the same index as the one on the src device. 
- int copyStreamIndex = - THCState_getCurrentStreamIndex(state); - cudaStream_t copyStream = - THCState_getDeviceStream(state, srcDev, copyStreamIndex); - - if (srcDev != dstDev && copyStreamIndex == 0) { - // This is a cross-device copy on the default stream. We perform a - // two-way barrier between both devices' default streams before - // the copy. This ensures that any write-after-write and - // write-after-read dependencies on the destination side are - // handled, so that no one is operating on the dst memory when - // we perform the copy. - // src waits on dst barrier (src already waits on src) - cudaEvent_t dstReady; - THCudaCheck(cudaSetDevice(dstDev)); - THCudaCheck(cudaEventCreateWithFlags(&dstReady, cudaEventDisableTiming)); - THCudaCheck(cudaEventRecord(dstReady, NULL)); - - THCudaCheck(cudaSetDevice(srcDev)); - THCudaCheck(cudaStreamWaitEvent(NULL, dstReady, 0)); - THCudaCheck(cudaEventDestroy(dstReady)); - } else if (srcDev != oldDev) { - THCudaCheck(cudaSetDevice(srcDev)); - } - - // We are now on srcDev - if (memcpyEligible) { - // Perform the copy - THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state, dst), - THCTensor_(data)(state, src), - totalElements * sizeof(real), - cudaMemcpyDeviceToDevice, - copyStream)); - } else { -#if defined(THC_REAL_IS_FLOAT) - // Non-contiguous copy - - // We avoid creating temporary memory copies if possible. - // If both src and dst are on the same device, or if they are on - // different devices and p2p access is enabled, perform the copy - // by a pointwise copy kernel. - // Otherwise, we'll have to make contiguous (which will in fact - // invoke copy() again), and then perform the copy. - // FIXME: might want to consider only running the pointwise kernel - // if both src and dst innermost dimensions are contiguous. If - // they are not, then taking the hit of the memory allocation/free - // might be worth it to avoid non-coalesced reads or writes. - - // A device always has access to itself, so this also handles the - // case srcDev == dstDev - if (THCState_getPeerToPeerAccess(state, srcDev, dstDev)) { - // Make sure we have the current stream set in THCState, since - // pointwise uses that - if (srcDev != oldDev) { - THCState_setStream(state, srcDev, copyStreamIndex); - } - - bool succ = - THCudaTensor_pointwiseApply2(state, dst, src, CopyOp<float>()); - THArgCheck(succ, 2, CUTORCH_DIM_WARNING); - - // Restore prior THCState stream - if (srcDev != oldDev) { - THCState_setStream(state, oldDev, copyStreamIndex); - } - } else { - // GPUs can't access each other directly; fall back to - // newContiguous and memcpy - THCudaTensor* srcContig = THCudaTensor_newContiguous(state, src); - THCudaTensor* dstContig = dst; - - if (!THCudaTensor_isContiguous(state, dst)) { - // We are copying over the contents of dst, so we don't need - // to preserve its values. We just need a destination tensor - // the same size as dst. 
- - // Allocate the tensor on the new device - THCudaCheck(cudaSetDevice(dstDev)); - - dstContig = THCudaTensor_new(state); - THCudaTensor_resizeAs(state, dstContig, dst); - - THCudaCheck(cudaSetDevice(srcDev)); - } - - THCudaCheck(cudaMemcpyAsync(THCudaTensor_data(state, dstContig), - THCudaTensor_data(state, srcContig), - totalElements * sizeof(float), - cudaMemcpyDeviceToDevice, - copyStream)); - - THCudaTensor_free(state, srcContig); - - if (dst != dstContig) { - THCudaTensor_freeCopyTo(state, dstContig, dst); - } - } -#else -#define STRINGIFY(x) #x - THError("Non-contiguous copy not implemented for Cuda%sTensor", STRINGIFY(Real)); -#undef STRINGIFY -#endif - } - - if (srcDev != dstDev && copyStreamIndex == 0) { - // dst waits on src barrier (dst already waits on dst). We cannot - // operate on dst's copy until the copy is complete. - - // Still on srcDev, record default stream event - cudaEvent_t srcReady; - THCudaCheck(cudaEventCreateWithFlags(&srcReady, cudaEventDisableTiming)); - THCudaCheck(cudaEventRecord(srcReady, NULL)); - - THCudaCheck(cudaSetDevice(dstDev)); - THCudaCheck(cudaStreamWaitEvent(NULL, srcReady, 0)); - THCudaCheck(cudaEventDestroy(srcReady)); - - // We are now on dstDev (right above). Restore prior device from dst - if (dstDev != oldDev) { - THCudaCheck(cudaSetDevice(oldDev)); - } - } else { - // We are still on srcDev. Restore prior device from src - if (srcDev != oldDev) { - THCudaCheck(cudaSetDevice(oldDev)); - } - } + THC_copyTensor<THCTensor, THCTensor>(state, dst, src); +} - cudaError errcode = cudaGetLastError(); - if (errcode != cudaSuccess) { - THError(cudaGetErrorString(errcode)); - } +THC_API void +THCTensor_(copyIgnoringOverlaps)(THCState* state, THCTensor* dst, THCTensor* src) { + // Called when we are copying into an overlapping index `dst`, but + // we don't care which writer wins. Hacky but it works. + // This is itself invoked by pointwiseApply2 / THCTensor_copy in + // case that there are write overlaps. 
+ // FIXME: really, overlapping writes should be illegal/an error in Torch + THC_pointwiseApply2( + state, dst, src, + CopyOp<typename TensorUtils<THCTensor>::DataType, + typename TensorUtils<THCTensor>::DataType>(), + ReadOnly, /* ignore overwrites */ + ReadOnly); } -// conversions are mediated by the CPU -// yes, this is slow; feel free to write CUDA kernels for this -#ifndef THC_REAL_IS_HALF -#define THC_CUDA_TENSOR_IMPLEMENT_COPY(TYPEC,TYPECUDA) \ - void THCTensor_(copyCuda##TYPEC)(THCState *state, THCTensor *self, struct THCuda##TYPECUDA##Tensor *src) \ - { \ - if(THCTypeIdx_(Real) == THCTypeIdx_(TYPEC)) { \ - THCTensor_(copy)(state, self, (THCTensor*) src); /* cast just removes compiler warning */ \ - } else { \ - THArgCheck(THCTensor_(nElement)(state, self) == THCuda##TYPECUDA##Tensor_nElement(state, src), 2, "size does not match"); \ - THLongStorage *size = THCuda##TYPECUDA##Tensor_newSizeOf(state, src); \ - TH##TYPEC##Tensor *buffer1 = TH##TYPEC##Tensor_newWithSize(size, NULL); \ - THTensor *buffer2 = THTensor_(newWithSize)(size, NULL); \ - TH##TYPEC##Tensor_copyCuda(state, buffer1, src); \ - THTensor_(copy##TYPEC)(buffer2, buffer1); \ - THCTensor_(copyCPU)(state, self, buffer2); \ - THLongStorage_free(size); \ - TH##TYPEC##Tensor_free(buffer1); \ - THTensor_(free)(buffer2); \ - } \ - } -#else -#define THC_CUDA_TENSOR_IMPLEMENT_COPY(TYPEC,TYPECUDA) \ - void THCTensor_(copyCuda##TYPEC)(THCState *state, THCTensor *self, struct THCuda##TYPECUDA##Tensor *src) \ - { \ - THArgCheck(THCTensor_(nElement)(state, self) == THCuda##TYPECUDA##Tensor_nElement(state, src), 2, "size does not match"); \ - if (THCTypeIdx_(TYPEC) == THCTypeIdxFloat) { \ - THCudaTensor *csrc = THCudaTensor_newContiguous(state, (THCudaTensor*) src); /* cast removes compiler error */ \ - THCFloat2Half(state, \ - THCTensor_(data)(state, self), \ - THCudaTensor_data(state, csrc), \ - THCudaTensor_nElement(state, csrc)); \ - THCudaTensor_free(state, csrc); \ - } else { \ - THLongStorage *size = THCuda##TYPECUDA##Tensor_newSizeOf(state, src); \ - THCudaTensor *buffer = THCudaTensor_newWithSize(state, size, NULL); \ - THCudaTensor_copyCuda##TYPEC(state, buffer, src); \ - THCFloat2Half(state, \ - THCTensor_(data)(state, self), \ - THCudaTensor_data(state, buffer), \ - THCudaTensor_nElement(state, buffer)); \ - THCudaTensor_free(state, buffer); \ - THLongStorage_free(size); \ - } \ +#define IMPLEMENT_THC_CUDA_TENSOR_COPY(TYPEC, TYPECUDA) \ + THC_API void \ + THCTensor_(copyCuda##TYPEC)(THCState *state, \ + THCTensor *self, \ + THCuda##TYPECUDA##Tensor *src) { \ + THC_copyTensor<THCTensor, THCuda##TYPECUDA##Tensor>(state, self, src); \ } -#endif - -THC_CUDA_TENSOR_IMPLEMENT_COPY(Byte,Byte) -THC_CUDA_TENSOR_IMPLEMENT_COPY(Char,Char) -THC_CUDA_TENSOR_IMPLEMENT_COPY(Short,Short) -THC_CUDA_TENSOR_IMPLEMENT_COPY(Int,Int) -THC_CUDA_TENSOR_IMPLEMENT_COPY(Long,Long) -THC_CUDA_TENSOR_IMPLEMENT_COPY(Float,) // i.e. 
float -THC_CUDA_TENSOR_IMPLEMENT_COPY(Double,Double) -#if CUDA_VERSION >= 7050 -#define FLOAT_COPY(TYPE) TH_CONCAT_3(TH, CReal, Tensor_copyCudaFloat) -void THCTensor_(copyCudaHalf)(THCState *state, THCTensor *self, struct THCudaHalfTensor *src) -{ - if(THCTypeIdx_(Real) == THCTypeIdxHalf) { - THCTensor_(copy)(state, self, (THCTensor*) src); /* cast removes compiler error */ - } else { - THArgCheck(THCTensor_(nElement)(state, self) == THCudaHalfTensor_nElement(state, src), 2, "size does not match"); - src = THCudaHalfTensor_newContiguous(state, src); - THLongStorage *size = THCudaHalfTensor_newSizeOf(state, src); - THCudaTensor *buffer = THCudaTensor_newWithSize(state, size, NULL); - THCHalf2Float(state, THCudaTensor_data(state, buffer), THCudaHalfTensor_data(state, src), THCudaHalfTensor_nElement(state, src)); - FLOAT_COPY(Real)(state, self, buffer); - THCudaTensor_free(state, buffer); - THCudaHalfTensor_free(state, src); - THLongStorage_free(size); - } -} -#undef FLOAT_COPY -#endif // CUDA_VERSION >= 7050 +IMPLEMENT_THC_CUDA_TENSOR_COPY(Byte, Byte) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Char, Char) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Short, Short) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Int, Int) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Long, Long) +// THCudaTensor aka the non-existent THCudaFloatTensor +IMPLEMENT_THC_CUDA_TENSOR_COPY(Float, ) +IMPLEMENT_THC_CUDA_TENSOR_COPY(Double, Double) +#ifdef CUDA_HALF_TENSOR +IMPLEMENT_THC_CUDA_TENSOR_COPY(Half, Half) +#endif -#undef THC_CUDA_TENSOR_IMPLEMENT_COPY +#undef IMPLEMENT_THC_CUDA_TENSOR_COPY #endif diff --git a/lib/THC/generic/THCTensorCopy.h b/lib/THC/generic/THCTensorCopy.h index 8a2837f..3c7649b 100644 --- a/lib/THC/generic/THCTensorCopy.h +++ b/lib/THC/generic/THCTensorCopy.h @@ -3,6 +3,7 @@ #else THC_API void THCTensor_(copy)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(copyIgnoringOverlaps)(THCState *state, THCTensor *self, THCTensor *src); THC_API void THCTensor_(copyByte)(THCState *state, THCTensor *self, THByteTensor *src); THC_API void THCTensor_(copyChar)(THCState *state, THCTensor *self, THCharTensor *src); THC_API void THCTensor_(copyShort)(THCState *state, THCTensor *self, THShortTensor *src); @@ -18,7 +19,7 @@ THC_API void THCTensor_(copyCudaInt)(THCState *state, THCTensor *storage, struct THC_API void THCTensor_(copyCudaLong)(THCState *state, THCTensor *storage, struct THCudaLongTensor *src); THC_API void THCTensor_(copyCudaFloat)(THCState *state, THCTensor *storage, struct THCudaTensor *src); THC_API void THCTensor_(copyCudaDouble)(THCState *state, THCTensor *storage, struct THCudaDoubleTensor *src); -#if CUDA_VERSION >= 7050 +#ifdef CUDA_HALF_TENSOR THC_API void THCTensor_(copyCudaHalf)(THCState *state, THCTensor *storage, struct THCudaHalfTensor *src); #endif diff --git a/lib/THC/generic/THCTensorMath.cu b/lib/THC/generic/THCTensorMath.cu new file mode 100644 index 0000000..a0e550a --- /dev/null +++ b/lib/THC/generic/THCTensorMath.cu @@ -0,0 +1,68 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMath.cu" +#else + +THC_API void +THCTensor_(fill)(THCState* state, THCTensor *self_, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 1, self_)); + + if (!THC_pointwiseApply1( + state, self_, TensorFillOp<real>(value))) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(zero)(THCState *state, THCTensor *self_) +{ + THAssert(THCTensor_(checkGPU)(state, 1, self_)); + if (THCTensor_(isContiguous)(state, self_)) { + 
THCudaCheck(cudaMemsetAsync(THCTensor_(data)(state, self_), + 0, + sizeof(real) * THCTensor_(nElement)(state, self_), + THCState_getCurrentStream(state))); + } else { + if (!THC_pointwiseApply1( + state, self_, + TensorFillOp<real>(ScalarConvert<int, real>::to(0)))) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(zeros)(THCState *state, THCTensor *r_, THLongStorage *size) +{ + THAssert(THCTensor_(checkGPU)(state, 1, r_)); + THCTensor_(resize)(state, r_, size, NULL); + THCTensor_(zero)(state, r_); +} + +THC_API void +THCTensor_(ones)(THCState *state, THCTensor *r_, THLongStorage *size) +{ + THAssert(THCTensor_(checkGPU)(state, 1, r_)); + THCTensor_(resize)(state, r_, size, NULL); + THCTensor_(fill)(state, r_, ScalarConvert<int, real>::to(1)); +} + +THC_API void +THCTensor_(reshape)(THCState *state, THCTensor *r_, THCTensor *t, THLongStorage *size) +{ + THAssert(THCTensor_(checkGPU)(state, 2, r_, t)); + THCTensor_(resize)(state, r_, size, NULL); + THCTensor_(copy)(state, r_, t); +} + +long +THCTensor_(numel)(THCState *state, THCTensor *t) +{ + return THCTensor_(nElement)(state, t); +} + +#endif diff --git a/lib/THC/generic/THCTensorMath.h b/lib/THC/generic/THCTensorMath.h new file mode 100644 index 0000000..5c9e66d --- /dev/null +++ b/lib/THC/generic/THCTensorMath.h @@ -0,0 +1,13 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMath.h" +#else + +THC_API void THCTensor_(fill)(THCState *state, THCTensor *self, real value); +THC_API void THCTensor_(zero)(THCState *state, THCTensor *self); + +THC_API void THCTensor_(zeros)(THCState *state, THCTensor *r_, THLongStorage *size); +THC_API void THCTensor_(ones)(THCState *state, THCTensor *r_, THLongStorage *size); +THC_API void THCTensor_(reshape)(THCState *state, THCTensor *r_, THCTensor *t, THLongStorage *size); +THC_API long THCTensor_(numel)(THCState *state, THCTensor *t); + +#endif diff --git a/lib/THC/generic/THCTensorMathPairwise.cu b/lib/THC/generic/THCTensorMathPairwise.cu new file mode 100644 index 0000000..4a5c09d --- /dev/null +++ b/lib/THC/generic/THCTensorMathPairwise.cu @@ -0,0 +1,74 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPairwise.cu" +#else + +THC_API void +THCTensor_(add)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorAddConstantOp<real>(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorAddConstantOp<real>(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(sub)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THCTensor_(add)(state, self_, src_, ScalarNegate<real>::to(value)); +} + +THC_API void +THCTensor_(mul)(THCState *state, THCTensor *self_, THCTensor *src_, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src_)); + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, TensorMulConstantOp<real>(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, TensorMulConstantOp<real>(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void 
+THCTensor_(div)(THCState* state, THCTensor *self_, THCTensor *src_, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src_)); + THArgCheck(value != ScalarConvert<int, real>::to(0), 3, "divide by zero"); + + if (self_ == src_) { + if (!THC_pointwiseApply1(state, self_, + TensorMulConstantOp<real>( + ScalarInv<real>::to(value)))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src_); + + if (!THC_pointwiseApply2(state, self_, src_, + TensorMulConstantOp<real>( + ScalarInv<real>::to(value)))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/lib/THC/generic/THCTensorMathPairwise.h b/lib/THC/generic/THCTensorMathPairwise.h new file mode 100644 index 0000000..9a83293 --- /dev/null +++ b/lib/THC/generic/THCTensorMathPairwise.h @@ -0,0 +1,10 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPairwise.h" +#else + +THC_API void THCTensor_(add)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(sub)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(mul)(THCState *state, THCTensor *self, THCTensor *src, real value); +THC_API void THCTensor_(div)(THCState *state, THCTensor *self, THCTensor *src, real value); + +#endif diff --git a/lib/THC/generic/THCTensorMathPointwise.cu b/lib/THC/generic/THCTensorMathPointwise.cu new file mode 100644 index 0000000..b6679cd --- /dev/null +++ b/lib/THC/generic/THCTensorMathPointwise.cu @@ -0,0 +1,157 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPointwise.cu" +#else + +THC_API void +THCTensor_(cadd)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2) +{ + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + if (value == ScalarConvert<int, real>::to(1)) { + // self += src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorAddOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self += value * src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorCAddOp<real>(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + if (value == ScalarConvert<int, real>::to(1)) { + // self = src1 + src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorAddOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self = src1 + value * src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorCAddOp<real>(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(csub)(THCState *state, THCTensor *self_, THCTensor* src1, real value, THCTensor *src2) +{ + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + if (value == ScalarConvert<int, real>::to(1)) { + // self -= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorSubOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self += -value * src2 + if (!THC_pointwiseApply2(state, self_, src2, + TensorCAddOp<real>( + ScalarNegate<real>::to(value)))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } else { + 
THCTensor_(resizeAs)(state, self_, src1); + + if (value == ScalarConvert<int, real>::to(1)) { + // self = src1 - src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorSubOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + // self = src1 - value * src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, + TensorCAddOp<real>( + ScalarNegate<real>::to(value)))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self *= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorMulOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 * src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorMulOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(cpow)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self = pow(self, src2) + if (!THC_pointwiseApply2(state, self_, src2, TensorCPowOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = pow(src1, src2) + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorCPowOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(cdiv)(THCState* state, THCTensor *self_, THCTensor *src1, THCTensor *src2) +{ + THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2)); + THArgCheck(THCTensor_(nElement)(state, src1) == + THCTensor_(nElement)(state, src2), 3, "sizes do not match"); + + if (self_ == src1) { + // self /= src2 + if (!THC_pointwiseApply2(state, self_, src2, TensorDivOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } else { + THCTensor_(resizeAs)(state, self_, src1); + + // self = src1 / src2 + if (!THC_pointwiseApply3(state, self_, src1, src2, TensorDivOp<real>())) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + } + + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/lib/THC/generic/THCTensorMathPointwise.h b/lib/THC/generic/THCTensorMathPointwise.h new file mode 100644 index 0000000..cfb3b14 --- /dev/null +++ b/lib/THC/generic/THCTensorMathPointwise.h @@ -0,0 +1,11 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathPointwise.h" +#else + +THC_API void THCTensor_(cadd)(THCState *state, THCTensor *self, THCTensor *src1, real value, THCTensor *src2); +THC_API void THCTensor_(csub)(THCState *state, THCTensor *self, THCTensor *src1, real value, THCTensor *src2); +THC_API void THCTensor_(cmul)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cpow)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); +THC_API void THCTensor_(cdiv)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); + +#endif
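The new generic math files above drive every element-wise operation through THC_pointwiseApply1/2/3 plus a small device functor (TensorFillOp, TensorAddConstantOp, TensorCAddOp, TensorMulOp, and so on), templated on the element type `real` so that one generic .cu body serves every tensor type it is instantiated for. As a rough orientation only, the sketch below shows the kind of functor shape those calls appear to expect; the name AddConstantSketch and the exact qualifiers are illustrative assumptions, not the operators THC actually ships.

    // Illustrative sketch only -- not the THC definition.
    // A pointwise functor with an in-place overload (the form passed to
    // THC_pointwiseApply1) and an out-of-place overload (the form passed to
    // THC_pointwiseApply2), parameterized on the element type T.
    template <typename T>
    struct AddConstantSketch {
      explicit AddConstantSketch(T alpha) : alpha(alpha) {}

      __device__ __forceinline__ void operator()(T* v) const {
        *v += alpha;            // self += alpha (in place)
      }

      __device__ __forceinline__ void operator()(T* out, T* in) const {
        *out = *in + alpha;     // self = src + alpha
      }

      const T alpha;
    };

Because the functor carries the constant by value, host-side scalars are converted once up front (ScalarConvert / ScalarNegate / ScalarInv in the code above), which is presumably what lets the same generic THCTensor_(add)/(sub)/(mul)/(div) bodies also cover the half-precision type once CUDA_HALF_TENSOR is defined.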