Diffstat (limited to 'lib/THC/THCTensorMathReduce.cu')
-rw-r--r-- | lib/THC/THCTensorMathReduce.cu | 358
1 file changed, 358 insertions, 0 deletions
diff --git a/lib/THC/THCTensorMathReduce.cu b/lib/THC/THCTensorMathReduce.cu
new file mode 100644
index 0000000..27949ad
--- /dev/null
+++ b/lib/THC/THCTensorMathReduce.cu
@@ -0,0 +1,358 @@
+#include "THCTensorMath.h"
+#include "THCGeneral.h"
+#include "THCNumerics.cuh"
+#include "THCReduce.cuh"
+#include "THCReduceAll.cuh"
+#include <thrust/functional.h>
+
+// Reduction operators that support `half`, unlike Thrust
+template <typename InT, typename AccT>
+struct ReduceAdd {
+  inline __device__ AccT operator()(AccT a, InT b) const {
+    return a + (AccT) b;
+  }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct ReduceAdd<half, half> {
+  inline __device__ half operator()(half a, half b) const {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hadd(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return __float2half(fa + fb);
+#endif
+  }
+};
+
+template <>
+struct ReduceAdd<half, float> {
+  inline __device__ float operator()(float a, half b) const {
+    return a + __half2float(b);
+  }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename InT, typename AccT>
+struct ReduceMultiply {
+  inline __device__ AccT operator()(AccT a, InT b) const {
+    return a * (AccT) b;
+  }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct ReduceMultiply<half, half> {
+  inline __device__ half operator()(half a, half b) const {
+#ifdef CUDA_HALF_INSTRUCTIONS
+    return __hmul(a, b);
+#else
+    float fa = __half2float(a);
+    float fb = __half2float(b);
+    return __float2half(fa * fb);
+#endif
+  }
+};
+
+template <>
+struct ReduceMultiply<half, float> {
+  inline __device__ float operator()(float a, half b) const {
+    return a * __half2float(b);
+  }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct ReduceMin {
+  inline __device__ T operator()(T a, T b) const {
+    return THCNumerics<T>::lt(a, b) ? a : b;
+  }
+};
+
+template <typename T>
+struct ReduceMax {
+  inline __device__ T operator()(T a, T b) const {
+    return THCNumerics<T>::gt(a, b) ? a : b;
+  }
+};
+
+struct LogicalAll {
+  inline __device__ unsigned char operator()(unsigned char x,
+                                             unsigned char y) const {
+    return (x && y);
+  }
+};
+
+struct LogicalAny {
+  inline __device__ unsigned char operator()(unsigned char x,
+                                             unsigned char y) const {
+    return (x || y);
+  }
+};
+
+THC_API int
+THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self) {
+  THAssert(THCudaByteTensor_checkGPU(state, 1, self));
+  unsigned char result;
+  if (!THC_reduceAll(state, self,
+                     thrust::identity<unsigned char>(),
+                     LogicalAll(),
+                     LogicalAll(),
+                     (unsigned char) 1, &result, 0)) {
+    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+  }
+
+  return (int) result;
+}
+
+THC_API int
+THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self) {
+  THAssert(THCudaByteTensor_checkGPU(state, 1, self));
+  unsigned char result;
+  if (!THC_reduceAll(state, self,
+                     thrust::identity<unsigned char>(),
+                     LogicalAny(),
+                     LogicalAny(),
+                     (unsigned char) 0, &result, 0)) {
+    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+  }
+
+  return (int) result;
+}
+
+#include <thrust/functional.h>
+
+/* A set of reduction kernels that take in binary ops on thrust pairs (of value, index).
+   These are useful when you not only have to do a reduction, but also need to preserve
+   the location (index) at which the result occurs, as for min/max operations.
+   The structure of these kernels follows the structure of the reduction kernels.
+*/
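The fold these kernels parallelize is small enough to check on the host. The sketch below (not part of the diff) applies the same comparison as the MaxValuePair functor defined at the bottom of this file, using std::pair in place of thrust::pair; the -1e30f init is a stand-in for the type-dependent minimum the generic callers pass, and indices are 1-based to match the `col + 1` Lua convention in the kernels.

    // Minimal host-side sketch of the (value, index) fold; illustrative only.
    #include <cstdio>
    #include <utility>

    // Same comparison as MaxValuePair below, on std::pair instead of thrust::pair.
    template <typename T, typename Index>
    std::pair<T, Index> maxValuePair(const std::pair<T, Index>& a,
                                     const std::pair<T, Index>& b) {
      return a.first >= b.first ? a : b;
    }

    int main() {
      float row[5] = {0.3f, 1.7f, -2.0f, 0.5f, 0.9f};
      // Plays the role of the kernels' `init` argument.
      std::pair<float, int> acc(-1e30f, 0);
      for (int col = 0; col < 5; ++col) {
        // +1 for Lua indexing, mirroring make_pair(*src, col + 1) in the kernel.
        acc = maxValuePair(std::make_pair(row[col], col + 1), acc);
      }
      printf("max %.1f at index %d\n", acc.first, acc.second);  // max 1.7 at index 2
      return 0;
    }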
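Separately, note why ReduceAdd and ReduceMultiply above split the input type InT from the accumulator type AccT: it lets half inputs be accumulated in float, as the ReduceAdd<half, float> specialization does. Below is a host-side sketch of the effect, with float/double standing in for half/float since `half` arithmetic is not available on the host (ReduceAddSketch is an illustrative name, not THC code).

    #include <cstdio>

    // Same shape as the functors above: fold InT inputs into an AccT total.
    template <typename InT, typename AccT>
    struct ReduceAddSketch {
      AccT operator()(AccT a, InT b) const { return a + (AccT) b; }
    };

    int main() {
      ReduceAddSketch<float, float>  narrow;  // accumulate at input width
      ReduceAddSketch<float, double> wide;    // accumulate at a wider width
      float  accNarrow = 0.0f;
      double accWide   = 0.0;
      for (int i = 0; i < (1 << 24) + 64; ++i) {
        accNarrow = narrow(accNarrow, 1.0f);
        accWide   = wide(accWide, 1.0f);
      }
      // float cannot represent odd integers beyond 2^24, so the narrow sum
      // stalls at 16777216 while the wide sum stays exact (16777280).
      printf("narrow: %.0f  wide: %.0f\n", accNarrow, accWide);
      return 0;
    }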
+template <typename K, typename Index, class BinaryFunction>
+__global__ void
+kernelTransformReduceOuterDimIndex(K *tgt1,
+                                   Index *tgt2,
+                                   K *src_,
+                                   unsigned num_orows,
+                                   unsigned num_irows,
+                                   unsigned row_size,
+                                   thrust::pair<K, Index> init,
+                                   BinaryFunction binary_op) {
+  for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
+    for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x;
+         irow < num_irows;
+         irow += gridDim.y * blockDim.x) {
+      K *src = src_ + orow * row_size * num_irows + irow;
+      thrust::pair<K, Index> acc = init;
+
+      for (unsigned col = 0; col < row_size; ++col) {
+        // +1 for Lua index
+        acc = binary_op(thrust::make_pair<K, Index>(*src, col + 1),
+                        acc);
+        src += num_irows;
+      }
+
+      tgt1[orow * num_irows + irow] = acc.first;
+      tgt2[orow * num_irows + irow] = acc.second;
+    }
+  }
+}
+
+template <typename TensorTypeK,
+          typename TensorTypeIndex,
+          typename BinaryFunction>
+__host__ void
+THC_transformReduceOuterDimIndex(THCState *state,
+                                 TensorTypeK *tgt1,
+                                 TensorTypeIndex *tgt2,
+                                 TensorTypeK *src,
+                                 long rdim,
+                                 const thrust::pair<
+                                 typename TensorUtils<TensorTypeK>::DataType,
+                                 typename TensorUtils<TensorTypeIndex>::DataType>& init,
+                                 BinaryFunction binary_op) {
+  unsigned ndim = TensorUtils<TensorTypeK>::getDims(state, src);
+  unsigned num_orows = 1;
+  for (unsigned dim = 0; dim < rdim; dim++) {
+    num_orows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
+  }
+  unsigned row_size = TensorUtils<TensorTypeK>::getSize(state, src, rdim);
+  unsigned num_irows = 1;
+  for (unsigned dim = rdim + 1; dim < ndim; dim++) {
+    num_irows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
+  }
+
+  dim3 threads(min(512, num_irows));
+  unsigned maxGridDim = 1024;
+  dim3 grid(min(maxGridDim, num_orows),
+            min(maxGridDim, THCCeilDiv(num_irows, threads.x)));
+
+  kernelTransformReduceOuterDimIndex
+    <<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
+      TensorUtils<TensorTypeK>::getData(state, tgt1),
+      TensorUtils<TensorTypeIndex>::getData(state, tgt2),
+      TensorUtils<TensorTypeK>::getData(state, src),
+      num_orows, num_irows, row_size, init, binary_op);
+
+  THCudaCheck(cudaGetLastError());
+}
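THC_transformReduceOuterDimIndex collapses every dimension before rdim into num_orows and every dimension after it into num_irows, so the kernel reads row_size elements at stride num_irows starting from orow * row_size * num_irows + irow. A host-side sanity check of that arithmetic for one concrete shape (illustrative only, assuming a contiguous row-major tensor):

    #include <cassert>
    #include <cstdio>

    int main() {
      // A contiguous 4 x 3 x 5 tensor, reduced over dimension 1 (rdim = 1).
      const unsigned sizes[3] = {4, 3, 5};

      unsigned num_orows = sizes[0];  // product of dims before rdim
      unsigned row_size  = sizes[1];  // extent of the reduced dim
      unsigned num_irows = sizes[2];  // product of dims after rdim

      for (unsigned orow = 0; orow < num_orows; ++orow)
        for (unsigned col = 0; col < row_size; ++col)
          for (unsigned irow = 0; irow < num_irows; ++irow) {
            // Offset the kernel uses: row start plus col steps of stride num_irows...
            unsigned kernelOffset =
                orow * row_size * num_irows + irow + col * num_irows;
            // ...equals the row-major offset of element (orow, col, irow).
            unsigned rowMajor = (orow * sizes[1] + col) * sizes[2] + irow;
            assert(kernelOffset == rowMajor);
          }
      printf("index math checks out\n");
      return 0;
    }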
+/* Reduce the innermost dimension of a tensor (on thrust::pair functors
+ * which are (value, index) pairs).
+ *
+ * The reduction is along the innermost (last) dimension:
+ *
+ * - threadIdx.x strides across the row being reduced;
+ * - threadIdx.y selects one of the blockDim.y rows handled by each block; and
+ * - blockIdx.x strides over blocks of rows, so the grid is one-dimensional.
+ *
+ * Reduction along other dimensions is handled by the kernel above.
+ */
+template <typename K, typename Index, class BinaryFunction>
+__global__ void
+kernelTransformReduceInnermostDimIndex(K *tgt1,
+                                       Index* tgt2,
+                                       K *src_,
+                                       unsigned num_rows,
+                                       unsigned row_size,
+                                       thrust::pair<K, Index> init,
+                                       BinaryFunction binary_op) {
+  __shared__ K sbuf[32][16 + 1]; // avoid bank conflict
+  __shared__ Index ibuf[32][16 + 1]; // avoid bank conflict
+
+  for (unsigned block_row = blockIdx.x * blockDim.y;
+       block_row < num_rows;
+       block_row += blockDim.y * gridDim.x) {
+    unsigned row = block_row + threadIdx.y;
+    thrust::pair<K, Index> acc = init;
+    if (row < num_rows) {
+      K *src = src_ + row * row_size;
+      // Sequential reduction within a thread.
+      for (unsigned col = threadIdx.x; col < row_size; col += blockDim.x) {
+        acc = binary_op(thrust::make_pair<K, Index>(src[col], col + 1), acc);
+      }
+    }
+
+    sbuf[threadIdx.y][threadIdx.x] = acc.first;
+    ibuf[threadIdx.y][threadIdx.x] = acc.second;
+
+    __syncthreads();
+
+    // Reduce intermediate values to single value.
+    K* sline = &sbuf[threadIdx.y][0];
+    Index* iline = &ibuf[threadIdx.y][0];
+    for (unsigned s = 8; s > 0; s >>= 1) {
+      if (row < num_rows && threadIdx.x < s) {
+        thrust::pair<K, Index> arg1 =
+          thrust::make_pair<K, Index>(sline[threadIdx.x], iline[threadIdx.x]);
+        thrust::pair<K, Index> arg2 =
+          thrust::make_pair<K, Index>(sline[threadIdx.x + s], iline[threadIdx.x + s]);
+        thrust::pair<K, Index> res = binary_op(arg1, arg2);
+
+        sline[threadIdx.x] = res.first;
+        iline[threadIdx.x] = res.second;
+      }
+      __syncthreads();
+    }
+
+    if (row < num_rows && threadIdx.x == 0) {
+      tgt1[row] = sline[0];
+      tgt2[row] = iline[0];
+    }
+    __syncthreads();
+  }
+}
+
+template <typename TensorTypeK,
+          typename TensorTypeIndex,
+          typename BinaryFunction>
+__host__ void
+THC_transformReduceInnermostDimIndex(THCState *state,
+                                     TensorTypeK *tgt1,
+                                     TensorTypeIndex *tgt2,
+                                     TensorTypeK *src,
+                                     const thrust::pair<
+                                     typename TensorUtils<TensorTypeK>::DataType,
+                                     typename TensorUtils<TensorTypeIndex>::DataType>& init,
+                                     BinaryFunction binary_op) {
+  unsigned ndim = TensorUtils<TensorTypeK>::getDims(state, src);
+  unsigned num_rows = 1;
+  for (unsigned dim = 0; dim < ndim - 1; dim++) {
+    num_rows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
+  }
+  unsigned row_size = TensorUtils<TensorTypeK>::getSize(state, src, ndim - 1);
+
+  dim3 threads(16, 32);
+  dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y)));
+
+  kernelTransformReduceInnermostDimIndex
+    <<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
+      TensorUtils<TensorTypeK>::getData(state, tgt1),
+      TensorUtils<TensorTypeIndex>::getData(state, tgt2),
+      TensorUtils<TensorTypeK>::getData(state, src),
+      num_rows, row_size, init, binary_op);
+
+  THCudaCheck(cudaGetLastError());
+}
+
+template <typename TensorTypeK,
+          typename TensorTypeIndex,
+          typename BinaryFunction>
+void
+THC_reduceDimIndex(THCState *state,
+                   TensorTypeK *tgt1_,
+                   TensorTypeIndex *tgt2_,
+                   TensorTypeK *src,
+                   long dimension,
+                   const thrust::pair<
+                   typename TensorUtils<TensorTypeK>::DataType,
+                   typename TensorUtils<TensorTypeIndex>::DataType>& init,
+                   BinaryFunction binary_op)
+{
+  THArgCheck(dimension >= 0 &&
+             dimension < TensorUtils<TensorTypeK>::getDims(state, src),
+             3, "dimension out of range");
+
+  THLongStorage *dim = TensorUtils<TensorTypeK>::newSizeOf(state, src);
+  THLongStorage_set(dim, dimension, 1);
+  TensorUtils<TensorTypeK>::resize(state, tgt1_, dim, NULL);
+  TensorUtils<TensorTypeIndex>::resize(state, tgt2_, dim, NULL);
+  THLongStorage_free(dim);
+
+  TensorTypeK *tgt1 = TensorUtils<TensorTypeK>::newContiguous(state, tgt1_);
+  TensorTypeIndex *tgt2 = TensorUtils<TensorTypeIndex>::newContiguous(state, tgt2_);
+  src = TensorUtils<TensorTypeK>::newContiguous(state, src);
+
+  if (dimension == TensorUtils<TensorTypeK>::getDims(state, src) - 1) {
+    THC_transformReduceInnermostDimIndex(state, tgt1, tgt2, src, init, binary_op);
+  } else {
+    THC_transformReduceOuterDimIndex(state, tgt1, tgt2, src, dimension, init, binary_op);
+  }
+
+  TensorUtils<TensorTypeK>::free(state, src);
+  TensorUtils<TensorTypeK>::freeCopyTo(state, tgt1, tgt1_);
+  TensorUtils<TensorTypeIndex>::freeCopyTo(state, tgt2, tgt2_);
+}
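Inside kernelTransformReduceInnermostDimIndex, each row is handled by a 16-lane line of the block: the lanes first fold their slice of the row sequentially, then the 16 partial pairs are combined by the shared-memory tree above, whose stride s starts at blockDim.x / 2 = 8 (for the 16 x 32 block shape chosen by the launcher) and halves until lane 0 holds the row's result. The same tree on the host for a single line (sketch only; a plain loop stands in for the lockstep execution and the __syncthreads() between steps):

    #include <cstdio>
    #include <utility>

    template <typename T, typename Index>
    std::pair<T, Index> maxValuePair(const std::pair<T, Index>& a,
                                     const std::pair<T, Index>& b) {
      return a.first >= b.first ? a : b;
    }

    int main() {
      // 16 partial (value, index) results, one per lane of the line.
      std::pair<float, int> line[16];
      for (int lane = 0; lane < 16; ++lane)
        line[lane] = std::make_pair((lane * 7 % 16) * 1.0f, lane + 1);

      // The kernel's tree: s = 8, 4, 2, 1; lane x combines with lane x + s.
      for (int s = 8; s > 0; s >>= 1)
        for (int x = 0; x < s; ++x)
          line[x] = maxValuePair(line[x], line[x + s]);

      printf("max %.0f at index %d\n", line[0].first, line[0].second);
      return 0;
    }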
+template <typename T, typename Index>
+struct MaxValuePair {
+  __host__ __device__
+  thrust::pair<T, Index> operator()(const thrust::pair<T, Index>& a,
+                                    const thrust::pair<T, Index>& b) {
+    return THCNumerics<T>::ge(a.first, b.first) ? a : b;
+  }
+};
+
+template <typename T, typename Index>
+struct MinValuePair {
+  __host__ __device__
+  thrust::pair<T, Index> operator()(const thrust::pair<T, Index>& a,
+                                    const thrust::pair<T, Index>& b) {
+    return THCNumerics<T>::le(a.first, b.first) ? a : b;
+  }
+};
+
+#include "generic/THCTensorMathReduce.cu"
+#include "THCGenerateAllTypes.h"
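The two includes at the end are the TH generic-file idiom: THCGenerateAllTypes.h re-includes generic/THCTensorMathReduce.cu once per tensor type, redefining macros such as real on each pass, so the one generic source stamps out a concrete set of functions per type. A stripped-down, single-file sketch of the mechanism (hypothetical names; a macro stands in for the re-included file):

    #include <cstdio>

    // The "generic file": one body, parameterized over the element type.
    #define DEFINE_SUM(real, Real)                        \
      real THTensor_##Real##_sum(const real *p, int n) {  \
        real acc = 0;                                     \
        for (int i = 0; i < n; ++i) acc += p[i];          \
        return acc;                                       \
      }

    // Each expansion mimics one include pass of THCGenerateAllTypes.h.
    DEFINE_SUM(float, Float)   // stamps out THTensor_Float_sum
    DEFINE_SUM(double, Double) // stamps out THTensor_Double_sum

    int main() {
      float  f[3] = {1.0f, 2.0f, 3.0f};
      double d[3] = {1.0, 2.0, 3.0};
      printf("%g %g\n", THTensor_Float_sum(f, 3), THTensor_Double_sum(d, 3));
      return 0;
    }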