#ifndef THC_TENSORSORT_CUH
#define THC_TENSORSORT_CUH

#include "THCReduceApplyUtils.cuh"
#include "THCSortUtils.cuh"
#include "THCTensorCopy.h"
#include "THCTensorTypeUtils.cuh"

#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#if CUDA_VERSION >= 7000
#include <thrust/system/cuda/execution_policy.h>
#endif

template <typename T>
struct ThrustGTOp {
  __device__ bool operator()(const T& lhs, const T& rhs) const {
    return THCNumerics<T>::gt(lhs, rhs);
  }
};

template <typename T>
struct ThrustLTOp {
  __device__ bool operator()(const T& lhs, const T& rhs) const {
    return THCNumerics<T>::lt(lhs, rhs);
  }
};

// `base` is the base address of a tensor.
// For each slice (defined as a linear point of `out`, from 0 ->
// (sliceSize - 1) * sliceStride), we fill that slice from `0` to
// `sliceSize - 1`.
template <typename IndexType, int Dim>
__global__ void
fillSliceWithIndex(TensorInfo<long, IndexType> out,
                   IndexType totalSlices,
                   IndexType sliceSize,
                   IndexType sliceStride) {
  IndexType slice = getLinearBlockId<IndexType>();

  if (slice >= totalSlices) {
    return;
  }

  const unsigned long offset =
    IndexToOffset<long, IndexType, Dim>::get(slice, out);
  long* base = &out.data[offset];

  for (long i = threadIdx.x; i < sliceSize; i += blockDim.x) {
    // Torch indices are 1-based (hence the TH_INDEX_BASE offset)
    base[i * sliceStride] = i + TH_INDEX_BASE;
  }
}

// For slice sorting in Thrust; extracts a slice index from a linear
// index and uses that for comparison
struct SliceComp {
  SliceComp(long size) : sliceSize(size) {}

  __device__ bool operator()(const long& a, const long& b) const {
    // Since the slices are guaranteed to be innermost, the segment is
    // just via long division
    long segA = a / sliceSize;
    long segB = b / sliceSize;
    return segA < segB;
  }

  const long sliceSize;
};

// For sorting in Thrust; extracts a within-slice index from a linear index
struct GlobalIndexToPerSliceIndex {
  GlobalIndexToPerSliceIndex(long size) : sliceSize(size) {}

  __device__ inline void operator()(long& v) const {
    v = v % sliceSize + TH_INDEX_BASE;
  }

  const long sliceSize;
};

unsigned long nextHighestPowerOf2(unsigned long n);

void THCudaLongTensor_fillSliceWithIndex(THCState* state,
                                         THCudaLongTensor* t,
                                         int dim);

#endif // THC_TENSORSORT_CUH
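
// A minimal sketch (not part of the original header) of how these helpers
// could be combined for a Thrust-based segmented sort, assuming `keys` and
// `indices` are contiguous device buffers of `totalElements` elements with
// the sort dimension innermost, and `sliceSize` is the length of each slice.
// Variable names and the float key type are illustrative only; the snippet
// would additionally need <thrust/copy.h>, <thrust/for_each.h>, and
// <thrust/iterator/counting_iterator.h>.
//
//   thrust::device_ptr<float> keyIter(keys);
//   thrust::device_ptr<long> indexIter(indices);
//
//   // Label every element with its global linear index
//   thrust::copy(thrust::counting_iterator<long>(0),
//                thrust::counting_iterator<long>(totalElements),
//                indexIter);
//
//   // Sort all elements by key (ascending; ThrustGTOp<float>() for descending)
//   thrust::stable_sort_by_key(keyIter, keyIter + totalElements,
//                              indexIter, ThrustLTOp<float>());
//
//   // Stable-sort by slice so each slice's elements are grouped again,
//   // preserving the per-key order established above
//   thrust::stable_sort_by_key(indexIter, indexIter + totalElements,
//                              keyIter, SliceComp(sliceSize));
//
//   // Convert global linear indices into TH_INDEX_BASE-based within-slice indices
//   thrust::for_each(indexIter, indexIter + totalElements,
//                    GlobalIndexToPerSliceIndex(sliceSize));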