#include <assert.h>

namespace detail {

template <typename T, int N>
__host__ __device__ void copy(T to[N], T from[N]) {
  for (int i = 0; i < N; ++i) {
    to[i] = from[i];
  }
}

} // namespace detail

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor()
    : data_(NULL) {
  thc_static_assert(Dim > 0);

  for (int i = 0; i < Dim; ++i) {
    size_[i] = 0;
    stride_[i] = (IndexT) 1;
  }
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::
THCDeviceTensor(DataPtrType data, const IndexT sizes[Dim])
    : data_(data) {
  thc_static_assert(Dim > 0);

  for (int i = 0; i < Dim; ++i) {
    size_[i] = sizes[i];
  }

  stride_[Dim - 1] = (IndexT) 1;
  for (int i = Dim - 2; i >= 0; --i) {
    stride_[i] = stride_[i + 1] * sizes[i + 1];
  }
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor(
  DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim])
    : data_(data) {
  thc_static_assert(Dim > 0);

  for (int i = 0; i < Dim; ++i) {
    size_[i] = sizes[i];
    stride_[i] = strides[i];
  }
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <int OtherDim>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isSameSizeAndStride(
  const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const {
  if (Dim != OtherDim) {
    return false;
  }

  for (int i = 0; i < Dim; ++i) {
    if (size_[i] != rhs.size_[i]) {
      return false;
    }

    if (stride_[i] != rhs.stride_[i]) {
      return false;
    }
  }

  return true;
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() {
  thc_static_assert(sizeof(U) == sizeof(T));

  return THCDeviceTensor<U, Dim, IndexT, PtrTraits>(
    reinterpret_cast<U*>(data_), size_, stride_);
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ const THCDeviceTensor<U, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() const {
  thc_static_assert(sizeof(U) == sizeof(T));

  return THCDeviceTensor<U, Dim, IndexT, PtrTraits>(
    reinterpret_cast<U*>(data_), size_, stride_);
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ long
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::numElements() const {
  long size = getSize(0);

  for (int i = 1; i < Dim; ++i) {
    size *= getSize(i);
  }

  return size;
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguous() const {
  long prevSize = 1;

  for (int i = Dim - 1; i >= 0; --i) {
    if (getSize(i) != (IndexT) 1) {
      if (getStride(i) == prevSize) {
        prevSize *= getSize(i);
      } else {
        return false;
      }
    }
  }

  return true;
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized(int i) const {
  if (i == 0 && getStride(i) > 0 && getSize(i) > 0) {
    return true;
  } else if ((i > 0) && (i < Dim) && (getStride(i) > 0) &&
             ((getStride(i - 1) / getStride(i)) >= getSize(i))) {
    return true;
  }

  return false;
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized() const {
  for (int i = 0; i < Dim; ++i) {
    if (!isConsistentlySized(i)) {
      return false;
    }
  }

  return true;
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguousDim(int i) const {
  return (i == Dim - 1) || // just in case
    ((i < Dim - 1) &&
     ((getStride(i) / getStride(i + 1)) == getSize(i + 1)));
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::transpose(int dim1,
                                                      int dim2) const {
#ifdef __CUDA_ARCH__
  // Device code
  assert(dim1 >= 0 && dim1 < Dim);
  assert(dim2 >= 0 && dim2 < Dim);
#else
  // Host code
  if (dim1 < 0 || dim1 >= Dim) {
    THError("dim1 out of bounds");
  }

  if (dim2 < 0 || dim2 >= Dim) {
    THError("dim2 out of bounds");
  }
#endif

  IndexT newSize[Dim];
  IndexT newStride[Dim];

  for (int i = 0; i < Dim; ++i) {
    newSize[i] = size_[i];
    newStride[i] = stride_[i];
  }

  IndexT tmp = newSize[dim1];
  newSize[dim1] = newSize[dim2];
  newSize[dim2] = tmp;

  tmp = newStride[dim1];
  newStride[dim1] = newStride[dim2];
  newStride[dim2] = tmp;

  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(data_, newSize, newStride);
}
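
// Usage sketch (editor's illustration, not part of the original file; `t`,
// `devicePtr`, H and W are hypothetical, and IndexT is assumed to default to
// int): transpose() returns a new view over the same storage with the
// size/stride entries of the two dimensions swapped; no elements are moved.
//
//   int sizes[2] = {H, W};
//   THCDeviceTensor<float, 2> t(devicePtr, sizes);     // contiguous [H][W]
//   THCDeviceTensor<float, 2> tT = t.transpose(0, 1);  // views it as [W][H]
//   // tT.getSize(0) == W, tT.getStride(0) == 1, data pointer unchanged.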

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastOuter() {
  // Can only create tensors of greater dimension
  thc_static_assert(NewDim > Dim);

  IndexT newSize[NewDim];
  IndexT newStride[NewDim];

  int shift = NewDim - Dim;

  for (int i = 0; i < NewDim; ++i) {
    if (i < shift) {
      // These are the extended dimensions
      newSize[i] = (IndexT) 1;
      newStride[i] = size_[0] * stride_[0];
    } else {
      // Shift the remaining dimensions
      newSize[i] = size_[i - shift];
      newStride[i] = stride_[i - shift];
    }
  }

  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
    data_, newSize, newStride);
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastInner() {
  // Can only create tensors of greater dimension
  thc_static_assert(NewDim > Dim);

  IndexT newSize[NewDim];
  IndexT newStride[NewDim];

  for (int i = 0; i < NewDim; ++i) {
    if (i < Dim) {
      // Existing dimensions get copied over
      newSize[i] = size_[i];
      newStride[i] = stride_[i];
    } else {
      // Extended dimensions
      newSize[i] = (IndexT) 1;
      newStride[i] = (IndexT) 1;
    }
  }

  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
    data_, newSize, newStride);
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastOuter() {
  // Can only create tensors of lesser dimension
  thc_static_assert(NewDim < Dim);

  // We can't downcast non-contiguous tensors, since it leaves
  // garbage data in the tensor. The tensor needs to be contiguous
  // in all of the dimensions we are collapsing (no padding in
  // them).
  for (int i = 0; i < Dim - NewDim; ++i) {
    bool cont = isContiguousDim(i);
#ifdef __CUDA_ARCH__
    // Device code
    assert(cont);
#else
    // Host code
    if (!cont) {
      THError("Can only downcast contiguous tensors");
    }
#endif
  }

  IndexT newSize[NewDim];
  IndexT newStride[NewDim];

  int ignoredDims = Dim - NewDim;
  IndexT collapsedSize = 1;

  for (int i = 0; i < Dim; ++i) {
    if (i < ignoredDims) {
      // Collapse these dimensions
      collapsedSize *= getSize(i);
    } else {
      // Non-collapsed dimensions
      if (i == ignoredDims) {
        // This is the first non-collapsed dimension
        newSize[i - ignoredDims] = collapsedSize * getSize(i);
      } else {
        // Subsequent non-collapsed dimensions
        newSize[i - ignoredDims] = getSize(i);
      }

      newStride[i - ignoredDims] = getStride(i);
    }
  }

  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
    data_, newSize, newStride);
}
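
// Usage sketch (editor's illustration, not part of the original file; `t`,
// N, C, H and W are hypothetical): downcastOuter<NewDim>() collapses the
// outermost Dim - NewDim dimensions into the first remaining one, and is
// only legal when those collapsed dimensions are contiguous.
//
//   // t is a contiguous THCDeviceTensor<float, 4> of shape [N][C][H][W]
//   THCDeviceTensor<float, 2> t2 = t.downcastOuter<2>();
//   // t2 has shape [N * C * H][W]; the data pointer and the innermost
//   // stride are unchanged.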

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastInner() {
  // Can only create tensors of lesser dimension
  thc_static_assert(NewDim < Dim);

  // We can't downcast non-contiguous tensors, since it leaves
  // garbage data in the tensor. The tensor needs to be contiguous
  // in all of the dimensions we are collapsing (no padding in
  // them).
  for (int i = NewDim; i < Dim; ++i) {
    bool cont = isContiguousDim(i);
#ifdef __CUDA_ARCH__
    // Device code
    assert(cont);
#else
    // Host code
    if (!cont) {
      THError("Can only downcast contiguous tensors");
    }
#endif
  }

  IndexT newSize[NewDim];
  IndexT newStride[NewDim];

  IndexT collapsedSize = 1;

  for (int i = Dim - 1; i >= 0; --i) {
    if (i >= NewDim) {
      // Collapse these dimensions
      collapsedSize *= getSize(i);
    } else {
      // Non-collapsed dimensions
      if (i == NewDim - 1) {
        // This is the first non-collapsed dimension
        newSize[i] = collapsedSize * getSize(i);
        newStride[i] = getStride(Dim - 1);
      } else {
        // Subsequent non-collapsed dimensions
        newSize[i] = getSize(i);
        newStride[i] = getStride(i);
      }
    }
  }

  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
    data_, newSize, newStride);
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view(DataPtrType at) {
  thc_static_assert(SubDim >= 1 && SubDim < Dim);

  IndexT viewSizes[SubDim];
  IndexT viewStrides[SubDim];

  for (int i = 0; i < SubDim; ++i) {
    viewSizes[i] = size_[Dim - SubDim + i];
    viewStrides[i] = stride_[Dim - SubDim + i];
  }

  return THCDeviceTensor<T, SubDim, IndexT, PtrTraits>(
    at, viewSizes, viewStrides);
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view() {
  return view<SubDim>(data_);
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
void
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::zero(cudaStream_t stream) {
#ifdef __CUDA_ARCH__
  assert(isContiguous());
#else
  if (!isContiguous()) {
    THError("zero only works on contiguous data");
  }
#endif

  cudaMemsetAsync(data(), 0, numElements() * sizeof(T), stream);
}
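
// End-to-end usage sketch (editor's illustration, not part of the original
// file; `devPtr`, `batchSize`, `dim` and `stream` are hypothetical, and
// IndexT is assumed to default to int):
//
//   int sizes[2] = {batchSize, dim};
//   THCDeviceTensor<float, 2> t(devPtr, sizes); // contiguous [batchSize][dim]
//
//   t.zero(stream);                                         // async memset
//   THCDeviceTensor<float, 1> flat = t.downcastOuter<1>();  // [batchSize * dim]
//   THCDeviceTensor<float, 3> t3 = t.upcastOuter<3>();      // [1][batchSize][dim]
//
// Inside a kernel the tensor is typically passed by value and indexed
// elementwise (e.g. t[i][j]) via the accessors declared in the main
// THCDeviceTensor header.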