lib/THC/THCDeviceTensor.cuh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504

#ifndef THC_DEVICE_TENSOR_INC
#define THC_DEVICE_TENSOR_INC

#include <cuda.h>
#include <cuda_runtime.h>

// A CUDA 6.5 compatible version of static_assert. Remove once on CUDA 7.0.
template <bool>
struct THCStaticAssert;

template <>
struct THCStaticAssert<true> {
};

#define thc_static_assert(expr) (THCStaticAssert<(expr) != 0>())

/// Our tensor type
template <typename T,
          int Dim,
          typename IndexT,
          template <typename U> class PtrTraits>
class THCDeviceTensor;

/// Type of a subspace of a tensor
namespace detail {
template <typename TensorType,
          int SubDim,
          template <typename U> class PtrTraits>
class THCDeviceSubTensor;
}

template <typename T>
struct RestrictPtrTraits {
  typedef T* __restrict__ PtrType;
};

template <typename T>
struct DefaultPtrTraits {
  typedef T* PtrType;
};

/**
   Templated multi-dimensional array that supports strided access of
   elements. Main access is through `operator[]`; e.g.,
   `tensor[x][y][z]`.

- `T` is the contained type (e.g., `float`)
- `Dim` is the tensor rank
- `IndexT` is the integer type used for size/stride arrays, and for
- all indexing math. Default is `int`, but for large tensors, `long`
- can be used instead.
- `PtrTraits` are traits applied to our data pointer (T*). By default,
- this is just T*, but RestrictPtrTraits can be used to apply T*
- __restrict__ for alias-free analysis.
*/
template <typename T,
          int Dim,
          typename IndexT = int,
          template <typename U> class PtrTraits = DefaultPtrTraits>
class THCDeviceTensor {
 public:
  enum { NumDim = Dim };
  typedef T DataType;
  typedef IndexT IndexType;
  typedef typename PtrTraits<T>::PtrType DataPtrType;
  typedef THCDeviceTensor<T, Dim, IndexT, PtrTraits> TensorType;

  /// Default constructor
  __host__ __device__ THCDeviceTensor();

  /// Constructor that calculates strides with no padding
  __host__ __device__ THCDeviceTensor(DataPtrType data,
                                      const IndexT sizes[Dim]);

  /// Constructor that takes arbitrary size/stride arrays
  __host__ __device__ THCDeviceTensor(DataPtrType data,
                                      const IndexT sizes[Dim],
                                      const IndexT strides[Dim]);

  /// Returns true if the two tensors are of the same dimensionality,
  /// size and stride.
  template <int OtherDim>
  __host__ __device__ bool
  isSameSizeAndStride(
    const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const;

  /// Cast to a tensor of a different type of the same size and stride
  template <typename U>
  __host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast();

  /// Const version of `cast`
  template <typename U>
  __host__ __device__
  const THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast() const;

  /// Returns a raw pointer to the start of our data.
  __host__ __device__ __forceinline__ DataPtrType data() {
    return data_;
  }

  /// Returns a raw pointer to the start of our data (const).
  __host__ __device__ __forceinline__
  const DataPtrType data() const {
    return data_;
  }

  /// Cast to a different datatype
  template <typename U>
  __host__ __device__ __forceinline__
  typename PtrTraits<U>::PtrType dataAs() {
    return reinterpret_cast<typename PtrTraits<U>::PtrType>(data_);
  }

  /// Cast to a different datatype
  template <typename U>
  __host__ __device__ __forceinline__
  const typename PtrTraits<const U>::PtrType dataAs() const {
    return reinterpret_cast<typename PtrTraits<const U>::PtrType>(data_);
  }

  /// Returns a read/write view of a portion of our tensor.
  __host__ __device__ __forceinline__
  detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
    operator[](IndexT);

  /// Returns a read/write view of a portion of our tensor (const).
  __host__ __device__ __forceinline__
  const detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
    operator[](IndexT) const;

  /// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds
  /// checking.
  __host__ __device__ __forceinline__ int getSize(int i) const {
    return size_[i];
  }

  /// Returns the stride of a given dimension, `[0, Dim - 1]`. No bounds
  /// checking.
  __host__ __device__ __forceinline__ int getStride(int i) const {
    return stride_[i];
  }

  /// Returns the total number of elements contained within our data
  /// (product of `getSize(i)`)
  __host__ __device__ long numElements() const;

  /// Returns the size array.
  __host__ __device__ __forceinline__ const IndexT* sizes() const {
    return size_;
  }

  /// Returns the stride array.
  __host__ __device__ __forceinline__ const IndexT* strides() const {
    return stride_;
  }

  /// Returns true if there is no padding within the tensor and no
  /// re-ordering of the dimensions.
  /// ~~~
  /// (stride(i) == size(i + 1) * stride(i + 1)) && stride(dim - 1) == 0
  /// ~~~
  __host__ __device__ bool isContiguous() const;

  /// Returns whether a given dimension has only increasing stride
  /// from the previous dimension. A tensor that was permuted by
  /// exchanging size and stride only will fail this check.
  /// If `i == 0` just check `size > 0`. Returns `false` if `stride` is `<= 0`.
  __host__ __device__ bool isConsistentlySized(int i) const;

  // Returns whether at each dimension `stride <= size`.
  // If this is not the case then iterating once over the size space will
  // touch the same memory locations multiple times.
  __host__ __device__ bool isConsistentlySized() const;

  /// Returns true if the given dimension index has no padding
  __host__ __device__ bool isContiguousDim(int i) const;

  /// Returns a tensor of the same dimension after transposing the two
  /// dimensions given. Does not actually move elements; transposition
  /// is made by permuting the size/stride arrays.
  /// If the dimensions are not valid, asserts.
  __host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
  transpose(int dim1, int dim2) const;

  /// Upcast a tensor of dimension `D` to some tensor of dimension
  /// D' > D by padding the leading dimensions by 1
  /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]`
  template <int NewDim>
  __host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
  upcastOuter();

  /// Upcast a tensor of dimension `D` to some tensor of dimension
  /// D' > D by padding the lowest/most varying dimensions by 1
  /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]`
  template <int NewDim>
  __host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
  upcastInner();

  /// Downcast a tensor of dimension `D` to some tensor of dimension
  /// D' < D by collapsing the leading dimensions. asserts if there is
  /// padding on the leading dimensions.
  template <int NewDim>
  __host__ __device__
  THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastOuter();

  /// Downcast a tensor of dimension `D` to some tensor of dimension
  /// D' < D by collapsing the leading dimensions. asserts if there is
  /// padding on the leading dimensions.
  template <int NewDim>
  __host__ __device__
  THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastInner();

  /// Returns a tensor that is a view of the `SubDim`-dimensional slice
  /// of this tensor, starting at `at`.
  template <int SubDim>
  __host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
  view(DataPtrType at);

  /// Returns a tensor that is a view of the `SubDim`-dimensional slice
  /// of this tensor, starting where our data begins
  template <int SubDim>
  __host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
  view();

  /// Zeroes out the tensor asynchronously. Asserts if the contents
  /// in question are not contiguous.
  void zero(cudaStream_t stream = 0);

 private:
  /// Raw pointer to where the tensor data begins
  DataPtrType data_;

  /// Array of strides (in sizeof(T) terms) per each dimension
  IndexT stride_[Dim];

  /// Size per each dimension
  IndexT size_[Dim];
};

namespace detail {

/// Specialization for a view of a single value (0-dimensional)
template <typename TensorType, template <typename U> class PtrTraits>
class THCDeviceSubTensor<TensorType, 0, PtrTraits> {
 public:
  __host__ __device__ THCDeviceSubTensor<TensorType, 0, PtrTraits>
  operator=(typename TensorType::DataType val) {
    *data_ = val;
    return *this;
  }

  // operator T&
  __host__ __device__ operator typename TensorType::DataType&() {
    return *data_;
  }

  // const operator T& returning const T&
  __host__ __device__ operator const typename TensorType::DataType&() const {
    return *data_;
  }

  // operator& returning T*
  __host__ __device__ typename TensorType::DataType* operator&() {
    return data_;
  }

  // const operator& returning const T*
  __host__ __device__ const typename TensorType::DataType* operator&() const {
    return data_;
  }

  /// Returns a raw accessor to our slice.
  __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
    return data_;
  }

  /// Returns a raw accessor to our slice (const).
  __host__ __device__ __forceinline__
  const typename TensorType::DataPtrType data() const {
    return data_;
  }

  /// Cast to a different datatype.
  template <typename T>
  __host__ __device__ T& as() {
    return *dataAs<T>();
  }

  /// Cast to a different datatype (const).
  template <typename T>
  __host__ __device__ const T& as() const {
    return *dataAs<T>();
  }

  /// Cast to a different datatype
  template <typename T>
  __host__ __device__ __forceinline__
  typename PtrTraits<T>::PtrType dataAs() {
    return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
  }

  /// Cast to a different datatype (const)
  template <typename T>
  __host__ __device__ __forceinline__
  typename PtrTraits<const T>::PtrType dataAs() const {
    return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
  }

  /// Use the texture cache for reads
  __device__ __forceinline__ typename TensorType::DataType ldg() const {
#if __CUDA_ARCH__ >= 350
    return __ldg(data_);
#else
    return *data_;
#endif
  }

  /// Use the texture cache for reads; cast as a particular type
  template <typename T>
  __device__ __forceinline__ T ldgAs() const {
#if __CUDA_ARCH__ >= 350
    return __ldg(dataAs<T>());
#else
    return as<T>();
#endif
  }

  private:
  /// One dimension greater can create us
  friend class THCDeviceSubTensor<TensorType, 1, PtrTraits>;

  /// Our parent tensor can create us
  friend class THCDeviceTensor<typename TensorType::DataType,
                               1,
                               typename TensorType::IndexType,
                               PtrTraits>;

  __host__ __device__ __forceinline__ THCDeviceSubTensor(
    TensorType& t,
    typename TensorType::DataPtrType data)
      : tensor_(t),
        data_(data) {
  }

  /// The tensor we're referencing
  TensorType& tensor_;

  /// Where our value is located
  typename TensorType::DataPtrType const data_;
};

/// A `SubDim`-rank slice of a parent THCDeviceTensor
template <typename TensorType,
          int SubDim,
          template <typename U> class PtrTraits>
class THCDeviceSubTensor {
 public:
  /// Returns a view of the data located at our offset (the dimension
  /// `SubDim` - 1 tensor).
  __host__ __device__ __forceinline__
  THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
    operator[](typename TensorType::IndexType index) {
    return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
      tensor_,
      data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
  }

  /// Returns a view of the data located at our offset (the dimension
  /// `SubDim` - 1 tensor) (const).
  __host__ __device__ __forceinline__
  const THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
    operator[](typename TensorType::IndexType index) const {
    return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
      tensor_,
      data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
  }

  // operator& returning T*
  __host__ __device__ typename TensorType::DataType* operator&() {
    return data_;
  }

  // const operator& returning const T*
  __host__ __device__ const typename TensorType::DataType* operator&() const {
    return data_;
  }

  /// Returns a raw accessor to our slice.
  __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
    return data_;
  }

  /// Returns a raw accessor to our slice (const).
  __host__ __device__ __forceinline__
  const typename TensorType::DataPtrType data() const {
    return data_;
  }

  /// Cast to a different datatype.
  template <typename T>
  __host__ __device__ T& as() {
    return *dataAs<T>();
  }

  /// Cast to a different datatype (const).
  template <typename T>
  __host__ __device__ const T& as() const {
    return *dataAs<T>();
  }

  /// Cast to a different datatype
  template <typename T>
  __host__ __device__ __forceinline__
  typename PtrTraits<T>::PtrType dataAs() {
    return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
  }

  /// Cast to a different datatype (const)
  template <typename T>
  __host__ __device__ __forceinline__
  typename PtrTraits<const T>::PtrType dataAs() const {
    return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
  }

  /// Use the texture cache for reads
  __device__ __forceinline__ typename TensorType::DataType ldg() const {
#if __CUDA_ARCH__ >= 350
    return __ldg(data_);
#else
    return *data_;
#endif
  }

  /// Use the texture cache for reads; cast as a particular type
  template <typename T>
  __device__ __forceinline__ T ldgAs() const {
#if __CUDA_ARCH__ >= 350
    return __ldg(dataAs<T>());
#else
    return as<T>();
#endif
  }

  /// Returns a tensor that is a view of the SubDim-dimensional slice
  /// of this tensor, starting where our data begins
  THCDeviceTensor<typename TensorType::DataType,
               SubDim,
               typename TensorType::IndexType,
               PtrTraits> view() {
    return tensor_.template view<SubDim>(data_);
  }

 private:
  /// One dimension greater can create us
  friend class THCDeviceSubTensor<TensorType, SubDim + 1, PtrTraits>;

  /// Our parent tensor can create us
  friend class
  THCDeviceTensor<typename TensorType::DataType,
               TensorType::NumDim,
               typename TensorType::IndexType,
               PtrTraits>;

  __host__ __device__ __forceinline__ THCDeviceSubTensor(
    TensorType& t,
    typename TensorType::DataPtrType data)
      : tensor_(t),
        data_(data) {
  }

  /// The tensor we're referencing
  TensorType& tensor_;

  /// The start of our sub-region
  typename TensorType::DataPtrType const data_;
};

} // namespace detail

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __forceinline__
detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
                        Dim - 1, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) {
  return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
    detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
      *this, data_)[index]);
}

template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __forceinline__
const detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
                              Dim - 1, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) const {
  return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
    detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
      const_cast<TensorType&>(*this), data_)[index]);
}

#include "THCDeviceTensor-inl.cuh"

#endif // THC_DEVICE_TENSOR_INC