// THCTensorCopy.cu - lib/THC - github.com/torch/cutorch.git

#include "THCApply.cuh"
#include "THCHalf.h"
#include "THCNumerics.cuh"

inline int curGPU() {
  int curDev;
  THCudaCheck(cudaGetDevice(&curDev));
  return curDev;
}

// Copy operator for the pointwise apply kernel
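// On sm_35 and newer, __ldg reads the source element through the read-only
// data cache; ScalarConvert (see THCNumerics.cuh) performs the element-type
// conversion (e.g. half -> float) and is the identity when the types match.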
template <typename TypeDst, typename TypeSrc>
struct CopyOp {
  __device__ __forceinline__ void operator()(TypeDst* dst, TypeSrc* src) {
#if __CUDA_ARCH__ >= 350
    *dst = ScalarConvert<TypeSrc, TypeDst>::to(__ldg(src));
#else
    *dst = ScalarConvert<TypeSrc, TypeDst>::to(*src);
#endif
  }
};

// General tensor copy; handles different element types (with conversion)
// and tensors that live on different devices.
template <typename TensorTypeDst, typename TensorTypeSrc>
void
THC_copyTensor(THCState* state, TensorTypeDst* dst, TensorTypeSrc* src) {
  long totalElements = TensorUtils<TensorTypeDst>::getNumElements(state, dst);

  THArgCheck(totalElements ==
             TensorUtils<TensorTypeSrc>::getNumElements(state, src),
             2, "sizes do not match");

  if (TensorUtils<TensorTypeDst>::getDims(state, dst) == 0) {
    // Zero-dim tensor; copy nothing
    return;
  }

  // We can memcpy the memory if:
  // - both tensors are contiguous; or,
  // - there is only one element to copy; or,
  // - FIXME: if both tensors have matching size and stride arrays, and no
  //   holes within (in other words, there is some permutation that can be
  //   applied to the size/strides such that the resulting tensor is
  //   contiguous);
  // - AND: both tensors have the same type.
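  // For example, a transposed (hence non-contiguous) src, or a float -> half
  // conversion copy, falls through to the pointwise-kernel path below rather
  // than the flat memcpy.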
  bool sameType = isSameType<TensorTypeSrc, TensorTypeDst>();
  bool srcContig = TensorUtils<TensorTypeSrc>::isContiguous(state, src);
  bool dstContig = TensorUtils<TensorTypeDst>::isContiguous(state, dst);
  bool memcpyEligible =
    ((srcContig && dstContig) || (totalElements == 1)) && sameType;

  int srcDev = TensorUtils<TensorTypeSrc>::getDevice(state, src);
  int dstDev = TensorUtils<TensorTypeDst>::getDevice(state, dst);
  int oldDev = curGPU();

  // We always perform the copy on the source device, using the
  // current stream on the source device.
  // If the copy is on the default stream, then we fully synchronize
  // both src and dst's default streams for completion of the
  // copy. We have to explicitly do this for non-contig copies.
  // This mimics the behavior of cross-device cudaMemcpyAsync on
  // the default stream.
  // If the copy is not on the default stream, then it is up to the
  // user to add needed synchronization on the dst device, since the
  // stream on the dst device that wishes to synchronize may not be
  // the same index as the one on the src device.
  int copyStreamIndex =
    THCState_getCurrentStreamIndex(state);
  cudaStream_t copyStream =
    THCState_getDeviceStream(state, srcDev, copyStreamIndex);

  if (srcDev != dstDev && copyStreamIndex == 0) {
    // This is a cross-device copy on the default stream. We perform a
    // two-way barrier between both devices' default streams before
    // the copy. This ensures that any write-after-write and
    // write-after-read dependencies on the destination side are
    // handled, so that no one is operating on the dst memory when
    // we perform the copy.
    // src waits on dst barrier (src already waits on src)
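    // Recording an event on dstDev's NULL (default) stream captures all work
    // previously enqueued there; cudaStreamWaitEvent on srcDev's default
    // stream then defers the upcoming copy until that work has completed.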
    cudaEvent_t dstReady;
    THCudaCheck(cudaSetDevice(dstDev));
    THCudaCheck(cudaEventCreateWithFlags(&dstReady, cudaEventDisableTiming));
    THCudaCheck(cudaEventRecord(dstReady, NULL));

    THCudaCheck(cudaSetDevice(srcDev));
    THCudaCheck(cudaStreamWaitEvent(NULL, dstReady, 0));
    THCudaCheck(cudaEventDestroy(dstReady));
  } else if (srcDev != oldDev) {
    THCudaCheck(cudaSetDevice(srcDev));
  }

  // We are now on srcDev
  if (memcpyEligible) {
    // Perform the copy
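    // Same type and contiguous layout on both sides: one flat device-to-device
    // memcpy of totalElements elements suffices. With unified virtual
    // addressing, cudaMemcpyAsync with cudaMemcpyDeviceToDevice also handles
    // the cross-device case.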
    THCudaCheck(cudaMemcpyAsync(
                  TensorUtils<TensorTypeDst>::getData(state, dst),
                  TensorUtils<TensorTypeSrc>::getData(state, src),
                  totalElements *
                  sizeof(typename TensorUtils<TensorTypeDst>::DataType),
                  cudaMemcpyDeviceToDevice,
                  copyStream));
  } else {
    // Non-contiguous copy or a type-conversion copy

    // We avoid creating temporary memory copies if possible.
    // If both src and dst are on the same device, or if they are on
    // different devices and p2p access is enabled, perform the copy
    // by a pointwise copy kernel.
    // Otherwise, we'll have to make the tensors contiguous (which in fact
    // invokes copy() again), and then perform the memcpy.
    // FIXME: might want to consider only running the pointwise kernel
    // if both src and dst innermost dimensions are contiguous. If
    // they are not, then taking the hit of the memory allocation/free
    // might be worth it to avoid non-coalesced reads or writes.

    // A device always has access to itself, so this also handles the
    // case srcDev == dstDev
    if (THCState_getPeerToPeerAccess(state, srcDev, dstDev)) {
      // Make sure we have the current stream set in THCState, since
      // pointwise uses that
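      // (The pointwise apply kernel launches on whatever stream THCState
      // currently designates, hence the temporary switch to srcDev's stream
      // here and the restore below.)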
      if (srcDev != oldDev) {
        THCState_setStream(state, srcDev, copyStreamIndex);
      }

      bool succ =
        THC_pointwiseApply2(
          state, dst, src,
          CopyOp<typename TensorUtils<TensorTypeDst>::DataType,
                 typename TensorUtils<TensorTypeSrc>::DataType>());

      // Restore prior THCState stream
      if (srcDev != oldDev) {
        THCState_setStream(state, oldDev, copyStreamIndex);
      }

      THArgCheck(succ, 2, CUTORCH_DIM_WARNING);
    } else {
      // The GPUs can't access each other directly, and the tensors
      // involved are non-contiguous and/or of different types, so we
      // stage the copy through contiguous temporaries.

      // Make sure the src is contiguous and in the same type as dst
      THCudaCheck(cudaSetDevice(srcDev));
      TensorTypeDst* srcContig = NULL;

      if (sameType) {
        srcContig =
          (TensorTypeDst*) // this is actually the same type as src
          TensorUtils<TensorTypeSrc>::newContiguous(state, src);

      } else {
        // Types are different
        // Copy into the new format, contiguous, on the source device
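        // srcContig here is a temporary with dst's element type, living on
        // srcDev (the current device); the pointwise kernel below performs
        // the type conversion from src into it.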
        srcContig = TensorUtils<TensorTypeDst>::newTensor(state);
        TensorUtils<TensorTypeDst>::resizeAs(state, srcContig, dst);

        if (srcDev != oldDev) {
          THCState_setStream(state, srcDev, copyStreamIndex);
        }

        bool succ =
          THC_pointwiseApply2(
            state, srcContig, src,
            CopyOp<typename TensorUtils<TensorTypeDst>::DataType,
                   typename TensorUtils<TensorTypeSrc>::DataType>());

        // Restore prior THCState stream
        if (srcDev != oldDev) {
          THCState_setStream(state, oldDev, copyStreamIndex);
        }

        THArgCheck(succ, 2, CUTORCH_DIM_WARNING);
      }

      // Make sure the dst is contiguous
      THCudaCheck(cudaSetDevice(dstDev));
      TensorTypeDst* dstContig =
        TensorUtils<TensorTypeDst>::newContiguous(state, dst);

      // Now, we are ready for a cross-device memcpy of contiguous
      // data, of the same layout and type
      THCudaCheck(cudaSetDevice(srcDev));

      THCudaCheck(cudaMemcpyAsync(
                    TensorUtils<TensorTypeDst>::getData(state, dstContig),
                    TensorUtils<TensorTypeDst>::getData(state, srcContig),
                    totalElements *
                    sizeof(typename TensorUtils<TensorTypeDst>::DataType),
                    cudaMemcpyDeviceToDevice,
                    copyStream));

      // We are done with the src
      TensorUtils<TensorTypeDst>::free(state, srcContig);

      if (dst != dstContig) {
        TensorUtils<TensorTypeDst>::freeCopyTo(state, dstContig, dst);
      } else {
        TensorUtils<TensorTypeDst>::free(state, dstContig);
      }

      // We're still on srcDev at this point
    }
  }

  if (srcDev != dstDev && copyStreamIndex == 0) {
    // dst waits on src barrier (dst already waits on dst). We cannot
    // operate on dst until the copy is complete.

    // Still on srcDev, record default stream event
    cudaEvent_t srcReady;
    THCudaCheck(cudaEventCreateWithFlags(&srcReady, cudaEventDisableTiming));
    THCudaCheck(cudaEventRecord(srcReady, NULL));

    THCudaCheck(cudaSetDevice(dstDev));
    THCudaCheck(cudaStreamWaitEvent(NULL, srcReady, 0));
    THCudaCheck(cudaEventDestroy(srcReady));

    // We are now on dstDev (right above). Restore prior device from dst
    if (dstDev != oldDev) {
      THCudaCheck(cudaSetDevice(oldDev));
    }
  } else {
    // We are still on srcDev. Restore prior device from src
    if (srcDev != oldDev) {
      THCudaCheck(cudaSetDevice(oldDev));
    }
  }

  THCudaCheck(cudaGetLastError());
}
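
// Generic instantiation: the include of generic/THCTensorCopy.cu defines
// THC_GENERIC_FILE, and THCGenerateAllTypes.h then re-includes it once per
// supported element type, generating the concrete per-type copy functions on
// top of THC_copyTensor.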

#include "generic/THCTensorCopy.cu"
#include "THCGenerateAllTypes.h"