blob: b416871f50bc3b78909a684b77e02e151dd670c1 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
|
#include <cuda.h>
#include <curand.h>
#include <stdio.h>
#include <stdlib.h>
// clang-format off
#include "tensors/tensor_operators.h"
#include "tensors/gpu/backend.h"
// clang-format on
// Checks a CUDA runtime call: on failure, reports the file/line and the
// human-readable error string to stderr, then aborts the process.
// Evaluates (x) exactly once.
#define CUDA_CALL(x) \
  do { \
    cudaError_t err_ = (x); \
    if(err_ != cudaSuccess) { \
      fprintf(stderr, \
              "CUDA error at %s:%d: %s\n", \
              __FILE__, \
              __LINE__, \
              cudaGetErrorString(err_)); \
      exit(1); \
    } \
  } while(0)
// Checks a cuRAND call: on failure, reports the file/line and the numeric
// curandStatus_t value to stderr, then aborts the process. cuRAND exposes
// no public error-string helper, so the raw status code is printed.
// Evaluates (x) exactly once.
#define CURAND_CALL(x) \
  do { \
    curandStatus_t status_ = (x); \
    if(status_ != CURAND_STATUS_SUCCESS) { \
      fprintf(stderr, \
              "cuRAND error %d at %s:%d\n", \
              (int)status_, \
              __FILE__, \
              __LINE__); \
      exit(1); \
    } \
  } while(0)
namespace marian {
namespace gpu {

// Converts uniform random values already stored in `data` into an
// inverted-dropout mask in place: an element becomes 1/p when its random
// value is below the keep probability `p`, and 0 otherwise. The division
// by `p` rescales kept positions so the expected activation magnitude is
// unchanged (inverted dropout). Grid-stride loop: any grid/block
// configuration covers all n elements.
__global__ void gScale(float* data, int n, float p) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  while(index < n) {
    // (data[index] < p) converts to 0.f or 1.f before the division.
    data[index] = (data[index] < p) / p;
    index += gridDim.x * blockDim.x;
  }
}

// Fills `tensor` in place with an inverted-dropout mask.
// `p` is the dropout (drop) probability; each position is kept with
// probability 1 - p, and kept positions hold the value 1/(1 - p).
// Uses the backend's cuRAND generator for the uniform draws.
// NOTE(review): p == 1 yields a keep probability of 0 and a division by
// zero inside the kernel — callers presumably never pass p == 1; confirm.
void Dropout(Tensor tensor, float p) {
  auto gpuBackend
      = std::static_pointer_cast<gpu::Backend>(tensor->getBackend());
  curandGenerator_t gen = gpuBackend->getCurandGenerator();
  int n = tensor->size();
  // Guard the empty tensor: without it, numThreads would be 0 below,
  // causing a host-side integer division by zero and a zero-block launch.
  if(n <= 0)
    return;
  CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
  int numThreads = std::min(n, 512);
  // Ceiling division: one extra block when n is not a multiple of numThreads.
  int numBlocks = n / numThreads + (n % numThreads != 0);
  gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
}

}  // namespace gpu
}  // namespace marian
|