
github.com/torch/cutorch.git
author     Soumith Chintala <soumith@gmail.com>   2016-11-12 01:56:39 +0300
committer  GitHub <noreply@github.com>            2016-11-12 01:56:39 +0300
commit     71df98b3cbe84ce943e492732f036dd5526ed329 (patch)
tree       ce3451f81a0e6a313c91d4ce96a9a80c3057be44
parent     751f6192eb8e58d1cf3c9ff52c7f38017e73b72d (diff)
parent     6bb181eec14eb52a11321d3424c6e4b9de93861f (diff)
Merge pull request #589 from killeent/random-refactor
Move random functions to generic
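
This merge moves the random-number samplers (uniform, normal, logNormal, exponential, cauchy, bernoulli, geometric, multinomial, rand, randn) out of the float-only implementations in THCTensorRandom.cu and THCTensorMath2.cu and into lib/THC/generic/, where THCGenerateAllTypes.h instantiates them once per CUDA tensor type; the Lua wrappers in TensorMath.lua and the tests in test.lua are extended to match. As a rough sketch of the result, and assuming the usual THC type-expansion naming (THCuda<Real>Tensor_*, with plain THCudaTensor_* for float), the generic header now yields per-type entry points such as:

  /* sketch of assumed expanded declarations, not text from the patch */
  THC_API void THCudaTensor_uniform(THCState *state, THCudaTensor *self, double a, double b);
  THC_API void THCudaDoubleTensor_uniform(THCState *state, THCudaDoubleTensor *self, double a, double b);
  THC_API void THCudaHalfTensor_normal(THCState *state, THCudaHalfTensor *self, double mean, double stdv);
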
-rw-r--r--  TensorMath.lua                        43
-rw-r--r--  lib/THC/CMakeLists.txt                 3
-rw-r--r--  lib/THC/THCTensorMath.h                2
-rw-r--r--  lib/THC/THCTensorMath2.cu             13
-rw-r--r--  lib/THC/THCTensorRandom.cu           567
-rw-r--r--  lib/THC/THCTensorRandom.cuh          278
-rw-r--r--  lib/THC/THCTensorRandom.h             12
-rw-r--r--  lib/THC/generic/THCTensorRandom.cu   328
-rw-r--r--  lib/THC/generic/THCTensorRandom.h     21
-rw-r--r--  test/test.lua                        177
10 files changed, 810 insertions, 634 deletions
diff --git a/TensorMath.lua b/TensorMath.lua
index a5d436f..f94154d 100644
--- a/TensorMath.lua
+++ b/TensorMath.lua
@@ -894,6 +894,15 @@ for k, Tensor_ in pairs(handledTypenames) do
{name=Tensor .. "Array"},
{name="index", default=lastdimarray(2)}})
+ for _,f in ipairs({{name='geometric'},
+ {name='bernoulli', a=0.5}}) do
+
+ wrap(f.name,
+ cname(f.name),
+ {{name=Tensor, returned=true},
+ {name='double', default=f.a}})
+ end
+
wrap("nonzero",
cname("nonzero"),
{{name="CudaLongTensor", default=true, returned=true},
@@ -925,6 +934,40 @@ for k, Tensor_ in pairs(handledTypenames) do
{name = real},
{name=Tensor, method={default=1}}})
+ wrap("rand",
+ cname("rand"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name="LongArg"}})
+
+ wrap("randn",
+ cname("randn"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name="LongArg"}})
+
+ wrap("multinomial",
+ cname("multinomial"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor},
+ {name="int"},
+ {name="boolean", default=false}})
+
+ for _,f in ipairs({{name='uniform', a=0, b=1},
+ {name='cauchy', a=0, b=1},
+ {name='normal', a=0, b=1},
+ {name='logNormal', a=1, b=2}}) do
+
+ wrap(f.name,
+ cname(f.name),
+ {{name=Tensor, returned=true},
+ {name='double', default=f.a},
+ {name='double', default=f.b}})
+ end
+
+ wrap('exponential',
+ cname('exponential'),
+ {{name=Tensor, returned=true},
+ {name='double', default=nil}})
+
wrap("norm",
cname("normall"),
{{name=Tensor},
diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt
index e5c202a..51f568a 100644
--- a/lib/THC/CMakeLists.txt
+++ b/lib/THC/CMakeLists.txt
@@ -238,6 +238,7 @@ INSTALL(FILES
THCTensorSort.cuh
THCTensorInfo.cuh
THCTensorTypeUtils.cuh
+ THCTensorRandom.cuh
DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC")
INSTALL(FILES
@@ -278,4 +279,6 @@ INSTALL(FILES
generic/THCTensorSort.h
generic/THCTensorSort.cu
generic/THCDeviceTensorUtils.cu
+ generic/THCTensorRandom.h
+ generic/THCTensorRandom.cu
DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC/generic")
diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h
index 759c9a3..7cbef32 100644
--- a/lib/THC/THCTensorMath.h
+++ b/lib/THC/THCTensorMath.h
@@ -53,8 +53,6 @@ THC_API void THCudaTensor_potrf(THCState *state, THCudaTensor *ra_, THCudaTensor
THC_API void THCudaTensor_potrs(THCState *state, THCudaTensor *rb_, THCudaTensor *a, THCudaTensor *b);
THC_API void THCudaTensor_qr(THCState *state, THCudaTensor *rq_, THCudaTensor *rr_, THCudaTensor *a);
-THC_API void THCudaTensor_rand(THCState *state, THCudaTensor *r_, THLongStorage *size);
-THC_API void THCudaTensor_randn(THCState *state, THCudaTensor *r_, THLongStorage *size);
THC_API int THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self);
THC_API int THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self);
diff --git a/lib/THC/THCTensorMath2.cu b/lib/THC/THCTensorMath2.cu
index 9933b7e..7e6af9b 100644
--- a/lib/THC/THCTensorMath2.cu
+++ b/lib/THC/THCTensorMath2.cu
@@ -28,16 +28,3 @@ void THCudaTensor_atan2(THCState *state, THCudaTensor *self_, THCudaTensor *tx,
THCudaCheck(cudaGetLastError());
}
-void THCudaTensor_rand(THCState *state, THCudaTensor *r_, THLongStorage *size)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, r_));
- THCudaTensor_resize(state, r_, size, NULL);
- THCudaTensor_uniform(state, r_, 0, 1);
-}
-
-void THCudaTensor_randn(THCState *state, THCudaTensor *r_, THLongStorage *size)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, r_));
- THCudaTensor_resize(state, r_, size, NULL);
- THCudaTensor_normal(state, r_, 0, 1);
-}
diff --git a/lib/THC/THCTensorRandom.cu b/lib/THC/THCTensorRandom.cu
index 827692f..4493fe8 100644
--- a/lib/THC/THCTensorRandom.cu
+++ b/lib/THC/THCTensorRandom.cu
@@ -4,6 +4,7 @@
#include "THCTensorCopy.h"
#include "THCTensorMath.h"
#include "THCReduceApplyUtils.cuh"
+#include "THCTensorRandom.cuh"
#include <thrust/functional.h>
#include <curand.h>
@@ -187,567 +188,53 @@ __host__ void THCRandom_setRNGState(THCState* state, THByteTensor *rng_state)
memcpy(&gen->initial_seed, THByteTensor_data(rng_state) + states_size, seed_size);
}
-#define GENERATE_KERNEL1(NAME, ARG1, CURAND_FUNC, TRANSFORM) \
-__global__ void NAME(curandStateMtgp32 *state, int size, float *result, ARG1) \
+#define GENERATE_KERNEL1(NAME, T, ARG1, CURAND_T, CURAND_FUNC, TRANSFORM) \
+__global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1) \
{ \
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; \
int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; \
for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) { \
- float x = CURAND_FUNC(&state[blockIdx.x]); \
+ CURAND_T x = CURAND_FUNC(&state[blockIdx.x]); \
if (i < size) { \
- x = TRANSFORM; \
- result[i] = x; \
+ T y = TRANSFORM; \
+ result[i] = y; \
} \
} \
}
-#define GENERATE_KERNEL2(NAME, ARG1, ARG2, CURAND_FUNC, TRANSFORM) \
-__global__ void NAME(curandStateMtgp32 *state, int size, float *result, ARG1, ARG2) \
+#define GENERATE_KERNEL2(NAME, T, ARG1, ARG2, CURAND_T, CURAND_FUNC, TRANSFORM) \
+__global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1, ARG2) \
{ \
int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; \
int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; \
for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) { \
- float x = CURAND_FUNC(&state[blockIdx.x]); \
+ CURAND_T x = CURAND_FUNC(&state[blockIdx.x]); \
if (i < size) { \
- x = TRANSFORM; \
- result[i] = x; \
+ T y = TRANSFORM; \
+ result[i] = y; \
} \
} \
}
-GENERATE_KERNEL2(generate_uniform, double a, double b, curand_uniform, x * (b-a) + a)
-GENERATE_KERNEL1(generate_bernoulli, double p, curand_uniform, (float)x <= p)
-GENERATE_KERNEL2(generate_normal, double mean, double stdv, curand_normal, (x * stdv) + mean)
-GENERATE_KERNEL1(generate_geometric, double p, curand_uniform, (log(1-x) / log(p)) + 1)
-GENERATE_KERNEL1(generate_exponential, double lambda, curand_uniform, (float)(-1. / lambda * log(1-x)))
-GENERATE_KERNEL2(generate_cauchy, double median, double sigma, curand_uniform, (float)(median + sigma * tan(M_PI*(x-0.5))))
+GENERATE_KERNEL2(generate_uniform, float, double a, double b, float, curand_uniform, x * (b-a) + a)
+GENERATE_KERNEL2(generate_uniform, double, double a, double b, double, curand_uniform_double, x * (b-a) + a)
+GENERATE_KERNEL2(generate_uniform, half, double a, double b, float, curand_uniform, (ScalarConvert<float, half>::to(x * (b-a) + a)))
-#undef GENERATE_KERNEL1
-#undef GENERATE_KERNEL2
-
-/* Separate kernel because curand_log_normal gets extra parameters. */
-__global__ void generate_log_normal(curandStateMtgp32 *state, int size, float *result, float mean, float stddev)
-{
- int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
- int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
- for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
- float x = curand_log_normal(&state[blockIdx.x], mean, stddev);
- if (i < size) {
- result[i] = x;
- }
- }
-}
-
-#define NUM_BLOCKS min((int)THCCeilDiv(size, (ptrdiff_t) BLOCK_SIZE), MAX_NUM_BLOCKS)
-THC_API void THCudaTensor_uniform(THCState* state, THCudaTensor *self_, double a, double b)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, self_));
- Generator* gen = THCRandom_getGenerator(state);
- THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
- ptrdiff_t size = THCudaTensor_nElement(state, self);
- float *data = THCudaTensor_data(state, self);
-
- generate_uniform<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
- gen->gen_states, size, data, a, b);
-
- THCudaTensor_freeCopyTo(state, self, self_);
-};
-
-THC_API void THCudaTensor_bernoulli(THCState* state, THCudaTensor *self_, double p)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, self_));
- Generator* gen = THCRandom_getGenerator(state);
- THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
- ptrdiff_t size = THCudaTensor_nElement(state, self);
- float *data = THCudaTensor_data(state, self);
-
- generate_bernoulli<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
- gen->gen_states, size, data, p);
-
- THCudaTensor_freeCopyTo(state, self, self_);
-};
-
-THC_API void THCudaTensor_normal(THCState* state, THCudaTensor *self_, double mean, double stdv)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, self_));
- Generator* gen = THCRandom_getGenerator(state);
- THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
- ptrdiff_t size = THCudaTensor_nElement(state, self);
- float *data = THCudaTensor_data(state, self);
-
- generate_normal<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
- gen->gen_states, size, data, mean, stdv);
-
- THCudaTensor_freeCopyTo(state, self, self_);
-};
-
-THC_API void THCudaTensor_logNormal(THCState* state, THCudaTensor *self_, double mean, double stdv)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, self_));
- Generator* gen = THCRandom_getGenerator(state);
-
- THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
- ptrdiff_t size = THCudaTensor_nElement(state, self);
- float *data = THCudaTensor_data(state, self);
-
- generate_log_normal<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
- gen->gen_states, size, data, mean, stdv);
-
- THCudaTensor_freeCopyTo(state, self, self_);
-};
-
-THC_API void THCudaTensor_geometric(THCState* state, THCudaTensor *self_, double p)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, self_));
- Generator* gen = THCRandom_getGenerator(state);
-
- THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
- ptrdiff_t size = THCudaTensor_nElement(state, self);
- float *data = THCudaTensor_data(state, self);
-
- generate_geometric<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
- gen->gen_states, size, data, p);
-
- THCudaTensor_freeCopyTo(state, self, self_);
-};
-
-THC_API void THCudaTensor_exponential(THCState* state, THCudaTensor *self_, double lambda)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, self_));
- Generator* gen = THCRandom_getGenerator(state);
-
- THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
- ptrdiff_t size = THCudaTensor_nElement(state, self);
- float *data = THCudaTensor_data(state, self);
-
- generate_exponential<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
- gen->gen_states, size, data, lambda);
-
- THCudaTensor_freeCopyTo(state, self, self_);
-};
-
-THC_API void THCudaTensor_cauchy(THCState* state, THCudaTensor *self_, double median, double sigma)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, self_));
- Generator* gen = THCRandom_getGenerator(state);
+GENERATE_KERNEL2(generate_normal, float, double mean, double stdv, float, curand_normal, (x * stdv) + mean)
+GENERATE_KERNEL2(generate_normal, double, double mean, double stdv, double, curand_normal_double, (x * stdv) + mean)
+GENERATE_KERNEL2(generate_normal, half, double mean, double stdv, float, curand_normal, (ScalarConvert<float, half>::to((x * stdv) + mean)))
- THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
- ptrdiff_t size = THCudaTensor_nElement(state, self);
- float *data = THCudaTensor_data(state, self);
+GENERATE_KERNEL1(generate_exponential, float, double lambda, float, curand_uniform, (float)(-1. / lambda * log(1-x)))
+GENERATE_KERNEL1(generate_exponential, double, double lambda, double, curand_uniform_double, (double)(-1. / lambda * log(1-x)))
+GENERATE_KERNEL1(generate_exponential, half, double lambda, float, curand_uniform, (ScalarConvert<float, half>::to((float)(-1. / lambda * log(1-x)))))
- generate_cauchy<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
- gen->gen_states, size, data, median, sigma);
+GENERATE_KERNEL2(generate_cauchy, float, double median, double sigma, float, curand_uniform, (float)(median + sigma * tan(M_PI*(x-0.5))))
+GENERATE_KERNEL2(generate_cauchy, double, double median, double sigma, double, curand_uniform_double, (double)(median + sigma * tan(M_PI*(x-0.5))))
+GENERATE_KERNEL2(generate_cauchy, half, double median, double sigma, float, curand_uniform, (ScalarConvert<float, half>::to((float)(median + sigma * tan(M_PI*(x-0.5))))))
- THCudaTensor_freeCopyTo(state, self, self_);
-};
+#include "generic/THCTensorRandom.cu"
+#include "THCGenerateAllTypes.h"
-__device__ int binarySearchForMultinomial(float* dist,
- int size,
- float val) {
- int start = 0;
- int end = size;
-
- while (end - start > 0) {
- int mid = start + (end - start) / 2;
-
- float midVal = dist[mid];
- if (midVal < val) {
- start = mid + 1;
- } else {
- end = mid;
- }
- }
-
- if (start == size) {
- // No probability mass or precision problems; just return the
- // first element
- start = 0;
- }
-
- return start;
-}
-
-// Normalizes the L1 norm of every row to 1; used by multinomial
-__global__ void renormRowsL1(float* dist, long rows, long cols) {
- extern __shared__ float smem[];
-
- for (long row = blockIdx.x; row < rows; row += gridDim.x) {
- float sum = 0.0f;
- for (long col = threadIdx.x; col < cols; col += blockDim.x) {
- sum += dist[row * cols + col];
- }
-
- sum = reduceBlock(smem, blockDim.x, sum, thrust::plus<float>(), 0.0f);
- if (threadIdx.x == 0) {
- smem[0] = sum;
- }
- __syncthreads();
-
- sum = smem[0];
- if (sum > 0.0f) {
- for (long col = threadIdx.x; col < cols; col += blockDim.x) {
- dist[row * cols + col] /= sum;
- }
- }
- }
-}
-
-void THCudaTensor_renormRows(struct THCState* state,
- THCudaTensor* t) {
- THAssert(THCudaTensor_nDimension(state, t) == 2);
- long rows = THCudaTensor_size(state, t, 0);
- long cols = THCudaTensor_size(state, t, 1);
-
- cudaDeviceProp* props = THCState_getCurrentDeviceProperties(state);
- THAssert(props != NULL);
-
- int numSM = props->multiProcessorCount;
- int maxThreads = props->maxThreadsPerBlock;
-
- dim3 grid(rows < numSM * 4 ? rows : numSM * 4);
- dim3 block(cols < maxThreads ? cols : maxThreads);
-
- renormRowsL1
- <<<grid, block, block.x * sizeof(float),
- THCState_getCurrentStream(state)>>>(THCudaTensor_data(state, t),
- rows, cols);
-}
-
-__global__ void
-sampleMultinomialOnce(float* dest,
- long distributions,
- int categories,
- float* dist) {
- extern __shared__ float smem[];
-
- for (long curDist = blockIdx.x;
- curDist < distributions; curDist += gridDim.x) {
- // Each block handles one distribution
- // First pass, find the total sum of the distribution
- float sum = 0.0f;
- for (int cat = threadIdx.x; cat < categories; cat += blockDim.x) {
- sum += dist[curDist * categories + cat];
- }
-
- // threadIdx.x == 0 has the sum value from this
- sum = reduceBlock(smem, blockDim.x, sum, thrust::plus<float>(), 0.0f);
-
- // Broadcast sum and sample value
- if (threadIdx.x == 0) {
- smem[0] = sum;
- smem[1] = dest[curDist];
- }
- __syncthreads();
-
- sum = smem[0];
- float sample = smem[1];
- __syncthreads();
-
- if (sum == 0.0f || sample == 0.0f) {
- // Choose the first element
- if (threadIdx.x == 0) {
- dest[curDist] = 1;
- }
-
- continue;
- }
-
- int chunks = THCCeilDiv(categories, (int) blockDim.x);
- float prevHighProb = 0.0f;
-
- for (int chunk = 0; chunk < chunks; ++chunk) {
- // All threads in bounds load a value
- int cat = chunk * blockDim.x + threadIdx.x;
-
- float val =
- cat < categories ? dist[curDist * categories + cat] / sum : 0.0f;
- smem[threadIdx.x] = val;
- __syncthreads();
-
- // Perform an inclusive prefix sum of the shared memory contents
- for (int offset = 1; offset < blockDim.x; offset *= 2) {
- float val = 0.0f;
-
- if (threadIdx.x >= offset) {
- val = smem[threadIdx.x - offset] + smem[threadIdx.x];
- }
-
- __syncthreads();
- if (threadIdx.x >= offset) {
- smem[threadIdx.x] = val;
- }
- __syncthreads();
- }
-
- // Each thread will check to see if the sample falls in its
- // bucket
- float curBucket =
- smem[threadIdx.x] + prevHighProb;
- float prevBucket =
- threadIdx.x == 0 ? prevHighProb : smem[threadIdx.x - 1] + prevHighProb;
- bool inBucket =
- (cat < categories) && (sample <= curBucket) && (sample > prevBucket);
-
- if (inBucket) {
- // We're done; we have the sample
- // Torch indices are 1-based
- // FIXME: broadcast exit flag?
- dest[curDist] = cat + TH_INDEX_BASE;
- }
-
- // Store the previous scan's high value for future use
- prevHighProb += smem[blockDim.x - 1];
-
- __syncthreads();
- }
- }
-}
-
-__global__ void
-sampleMultinomialWithReplacement(curandStateMtgp32* state,
- int totalSamples,
- float* dest,
- long distributions,
- int categories,
- float* normDistPrefixSum) {
- // At the moment, each warp computes one sample value in the binary
- // search due to divergence. It seems possible to compute multiple
- // values and limit divergence though later on. However, no matter
- // what, all block threads must participate in the curand_uniform
- // call to update the generator state.
-
- // The block determines the distribution for which we generate a point
- for (long curDist = blockIdx.x;
- curDist < distributions;
- curDist += gridDim.x) {
- for (int sampleBase = 0;
- sampleBase < totalSamples; sampleBase += blockDim.y) {
- // The warp determines the sample
- int sample = sampleBase + threadIdx.y;
-
- // All threads participate in this
- float r = curand_uniform(&state[blockIdx.x]);
-
- if (threadIdx.x == 0 && sample < totalSamples) {
- // Find the bucket that a uniform sample lies in
- int choice = binarySearchForMultinomial(
- normDistPrefixSum + curDist * categories,
- categories,
- r);
-
- // Torch indices are 1-based
- dest[curDist * totalSamples + sample] = (float) choice + (float)TH_INDEX_BASE;
- }
- }
- }
-}
-
-__global__ void
-sampleMultinomialWithoutReplacement(curandStateMtgp32* state,
- int totalSamples,
- int sample,
- float* dest,
- long distributions,
- int categories,
- float* origDist,
- float* normDistPrefixSum) {
- // At the moment, each warp computes one sample value in the binary
- // search due to divergence. It seems possible to compute multiple
- // values and limit divergence though later on. However, no matter
- // what, all block threads must participate in the curand_uniform
- // call to update the generator state.
-
- // The block and warp determines the distribution for which we
- // generate a point
- for (long curDistBase = blockIdx.x * blockDim.y;
- curDistBase < distributions;
- curDistBase += gridDim.x * blockDim.y) {
- // The warp determines the distribution
- long curDist = curDistBase + threadIdx.y;
-
- // All threads must participate in this
- float r = curand_uniform(&state[blockIdx.x]);
-
- if (threadIdx.x == 0 && curDist < distributions) {
- // Find the bucket that a uniform sample lies in
- int choice = binarySearchForMultinomial(
- normDistPrefixSum + curDist * categories,
- categories,
- r);
-
- // Torch indices are 1-based
- dest[curDist * totalSamples + sample] = (float) choice + (float)TH_INDEX_BASE;
-
- // Without replacement, so update the original probability so it
- // is not considered a second time
- origDist[curDist * categories + choice] = 0.0f;
- }
- }
-}
-
-THC_API void THCudaTensor_multinomial(struct THCState *state,
- THCudaTensor *self,
- THCudaTensor *prob_dist,
- int n_sample,
- int with_replacement)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, self, prob_dist));
- Generator* gen = THCRandom_getGenerator(state);
-
- int inputSize = THCudaTensor_nDimension(state, prob_dist);
- THArgCheck(inputSize > 0 && inputSize <= 2, 2,
- "prob_dist must be 1 or 2 dim");
-
- // Categories are in the innermost dimension
- long numDist =
- inputSize == 1 ? 1 : THCudaTensor_size(state, prob_dist, 0);
- long numCategoriesLong =
- inputSize == 1 ? THCudaTensor_size(state, prob_dist, 0) :
- THCudaTensor_size(state, prob_dist, 1);
-
- // Since the index tensor is float, numCategories cannot exceed max
- // float integer precision
- THArgCheck(numCategoriesLong <= FLOAT32_MAX_CONSECUTIVE_INT, 2,
- "number of categories cannot exceed 2^24");
- int numCategories = (int) numCategoriesLong;
-
- THArgCheck(n_sample > 0, 3, "cannot sample <= 0 samples");
-
- if (!with_replacement) {
- THArgCheck(n_sample <= numCategories, 2,
- "cannot sample n_sample > prob_dist:size(1) samples without "
- "replacement");
- }
-
- // It is possible that prob_dist is non-contiguous
- THCudaTensor* probDistContig =
- THCudaTensor_newContiguous(state, prob_dist);
-
- // Restructure data for 2d
- if (inputSize == 1) {
- THCudaTensor_resize2d(state, probDistContig, 1, numCategories);
- }
-
- THCudaTensor_resize2d(state, self, numDist, n_sample);
-
- if (n_sample == 1) {
- // Optimized allocation-free implementation
-
- // To exploit greater parallelism for the sampling, generate the
- // Uniform random samples in a separate kernel launch, into the
- // result memory. The device RNG is thread-limited
- THCudaTensor_uniform(state, self, 0.0, 1.0);
-
- cudaDeviceProp* props = THCState_getCurrentDeviceProperties(state);
- THAssert(props != NULL);
-
- int numSM = props->multiProcessorCount;
- int maxThreads = props->maxThreadsPerBlock;
-
- dim3 block(numCategories < maxThreads ? numCategories : maxThreads);
- dim3 grid(numDist < numSM * 4 ? numDist : numSM * 4);
-
- sampleMultinomialOnce
- <<<grid, block, block.x * sizeof(float),
- THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, self),
- numDist,
- numCategories,
- THCudaTensor_data(state, probDistContig));
- } else {
- // Generic, slow implementation with memory allocations
-
- // For sampling without replacement, we modify the distribution
- // for subsequent samples in this space
- THCudaTensor* origDist = THCudaTensor_new(state);
- THCudaTensor_resizeAs(state, origDist, probDistContig);
- THCudaTensor_copy(state, origDist, probDistContig);
-
- THCudaTensor* normDist = THCudaTensor_new(state);
- THCudaTensor_resizeAs(state, normDist, probDistContig);
-
- THCudaTensor* prefixSum = THCudaTensor_new(state);
-
- // Renorm along rows
- THCudaTensor_copy(state, normDist, origDist);
- THCudaTensor_renormRows(state, normDist);
-
- // Prefix sum along rows
- THCudaTensor_cumsum(state, prefixSum, normDist, 1);
-
- if (with_replacement) {
- // Sample with replacement
-
- // Binary search is warp divergent (so effectively we're running
- // with just a single thread), but for better utilization,
- // we need each block to have at least 4 warps.
- dim3 block(32, 4);
-
- // Each warp in a block will generate a sample from one
- // distribution concurrently.
- dim3 grid(numDist < MAX_NUM_BLOCKS ? numDist : MAX_NUM_BLOCKS);
-
- sampleMultinomialWithReplacement
- <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
- gen->gen_states,
- n_sample,
- THCudaTensor_data(state, self),
- numDist, numCategories,
- THCudaTensor_data(state, prefixSum));
- } else {
- // Sample without replacement
-
- // Binary search is warp divergent (so effectively we're running
- // with just a single thread), but for better utilization,
- // we need each block to have at least 4 warps.
- dim3 block(32, 4);
-
- // Each warp in a block will generate a sample from a different
- // distribution concurrently.
- ptrdiff_t numBlocks = THCCeilDiv(numDist, 4L);
- dim3 grid(numBlocks < MAX_NUM_BLOCKS ? numBlocks : MAX_NUM_BLOCKS);
-
- for (int sample = 0; sample < n_sample; ++sample) {
- if (sample > 0) {
- // Update probabilities
- // Renorm along rows
- THCudaTensor_copy(state, normDist, origDist);
- THCudaTensor_renormRows(state, normDist);
-
- // Prefix sum along rows
- THCudaTensor_cumsum(state, prefixSum, normDist, 1);
- }
-
- // The kernel can only draw one sample before we have to
- // recalculate our distribution
- sampleMultinomialWithoutReplacement
- <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
- gen->gen_states,
- n_sample,
- sample,
- THCudaTensor_data(state, self),
- numDist, numCategories,
- THCudaTensor_data(state, origDist),
- THCudaTensor_data(state, prefixSum));
- }
- }
-
- THCudaTensor_free(state, prefixSum);
- THCudaTensor_free(state, normDist);
- THCudaTensor_free(state, origDist);
- }
-
- // Revert data restructuring based on input sizes
- if (inputSize == 1) {
- THCudaTensor_resize1d(state, self, n_sample);
-
- // Unfortunately, if prob_dist is contiguous already,
- // newContiguous is not a private copy, so we have to restructure
- // this too, so as to not affect prob_dist
- THCudaTensor_resize1d(state, probDistContig, numCategories);
- }
-
- THCudaTensor_free(state, probDistContig);
-}
+#undef GENERATE_KERNEL1
+#undef GENERATE_KERNEL2
-#undef NUM_BLOCKS
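
For reference, the first GENERATE_KERNEL2 line above (the float instantiation of generate_uniform) expands to roughly the kernel below (BLOCK_SIZE and MAX_NUM_BLOCKS are the launch constants this file already uses). This is a sketch of the preprocessor output, not code added by the patch:

  __global__ void generate_uniform(curandStateMtgp32 *state, int size, float *result, double a, double b)
  {
    int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
    for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
      /* every thread in the block draws a value so the shared MTGP32 state advances in lockstep */
      float x = curand_uniform(&state[blockIdx.x]);
      if (i < size) {
        float y = x * (b - a) + a;  /* TRANSFORM */
        result[i] = y;
      }
    }
  }

The double and half instantiations are overloads of the same kernel name that differ only in the curand call (curand_uniform_double) and in the final conversion (ScalarConvert<float, half>::to).
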
diff --git a/lib/THC/THCTensorRandom.cuh b/lib/THC/THCTensorRandom.cuh
new file mode 100644
index 0000000..003e960
--- /dev/null
+++ b/lib/THC/THCTensorRandom.cuh
@@ -0,0 +1,278 @@
+#ifndef THC_TENSOR_RANDOM_CUH
+#define THC_TENSOR_RANDOM_CUH
+
+#include "THCNumerics.cuh"
+#include "THCReduceApplyUtils.cuh"
+#include "THCTensorMathReduce.cuh"
+
+#include <curand_kernel.h>
+
+#define MAX_NUM_BLOCKS 64
+#define BLOCK_SIZE 256
+/* Separate kernel because curand_log_normal gets extra parameters. */
+
+template <typename T>
+__global__ void generateLogNormal(curandStateMtgp32 *state, int size, T *result, double mean, double stddev)
+{
+ int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+ int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
+ for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
+ float x = curand_log_normal(&state[blockIdx.x], mean, stddev);
+ if (i < size) {
+ result[i] = ScalarConvert<float, T>::to(x);
+ }
+ }
+}
+
+template <>
+__global__ void generateLogNormal<double>(curandStateMtgp32 *state, int size, double *result, double mean, double stddev)
+{
+ int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+ int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE;
+ for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) {
+ double x = curand_log_normal_double(&state[blockIdx.x], mean, stddev);
+ if (i < size) {
+ result[i] = x;
+ }
+ }
+}
+
+#undef MAX_NUM_BLOCKS
+#undef BLOCK_SIZE
+
+// Normalizes the L1 norm of every row to 1; used by multinomial
+template <typename T>
+__global__ void renormRowsL1(T* dist, long rows, long cols) {
+ extern __shared__ __align__(sizeof(T)) unsigned char my_smem[];
+ T *smem = reinterpret_cast<T *>(my_smem);
+
+ for (long row = blockIdx.x; row < rows; row += gridDim.x) {
+ T sum = ScalarConvert<int, T>::to(0);
+ for (long col = threadIdx.x; col < cols; col += blockDim.x) {
+ sum = THCNumerics<T>::add(sum, dist[row * cols + col]);
+ }
+
+ sum = reduceBlock(smem, blockDim.x, sum, ReduceAdd<T, T>(), ScalarConvert<int, T>::to(0));
+ if (threadIdx.x == 0) {
+ smem[0] = sum;
+ }
+ __syncthreads();
+
+ sum = smem[0];
+ if (THCNumerics<T>::gt(sum, ScalarConvert<int, T>::to(0))) {
+ for (long col = threadIdx.x; col < cols; col += blockDim.x) {
+ dist[row * cols + col] = THCNumerics<T>::div(dist[row * cols + col], sum);
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void
+sampleMultinomialOnce(T* dest,
+ long distributions,
+ int categories,
+ T* dist) {
+ extern __shared__ __align__(sizeof(T)) unsigned char my_smem[];
+ T *smem = reinterpret_cast<T *>(my_smem);
+ T zero = ScalarConvert<int, T>::to(0);
+
+ for (long curDist = blockIdx.x;
+ curDist < distributions; curDist += gridDim.x) {
+ // Each block handles one distribution
+ // First pass, find the total sum of the distribution
+ T sum = zero;
+ for (int cat = threadIdx.x; cat < categories; cat += blockDim.x) {
+ sum = THCNumerics<T>::add(sum, dist[curDist * categories + cat]);
+ }
+
+ // threadIdx.x == 0 has the sum value from this
+ sum = reduceBlock(smem, blockDim.x, sum, ReduceAdd<T, T>(), zero);
+
+ // Broadcast sum and sample value
+ if (threadIdx.x == 0) {
+ smem[0] = sum;
+ smem[1] = dest[curDist];
+ }
+ __syncthreads();
+
+ sum = smem[0];
+ T sample = smem[1];
+ __syncthreads();
+
+ if (THCNumerics<T>::eq(sum, zero) || THCNumerics<T>::eq(sample, zero)) {
+ // Choose the first element
+ if (threadIdx.x == 0) {
+ dest[curDist] = ScalarConvert<int, T>::to(1);
+ }
+
+ continue;
+ }
+
+ int chunks = THCCeilDiv(categories, (int) blockDim.x);
+ T prevHighProb = zero;
+
+ for (int chunk = 0; chunk < chunks; ++chunk) {
+ // All threads in bounds load a value
+ int cat = chunk * blockDim.x + threadIdx.x;
+
+ T val =
+ cat < categories ? THCNumerics<T>::div(dist[curDist * categories + cat], sum) :
+ zero;
+
+ smem[threadIdx.x] = val;
+ __syncthreads();
+
+ // Perform an inclusive prefix sum of the shared memory contents
+ for (int offset = 1; offset < blockDim.x; offset *= 2) {
+ T val = zero;
+
+ if (threadIdx.x >= offset) {
+ val = THCNumerics<T>::add(smem[threadIdx.x - offset], smem[threadIdx.x]);
+ }
+
+ __syncthreads();
+ if (threadIdx.x >= offset) {
+ smem[threadIdx.x] = val;
+ }
+ __syncthreads();
+ }
+
+ // Each thread will check to see if the sample falls in its
+ // bucket
+ T curBucket = THCNumerics<T>::add(smem[threadIdx.x], prevHighProb);
+ T prevBucket =
+ threadIdx.x == 0 ? prevHighProb :
+ THCNumerics<T>::add(smem[threadIdx.x - 1], prevHighProb);
+ bool inBucket =
+ (cat < categories) &&
+ (!THCNumerics<T>::gt(sample, curBucket)) &&
+ (THCNumerics<T>::gt(sample, prevBucket));
+
+ if (inBucket) {
+ // We're done; we have the sample
+ // Torch indices are 1-based
+ // FIXME: broadcast exit flag?
+ dest[curDist] = ScalarConvert<int, T>::to(cat + TH_INDEX_BASE);
+ }
+
+ // Store the previous scan's high value for future use
+ prevHighProb = THCNumerics<T>::add(prevHighProb, smem[blockDim.x - 1]);
+
+ __syncthreads();
+ }
+ }
+}
+
+template <typename T>
+__device__ int binarySearchForMultinomial(T* dist,
+ int size,
+ T val) {
+ int start = 0;
+ int end = size;
+
+ while (end - start > 0) {
+ int mid = start + (end - start) / 2;
+
+ T midVal = dist[mid];
+ if (THCNumerics<T>::lt(midVal, val)) {
+ start = mid + 1;
+ } else {
+ end = mid;
+ }
+ }
+
+ if (start == size) {
+ // No probability mass or precision problems; just return the
+ // first element
+ start = 0;
+ }
+
+ return start;
+}
+
+template <typename T>
+__global__ void
+sampleMultinomialWithReplacement(curandStateMtgp32* state,
+ int totalSamples,
+ T* dest,
+ long distributions,
+ int categories,
+ T* normDistPrefixSum) {
+ // At the moment, each warp computes one sample value in the binary
+ // search due to divergence. It seems possible to compute multiple
+ // values and limit divergence though later on. However, no matter
+ // what, all block threads must participate in the curand_uniform
+ // call to update the generator state.
+
+ // The block determines the distribution for which we generate a point
+ for (long curDist = blockIdx.x;
+ curDist < distributions;
+ curDist += gridDim.x) {
+ for (int sampleBase = 0;
+ sampleBase < totalSamples; sampleBase += blockDim.y) {
+ // The warp determines the sample
+ int sample = sampleBase + threadIdx.y;
+
+ // All threads participate in this
+ T r = ScalarConvert<float, T>::to(curand_uniform(&state[blockIdx.x]));
+
+ if (threadIdx.x == 0 && sample < totalSamples) {
+ // Find the bucket that a uniform sample lies in
+ int choice = binarySearchForMultinomial<T>(
+ normDistPrefixSum + curDist * categories,
+ categories,
+ r);
+
+ // Torch indices are 1-based
+ dest[curDist * totalSamples + sample] = ScalarConvert<int, T>::to(choice + TH_INDEX_BASE);
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void
+sampleMultinomialWithoutReplacement(curandStateMtgp32* state,
+ int totalSamples,
+ int sample,
+ T* dest,
+ long distributions,
+ int categories,
+ T* origDist,
+ T* normDistPrefixSum) {
+ // At the moment, each warp computes one sample value in the binary
+ // search due to divergence. It seems possible to compute multiple
+ // values and limit divergence though later on. However, no matter
+ // what, all block threads must participate in the curand_uniform
+ // call to update the generator state.
+
+ // The block and warp determines the distribution for which we
+ // generate a point
+ for (long curDistBase = blockIdx.x * blockDim.y;
+ curDistBase < distributions;
+ curDistBase += gridDim.x * blockDim.y) {
+ // The warp determines the distribution
+ long curDist = curDistBase + threadIdx.y;
+
+ // All threads must participate in this
+ T r = ScalarConvert<float, T>::to(curand_uniform(&state[blockIdx.x]));
+
+ if (threadIdx.x == 0 && curDist < distributions) {
+ // Find the bucket that a uniform sample lies in
+ int choice = binarySearchForMultinomial<T>(
+ normDistPrefixSum + curDist * categories,
+ categories,
+ r);
+
+ // Torch indices are 1-based
+ dest[curDist * totalSamples + sample] = ScalarConvert<int, T>::to(choice + TH_INDEX_BASE);
+
+ // Without replacement, so update the original probability so it
+ // is not considered a second time
+ origDist[curDist * categories + choice] = ScalarConvert<int, T>::to(0);
+ }
+ }
+}
+
+#endif // THC_TENSOR_RANDOM_CUH
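
Two details of the new header are worth calling out. The dynamic shared-memory buffers are declared as an aligned unsigned char array and then reinterpret_cast to T, because a kernel gets a single extern __shared__ allocation and declaring it directly as T[] in a template would redeclare the same symbol with a different element type for each instantiation. And the arithmetic is routed through THCNumerics<T> and ScalarConvert so that the half instantiation compiles as well. A minimal, self-contained sketch of the shared-memory idiom (illustration only, not part of the patch):

  // Sum n values (n <= blockDim.x, blockDim.x a power of two) into *out,
  // using the same raw-byte shared-memory pattern as renormRowsL1 above.
  template <typename T>
  __global__ void blockSum(const T* in, T* out, int n) {
    extern __shared__ __align__(sizeof(T)) unsigned char raw_smem[];
    T* smem = reinterpret_cast<T*>(raw_smem);
    smem[threadIdx.x] = threadIdx.x < n ? in[threadIdx.x] : T(0);
    __syncthreads();
    for (int offset = blockDim.x / 2; offset > 0; offset /= 2) {
      if (threadIdx.x < offset) smem[threadIdx.x] += smem[threadIdx.x + offset];
      __syncthreads();
    }
    if (threadIdx.x == 0) *out = smem[0];
  }
  // launched as: blockSum<float><<<1, threads, threads * sizeof(float)>>>(d_in, d_out, n);
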
diff --git a/lib/THC/THCTensorRandom.h b/lib/THC/THCTensorRandom.h
index 93c0d77..12128cd 100644
--- a/lib/THC/THCTensorRandom.h
+++ b/lib/THC/THCTensorRandom.h
@@ -3,6 +3,9 @@
#include "THCTensor.h"
+#include "generic/THCTensorRandom.h"
+#include "THCGenerateAllTypes.h"
+
/* Generator */
typedef struct _Generator {
struct curandStateMtgp32* gen_states;
@@ -28,15 +31,6 @@ THC_API void THCRandom_manualSeedAll(struct THCState *state, unsigned long the_s
THC_API unsigned long THCRandom_initialSeed(struct THCState *state);
THC_API void THCRandom_getRNGState(struct THCState *state, THByteTensor *rng_state);
THC_API void THCRandom_setRNGState(struct THCState *state, THByteTensor *rng_state);
-THC_API void THCudaTensor_geometric(struct THCState *state, THCudaTensor *self, double p);
-THC_API void THCudaTensor_bernoulli(struct THCState *state, THCudaTensor *self, double p);
-THC_API void THCudaTensor_uniform(struct THCState *state, THCudaTensor *self, double a, double b);
-THC_API void THCudaTensor_normal(struct THCState *state, THCudaTensor *self, double mean, double stdv);
-THC_API void THCudaTensor_exponential(struct THCState *state, THCudaTensor *self, double lambda);
-THC_API void THCudaTensor_cauchy(struct THCState *state, THCudaTensor *self, double median, double sigma);
-THC_API void THCudaTensor_logNormal(struct THCState *state, THCudaTensor *self, double mean, double stdv);
-
-THC_API void THCudaTensor_multinomial(struct THCState *state, THCudaTensor *self, THCudaTensor *prob_dist, int n_sample, int with_replacement);
THC_API struct curandStateMtgp32* THCRandom_generatorStates(struct THCState* state);
diff --git a/lib/THC/generic/THCTensorRandom.cu b/lib/THC/generic/THCTensorRandom.cu
new file mode 100644
index 0000000..50aa2e8
--- /dev/null
+++ b/lib/THC/generic/THCTensorRandom.cu
@@ -0,0 +1,328 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorRandom.cu"
+#else
+
+#define NUM_BLOCKS min((int)THCCeilDiv(size, (ptrdiff_t) BLOCK_SIZE), MAX_NUM_BLOCKS)
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+
+THC_API void THCTensor_(uniform)(THCState* state, THCTensor *self_, double a, double b)
+{
+ THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ Generator* gen = THCRandom_getGenerator(state);
+ THCTensor *self = THCTensor_(newContiguous)(state, self_);
+ ptrdiff_t size = THCTensor_(nElement)(state, self);
+ real *data = THCTensor_(data)(state, self);
+
+ generate_uniform<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+ gen->gen_states, size, data, a, b);
+
+ THCTensor_(freeCopyTo)(state, self, self_);
+};
+
+THC_API void THCTensor_(normal)(THCState* state, THCTensor *self_, double mean, double stdv)
+{
+ THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ Generator* gen = THCRandom_getGenerator(state);
+ THCTensor *self = THCTensor_(newContiguous)(state, self_);
+ ptrdiff_t size = THCTensor_(nElement)(state, self);
+ real *data = THCTensor_(data)(state, self);
+
+ generate_normal<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+ gen->gen_states, size, data, mean, stdv);
+
+ THCTensor_(freeCopyTo)(state, self, self_);
+};
+
+THC_API void THCTensor_(logNormal)(THCState* state, THCTensor *self_, double mean, double stdv)
+{
+
+ THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ Generator* gen = THCRandom_getGenerator(state);
+
+ THCTensor *self = THCTensor_(newContiguous)(state, self_);
+ ptrdiff_t size = THCTensor_(nElement)(state, self);
+ real *data = THCTensor_(data)(state, self);
+
+ generateLogNormal<real><<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+ gen->gen_states, size, data, mean, stdv);
+
+ THCTensor_(freeCopyTo)(state, self, self_);
+};
+
+THC_API void THCTensor_(exponential)(THCState* state, THCTensor *self_, double lambda)
+{
+ THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ Generator* gen = THCRandom_getGenerator(state);
+
+ THCTensor *self = THCTensor_(newContiguous)(state, self_);
+ ptrdiff_t size = THCTensor_(nElement)(state, self);
+ real *data = THCTensor_(data)(state, self);
+
+ generate_exponential<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+ gen->gen_states, size, data, lambda);
+
+ THCTensor_(freeCopyTo)(state, self, self_);
+};
+
+THC_API void THCTensor_(cauchy)(THCState* state, THCTensor *self_, double median, double sigma)
+{
+ THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ Generator* gen = THCRandom_getGenerator(state);
+
+ THCTensor *self = THCTensor_(newContiguous)(state, self_);
+ ptrdiff_t size = THCTensor_(nElement)(state, self);
+ real *data = THCTensor_(data)(state, self);
+
+ generate_cauchy<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+ gen->gen_states, size, data, median, sigma);
+
+ THCTensor_(freeCopyTo)(state, self, self_);
+};
+
+void THCTensor_(renormRows)(struct THCState* state,
+ THCTensor* t) {
+ THAssert(THCTensor_(nDimension)(state, t) == 2);
+ long rows = THCTensor_(size)(state, t, 0);
+ long cols = THCTensor_(size)(state, t, 1);
+
+ cudaDeviceProp* props = THCState_getCurrentDeviceProperties(state);
+ THAssert(props != NULL);
+
+ int numSM = props->multiProcessorCount;
+ int maxThreads = props->maxThreadsPerBlock;
+
+ dim3 grid(rows < numSM * 4 ? rows : numSM * 4);
+ dim3 block(cols < maxThreads ? cols : maxThreads);
+
+ renormRowsL1<real>
+ <<<grid, block, block.x * sizeof(real),
+ THCState_getCurrentStream(state)>>>(THCTensor_(data)(state, t),
+ rows, cols);
+}
+
+THC_API void THCTensor_(multinomial)(struct THCState *state,
+ THCTensor *self,
+ THCTensor *prob_dist,
+ int n_sample,
+ int with_replacement)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self, prob_dist));
+ Generator* gen = THCRandom_getGenerator(state);
+
+ int inputSize = THCTensor_(nDimension)(state, prob_dist);
+ THArgCheck(inputSize > 0 && inputSize <= 2, 2,
+ "prob_dist must be 1 or 2 dim");
+
+ // Categories are in the innermost dimension
+ long numDist =
+ inputSize == 1 ? 1 : THCTensor_(size)(state, prob_dist, 0);
+ long numCategoriesLong =
+ inputSize == 1 ? THCTensor_(size)(state, prob_dist, 0) :
+ THCTensor_(size)(state, prob_dist, 1);
+
+ // Since the index tensor is float, numCategories cannot exceed max
+ // float integer precision
+ THArgCheck(numCategoriesLong <= FLOAT32_MAX_CONSECUTIVE_INT, 2,
+ "number of categories cannot exceed 2^24");
+ int numCategories = (int) numCategoriesLong;
+
+ THArgCheck(n_sample > 0, 3, "cannot sample <= 0 samples");
+
+ if (!with_replacement) {
+ THArgCheck(n_sample <= numCategories, 2,
+ "cannot sample n_sample > prob_dist:size(1) samples without "
+ "replacement");
+ }
+
+ // It is possible that prob_dist is non-contiguous
+ THCTensor* probDistContig =
+ THCTensor_(newContiguous)(state, prob_dist);
+
+ // Restructure data for 2d
+ if (inputSize == 1) {
+ THCTensor_(resize2d)(state, probDistContig, 1, numCategories);
+ }
+
+ THCTensor_(resize2d)(state, self, numDist, n_sample);
+
+ if (n_sample == 1) {
+ // Optimized allocation-free implementation
+
+ // To exploit greater parallelism for the sampling, generate the
+ // Uniform random samples in a separate kernel launch, into the
+ // result memory. The device RNG is thread-limited
+ THCTensor_(uniform)(state, self, 0.0, 1.0);
+
+ cudaDeviceProp* props = THCState_getCurrentDeviceProperties(state);
+ THAssert(props != NULL);
+
+ int numSM = props->multiProcessorCount;
+ int maxThreads = props->maxThreadsPerBlock;
+
+ dim3 block(numCategories < maxThreads ? numCategories : maxThreads);
+ dim3 grid(numDist < numSM * 4 ? numDist : numSM * 4);
+
+ sampleMultinomialOnce
+ <<<grid, block, block.x * sizeof(real),
+ THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, self),
+ numDist,
+ numCategories,
+ THCTensor_(data)(state, probDistContig));
+ } else {
+ // Generic, slow implementation with memory allocations
+
+ // For sampling without replacement, we modify the distribution
+ // for subsequent samples in this space
+ THCTensor* origDist = THCTensor_(new)(state);
+ THCTensor_(resizeAs)(state, origDist, probDistContig);
+ THCTensor_(copy)(state, origDist, probDistContig);
+
+ THCTensor* normDist = THCTensor_(new)(state);
+ THCTensor_(resizeAs)(state, normDist, probDistContig);
+
+ THCTensor* prefixSum = THCTensor_(new)(state);
+
+ // Renorm along rows
+ THCTensor_(copy)(state, normDist, origDist);
+ THCTensor_(renormRows)(state, normDist);
+
+ // Prefix sum along rows
+ THCTensor_(cumsum)(state, prefixSum, normDist, 1);
+
+ if (with_replacement) {
+ // Sample with replacement
+
+ // Binary search is warp divergent (so effectively we're running
+ // with just a single thread), but for better utilization,
+ // we need each block to have at least 4 warps.
+ dim3 block(32, 4);
+
+ // Each warp in a block will generate a sample from one
+ // distribution concurrently.
+ dim3 grid(numDist < MAX_NUM_BLOCKS ? numDist : MAX_NUM_BLOCKS);
+
+ sampleMultinomialWithReplacement
+ <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+ gen->gen_states,
+ n_sample,
+ THCTensor_(data)(state, self),
+ numDist, numCategories,
+ THCTensor_(data)(state, prefixSum));
+ } else {
+ // Sample without replacement
+
+ // Binary search is warp divergent (so effectively we're running
+ // with just a single thread), but for better utilization,
+ // we need each block to have at least 4 warps.
+ dim3 block(32, 4);
+
+ // Each warp in a block will generate a sample from a different
+ // distribution concurrently.
+ ptrdiff_t numBlocks = THCCeilDiv(numDist, 4L);
+ dim3 grid(numBlocks < MAX_NUM_BLOCKS ? numBlocks : MAX_NUM_BLOCKS);
+
+ for (int sample = 0; sample < n_sample; ++sample) {
+ if (sample > 0) {
+ // Update probabilities
+ // Renorm along rows
+ THCTensor_(copy)(state, normDist, origDist);
+ THCTensor_(renormRows)(state, normDist);
+
+ // Prefix sum along rows
+ THCTensor_(cumsum)(state, prefixSum, normDist, 1);
+ }
+
+ // The kernel can only draw one sample before we have to
+ // recalculate our distribution
+ sampleMultinomialWithoutReplacement
+ <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+ gen->gen_states,
+ n_sample,
+ sample,
+ THCTensor_(data)(state, self),
+ numDist, numCategories,
+ THCTensor_(data)(state, origDist),
+ THCTensor_(data)(state, prefixSum));
+ }
+ }
+
+ THCTensor_(free)(state, prefixSum);
+ THCTensor_(free)(state, normDist);
+ THCTensor_(free)(state, origDist);
+ }
+
+ // Revert data restructuring based on input sizes
+ if (inputSize == 1) {
+ THCTensor_(resize1d)(state, self, n_sample);
+
+ // Unfortunately, if prob_dist is contiguous already,
+ // newContiguous is not a private copy, so we have to restructure
+ // this too, so as to not affect prob_dist
+ THCTensor_(resize1d)(state, probDistContig, numCategories);
+ }
+
+ THCTensor_(free)(state, probDistContig);
+}
+
+THC_API void THCTensor_(rand)(THCState *state, THCTensor *r_, THLongStorage *size)
+{
+ THAssert(THCTensor_(checkGPU)(state, 1, r_));
+ THCTensor_(resize)(state, r_, size, NULL);
+ THCTensor_(uniform)(state, r_, 0, 1);
+}
+
+void THCTensor_(randn)(THCState *state, THCTensor *r_, THLongStorage *size)
+{
+ THAssert(THCTensor_(checkGPU)(state, 1, r_));
+ THCTensor_(resize)(state, r_, size, NULL);
+ THCTensor_(normal)(state, r_, 0, 1);
+}
+
+#endif
+
+#if defined(THC_REAL_IS_DOUBLE)
+GENERATE_KERNEL1(generate_bernoulli, double, double p, double, curand_uniform_double, x <= p)
+#else
+GENERATE_KERNEL1(generate_bernoulli, real, double p, float, curand_uniform, (ScalarConvert<bool, real>::to(x <= p)))
+#endif
+
+THC_API void THCTensor_(bernoulli)(THCState* state, THCTensor *self_, double p)
+{
+ THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ Generator* gen = THCRandom_getGenerator(state);
+ THCTensor *self = THCTensor_(newContiguous)(state, self_);
+ ptrdiff_t size = THCTensor_(nElement)(state, self);
+ real *data = THCTensor_(data)(state, self);
+
+ generate_bernoulli<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+ gen->gen_states, size, data, p);
+
+ THCTensor_(freeCopyTo)(state, self, self_);
+};
+
+#if defined(THC_REAL_IS_DOUBLE)
+
+GENERATE_KERNEL1(generate_geometric, double, double p, double, curand_uniform_double, floor((log(1-x) / log(p)) + 1))
+#else
+GENERATE_KERNEL1(generate_geometric, real, double p, float, curand_uniform, (ScalarConvert<float, real>::to(floorf((log(1-x) / log(p)) + 1))))
+#endif
+
+THC_API void THCTensor_(geometric)(THCState* state, THCTensor *self_, double p)
+{
+ THAssert(THCTensor_(checkGPU)(state, 1, self_));
+ Generator* gen = THCRandom_getGenerator(state);
+
+ THCTensor *self = THCTensor_(newContiguous)(state, self_);
+ ptrdiff_t size = THCTensor_(nElement)(state, self);
+ real *data = THCTensor_(data)(state, self);
+
+ generate_geometric<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+ gen->gen_states, size, data, p);
+
+ THCTensor_(freeCopyTo)(state, self, self_);
+};
+#undef NUM_BLOCKS
+
+#endif
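
This file never compiles on its own: THCTensorRandom.cu pulls it in through THCGenerateAllTypes.h, which re-includes it once per tensor type with real, THCTensor, THCTensor_(NAME) and the THC_REAL_IS_* guard defined for that pass. A rough sketch of what one such pass looks like for double (the actual macro definitions live in the THCGenerate*Type.h headers and differ in detail; this only makes the THCTensor_(uniform) to THCudaDoubleTensor_uniform expansion concrete):

  #define real double
  #define THC_REAL_IS_DOUBLE
  #define THCTensor THCudaDoubleTensor
  #define THCTensor_(NAME) THCudaDoubleTensor_ ## NAME
  #include "generic/THCTensorRandom.cu"   /* emits THCudaDoubleTensor_uniform, _normal, ... */
  #undef THCTensor_
  #undef THCTensor
  #undef THC_REAL_IS_DOUBLE
  #undef real
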
diff --git a/lib/THC/generic/THCTensorRandom.h b/lib/THC/generic/THCTensorRandom.h
new file mode 100644
index 0000000..a2896c3
--- /dev/null
+++ b/lib/THC/generic/THCTensorRandom.h
@@ -0,0 +1,21 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorRandom.h"
+#else
+
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF)
+
+THC_API void THCTensor_(uniform)(struct THCState *state, THCTensor *self, double a, double b);
+THC_API void THCTensor_(rand)(THCState *state, THCTensor *r_, THLongStorage *size);
+THC_API void THCTensor_(randn)(THCState *state, THCTensor *r_, THLongStorage *size);
+THC_API void THCTensor_(normal)(struct THCState *state, THCTensor *self, double mean, double stdv);
+THC_API void THCTensor_(logNormal)(struct THCState *state, THCTensor *self, double mean, double stdv);
+THC_API void THCTensor_(exponential)(struct THCState *state, THCTensor *self, double lambda);
+THC_API void THCTensor_(cauchy)(struct THCState *state, THCTensor *self, double median, double sigma);
+THC_API void THCTensor_(multinomial)(struct THCState *state, THCTensor *self, THCTensor *prob_dist, int n_sample, int with_replacement);
+
+#endif
+
+THC_API void THCTensor_(bernoulli)(struct THCState *state, THCTensor *self, double p);
+THC_API void THCTensor_(geometric)(struct THCState *state, THCTensor *self, double p);
+
+#endif
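
Note the split in this header: the continuous samplers (uniform, rand, randn, normal, logNormal, exponential, cauchy, multinomial) are declared only for float, double and half, while bernoulli and geometric sit outside the guard and are generated for every tensor type, since their outputs (0/1 flags and small positive counts) are representable in the integer types as well. Under the usual all-types expansion this should give entry points like the following (assumed names, shown for illustration):

  THC_API void THCudaByteTensor_bernoulli(THCState *state, THCudaByteTensor *self, double p);
  THC_API void THCudaLongTensor_geometric(THCState *state, THCudaLongTensor *self, double p);
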
diff --git a/test/test.lua b/test/test.lua
index 00cfa66..724d5ff 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -2526,8 +2526,11 @@ function test.uniform()
local max = min + torch.uniform()
local t = torch.CudaTensor(sz1, sz2)
- t:uniform(min, max)
- checkIfUniformlyDistributed(t, min, max)
+ for _, typename in ipairs(float_typenames) do
+ local x = t:type(typename)
+ x:uniform(min, max)
+ checkIfUniformlyDistributed(x, min, max)
+ end
checkMultiDevice(t, 'uniform', min, max)
end
@@ -2537,13 +2540,17 @@ function test.bernoulli()
local p = torch.uniform()
local t = torch.CudaTensor(sz1, sz2)
- t:bernoulli(p)
- tester:assertalmosteq(t:mean(), p, 0.1, "mean is not equal to p")
- local f = t:float()
- tester:assertTensorEq(f:eq(1):add(f:eq(0)):float(),
- torch.FloatTensor(sz1, sz2):fill(1),
- 1e-6,
- "each value must be either 0 or 1")
+ for _, typename in ipairs(typenames) do
+ local x = t:type(typename)
+ x:bernoulli(p)
+ local mean = x:sum() / (sz1 * sz2)
+ tester:assertalmosteq(mean, p, 0.1, "mean is not equal to p")
+ local f = t:float()
+ tester:assertTensorEq(f:eq(1):add(f:eq(0)):float(),
+ torch.FloatTensor(sz1, sz2):fill(1),
+ 1e-6,
+ "each value must be either 0 or 1")
+ end
checkMultiDevice(t, 'bernoulli', p)
end
@@ -2554,9 +2561,13 @@ function test.normal()
local tolerance = 0.01
local t = torch.CudaTensor(sz1, sz2)
- t:normal(mean, std)
- tester:assertalmosteq(t:mean(), mean, tolerance, "mean is wrong")
- tester:assertalmosteq(t:std(), std, tolerance, "standard deviation is wrong")
+ for _, typename in ipairs(float_typenames) do
+ local x = t:type(t2cpu[typename])
+ x:normal(mean, std)
+ tester:assertalmosteq(x:mean(), mean, tolerance, "mean is wrong")
+ tester:assertalmosteq(x:std(), std, tolerance, "standard deviation is wrong")
+ end
+
checkMultiDevice(t, 'normal', mean, std)
end
@@ -2567,10 +2578,13 @@ function test.logNormal()
local tolerance = 0.01
local t = torch.CudaTensor(sz1, sz2)
- t:logNormal(mean, std)
- local logt = t:log()
- tester:assertalmosteq(logt:mean(), mean, tolerance, "mean is wrong")
- tester:assertalmosteq(logt:std(), std, tolerance, "standard deviation is wrong")
+ for _, typename in ipairs(float_typenames) do
+ local x = t:type(typename)
+ x:logNormal(mean, std)
+ local logt = x:log()
+ tester:assertalmosteq(logt:mean(), mean, tolerance, "mean is wrong")
+ tester:assertalmosteq(logt:std(), std, tolerance, "standard deviation is wrong")
+ end
checkMultiDevice(t, 'logNormal', mean, std)
end
@@ -2580,10 +2594,14 @@ function test.geometric()
local p = torch.uniform()
local t = torch.CudaTensor(sz1, sz2)
- t:geometric(p)
- local u = torch.FloatTensor(sz1, sz2):fill(1) -
- ((t:float() - 1) * math.log(p)):exp()
- checkIfUniformlyDistributed(u, 0, 1)
+ for _, typename in ipairs(float_typenames) do
+ local x = t:type(typename)
+ x:geometric(p)
+
+ local u = torch.FloatTensor(sz1, sz2):fill(1) -
+ ((x:float() - 1) * math.log(p)):exp()
+ checkIfUniformlyDistributed(u, 0, 1)
+ end
checkMultiDevice(t, 'geometric', p)
end
@@ -2593,10 +2611,13 @@ function test.exponential()
local lambda = torch.uniform()
local t = torch.CudaTensor(sz1, sz2)
- t:exponential(lambda)
- local u = torch.FloatTensor(sz1, sz2):fill(1) -
- (t:float() * -lambda):exp()
- checkIfUniformlyDistributed(u, 0, 1)
+ for _, typename in ipairs(float_typenames) do
+ local x = t:type(t2cpu[typename])
+ x:exponential(lambda)
+ local u = torch.FloatTensor(sz1, sz2):fill(1) -
+ (x:float() * -lambda):exp()
+ checkIfUniformlyDistributed(u, 0, 1)
+ end
checkMultiDevice(t, 'exponential', lambda)
end
@@ -2606,9 +2627,12 @@ function test.cauchy()
local median, sigma = torch.uniform(), torch.uniform()
local t = torch.CudaTensor(sz1, sz2)
- t:cauchy(median, sigma)
- local u = ((t:float() - median) / sigma):atan() / math.pi + 0.5
- checkIfUniformlyDistributed(u, 0, 1)
+ for _, typename in ipairs(float_typenames) do
+ local x = t:type(typename)
+ x:cauchy(median, sigma)
+ local u = ((x:float() - median) / sigma):atan() / math.pi + 0.5
+ checkIfUniformlyDistributed(u, 0, 1)
+ end
checkMultiDevice(t, 'cauchy', median, sigma)
end
@@ -2675,16 +2699,19 @@ function test.multinomial_with_replacement()
local prob_dist = torch.CudaTensor(n_row, n_col):uniform()
prob_dist:select(2, n_col):fill(0) --index n_col shouldn't be sampled
local n_sample = torch.random(n_col - 1)
- local sample_indices = torch.multinomial(prob_dist, n_sample, true)
- tester:assert(sample_indices:dim() == 2, "wrong sample_indices dim")
- tester:assert(sample_indices:size(2) == n_sample, "wrong number of samples")
-
- for i = 1, n_row do
- for j = 1, n_sample do
- local val = sample_indices[{i,j}]
- tester:assert(val == math.floor(val) and val >= 1 and val < n_col,
- "sampled an invalid index: " .. val)
- end
+ for _, typename in ipairs(float_typenames) do
+ local pd = prob_dist:type(typename)
+ local sample_indices = torch.multinomial(pd, n_sample, true)
+ tester:assert(sample_indices:dim() == 2, "wrong sample_indices dim")
+ tester:assert(sample_indices:size(2) == n_sample, "wrong number of samples")
+
+ for i = 1, n_row do
+ for j = 1, n_sample do
+ local val = sample_indices[{i,j}]
+ tester:assert(val == math.floor(val) and val >= 1 and val < n_col,
+ "sampled an invalid index: " .. val)
+ end
+ end
end
end
end
@@ -2698,24 +2725,27 @@ function test.multinomial_without_replacement()
local prob_dist = torch.CudaTensor(n_row, n_col):uniform()
prob_dist:select(2, n_col):fill(0) --index n_col shouldn't be sampled
local n_sample = torch.random(n_col - 1)
- local sample_indices = torch.multinomial(prob_dist, n_sample, false)
- tester:assert(sample_indices:dim() == 2, "wrong sample_indices dim")
- tester:assert(sample_indices:size(2) == n_sample, "wrong number of samples")
-
- sample_indices = sample_indices:float()
-
- for i = 1, n_row do
- local row_samples = {}
- for j = 1, n_sample do
- local sample_idx = sample_indices[{i,j}]
- tester:assert(
- sample_idx ~= n_col, "sampled an index with zero probability"
- )
- tester:assert(
- not row_samples[sample_idx], "sampled an index twice"
- )
- row_samples[sample_idx] = true
- end
+ for _, typename in ipairs(float_typenames) do
+ local pd = prob_dist:type(typename)
+ local sample_indices = torch.multinomial(pd, n_sample, false)
+ tester:assert(sample_indices:dim() == 2, "wrong sample_indices dim")
+ tester:assert(sample_indices:size(2) == n_sample, "wrong number of samples")
+
+ sample_indices = sample_indices:float()
+
+ for i = 1, n_row do
+ local row_samples = {}
+ for j = 1, n_sample do
+ local sample_idx = sample_indices[{i,j}]
+ tester:assert(
+ sample_idx ~= n_col, "sampled an index with zero probability"
+ )
+ tester:assert(
+ not row_samples[sample_idx], "sampled an index twice"
+ )
+ row_samples[sample_idx] = true
+ end
+ end
end
end
end
@@ -2731,17 +2761,21 @@ function test.multinomial_without_replacement_gets_all()
t[dist] = linear
end
- local orig = t:clone()
+ local orig = t:clone():long()
+
+ for _, typename in ipairs(float_typenames) do
+ local x = t:type(typename)
- -- Sample without replacement
- local result = torch.multinomial(t, distSize)
- tester:assert(result:size(1) == distributions)
- tester:assert(result:size(2) == distSize)
+ -- Sample without replacement
+ local result = torch.multinomial(x, distSize)
+ tester:assert(result:size(1) == distributions)
+ tester:assert(result:size(2) == distSize)
- -- Sort, and we should have the original results, since without replacement
- -- sampling everything, we should have chosen every value uniquely
- result = result:sort(2)
- tester:assertTensorEq(orig, result, 0, "error in multinomial_without_replacement_gets_all")
+ -- Sort, and we should have the original results, since without replacement
+ -- sampling everything, we should have chosen every value uniquely
+ result = result:sort(2)
+ tester:assertTensorEq(orig:type(typename), result, 0, "error in multinomial_without_replacement_gets_all")
+ end
end
end
@@ -2749,12 +2783,15 @@ function test.multinomial_vector()
local n_col = torch.random(100)
local prob_dist = torch.CudaTensor(n_col):uniform()
local n_sample = n_col
- local sample_indices = torch.multinomial(prob_dist, n_sample, true)
- tester:assert(sample_indices:dim() == 1, "wrong sample_indices dim")
- -- Multinomial resizes prob_dist to be 2d (1xn), check that the resize
- -- was undone
- tester:assert(prob_dist:dim() == 1, "wrong number of prob_dist dimensions")
- tester:assert(sample_indices:size(1) == n_sample, "wrong number of samples")
+ for _, typename in ipairs(float_typenames) do
+ local pd = prob_dist:type(typename)
+ local sample_indices = torch.multinomial(pd, n_sample, true)
+ tester:assert(sample_indices:dim() == 1, "wrong sample_indices dim")
+ -- Multinomial resizes prob_dist to be 2d (1xn), check that the resize
+ -- was undone
+ tester:assert(prob_dist:dim() == 1, "wrong number of prob_dist dimensions")
+ tester:assert(sample_indices:size(1) == n_sample, "wrong number of samples")
+ end
end
function test.get_device()