Merge remote-tracking branch 'upstream/master' into more-generic-functions

author: Trevor Killeen <killeentm@gmail.com> 2016-10-10 20:09:43 +0300
committer: Trevor Killeen <killeentm@gmail.com> 2016-10-10 20:09:43 +0300
commit: e4ebbd3a2d668ae51d49d9deaf2c62fc6de81a8e (patch)
tree: c0c3cbf24139a1504ff232476dd7b64987605821
parent: 7860a76e1cc50e5c679a965c95cdca2501cac9bc (diff)
parent: 9efd392d4974e3fe3b1469809971d5d4f360ec7b (diff)
43 files changed, 404 insertions, 338 deletions
diff --git a/FFI.lua b/FFI.lua
index bd3ad1c..9417deb 100644
--- a/FFI.lua
+++ b/FFI.lua
@@ -50,7 +50,7 @@ cudaStream_t THCState_getCurrentStream(THCState *state);
 typedef struct THCStorage
 {
     real *data;
-    long size;
+    ptrdiff_t size;
     int refcount;
     char flag;
     THAllocator *allocator;
@@ -65,7 +65,7 @@ typedef struct THCTensor
     int nDimension;
 
     THCStorage *storage;
-    long storageOffset;
+    ptrdiff_t storageOffset;
     int refcount;
 
     char flag;
diff --git a/TensorMath.lua b/TensorMath.lua
index abdda0b..e917f8c 100644
--- a/TensorMath.lua
+++ b/TensorMath.lua
@@ -2,6 +2,58 @@ local wrap = require 'cwrap'
 
 local interface = wrap.CInterface.new()
 local method = wrap.CInterface.new()
+local argtypes = wrap.CInterface.argtypes
+
+argtypes['ptrdiff_t'] = {
+  -- Handler table for C `ptrdiff_t` arguments; helpname returns the type's display name.
+  helpname = function(arg)
+                return 'ptrdiff_t'
+             end,
+  -- Returns the C declaration of the local argN, initialised with a numeric default (0 if none).
+  declare = function(arg)
+               -- "%.0f" writes integer defaults in full; "%g" kept only 6 significant digits
+               local default = tonumber(tostring(arg.default)) or 0
+               return string.format("%s arg%d = %.0f;", 'ptrdiff_t', arg.i, default)
+            end,
+  -- Returns a C expression that tests whether Lua stack slot idx holds an integer.
+  check = function(arg, idx)
+             return string.format("lua_isinteger(L, %d)", idx)
+          end,
+  -- Returns a C statement that loads Lua stack slot idx into argN.
+  read = function(arg, idx)
+            return string.format("arg%d = (%s)lua_tointeger(L, %d);", arg.i, 'ptrdiff_t', idx)
+         end,
+  -- Returns a C assignment for a default that is not a plain number; nil otherwise.
+  init = function(arg)
+            -- numeric defaults are already covered by declare
+            if arg.default then
+               local default = tostring(arg.default)
+               if not tonumber(default) then
+                  return string.format("arg%d = %s;", arg.i, default)
+               end
+            end
+         end,
+  -- Returns the name of the generated C local, argN.
+  carg = function(arg)
+            return string.format('arg%d', arg.i)
+         end,
+  -- Returns the same C local name (presumably the slot for a C return value -- confirm in cwrap).
+  creturn = function(arg)
+               return string.format('arg%d', arg.i)
+            end,
+  -- Returns a C statement pushing argN onto the Lua stack when arg.returned is set.
+  precall = function(arg)
+               if arg.returned then
+                  return string.format('lua_pushinteger(L, (lua_Integer)arg%d);', arg.i)
+               end
+            end,
+  -- Returns a C statement pushing argN onto the Lua stack when arg.creturned is set.
+  postcall = function(arg)
+                if arg.creturned then
+                   return string.format('lua_pushinteger(L, (lua_Integer)arg%d);', arg.i)
+                end
+             end
+}
 
 interface:print('/* WARNING: autogenerated file */')
 interface:print('')
@@ -559,7 +611,7 @@ for k, Tensor_ in pairs(handledTypenames) do
     wrap("numel",
          cname("numel"),
          {{name=Tensor},
-            {name="long", creturned=true}})
+            {name="ptrdiff_t", creturned=true}})
 
     wrap("add",
          cname("add"),
diff --git a/generic/CTensor.c b/generic/CTensor.c
index 5569c02..a9663ff 100644
--- a/generic/CTensor.c
+++ b/generic/CTensor.c
@@ -173,16 +173,16 @@ void THFloatTensor_kernel_copy(float *dst,
                                          long *dst_sz, long *dst_st, int dst_dim,
                                          float *src,
                                          long *src_sz, long *src_st, int src_dim,
-                                         long n_elem)
+                                         ptrdiff_t n_elem)
 {
-  long k;
+  ptrdiff_t k;
 
   for(k = 0; k < n_elem; k++)
   {
-    long src_idx = 0;
-    long src_rest = k;
-    long dst_idx = 0;
-    long dst_rest = k;
+    ptrdiff_t src_idx = 0;
+    ptrdiff_t src_rest = k;
+    ptrdiff_t dst_idx = 0;
+    ptrdiff_t dst_rest = k;
     int dim;
 
     for(dim = 0; dim < dst_dim; dim++)
@@ -206,7 +206,7 @@ static int cuda_FloatTensor_fakecopy(lua_State *L)
   THFloatTensor *self = luaT_checkudata(L, 1, "torch.FloatTensor");
   THFloatTensor *src = luaT_checkudata(L, 2, "torch.FloatTensor");
   long *d_self_sz, *d_self_st, *d_src_sz, *d_src_st;
-  long nElement = THFloatTensor_nElement(self);
+  ptrdiff_t nElement = THFloatTensor_nElement(self);
 
   THArgCheck(THFloatTensor_nElement(self) == THFloatTensor_nElement(src), 2, "sizes do not match");
 
diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt
index 181bc9d..b9ddfbe 100644
--- a/lib/THC/CMakeLists.txt
+++ b/lib/THC/CMakeLists.txt
@@ -211,6 +211,7 @@ INSTALL(FILES
           THCReduceAll.cuh
           THCReduceApplyUtils.cuh
           THCAsmUtils.cuh
+          THCAtomics.cuh
           THCScanUtils.cuh
           THCSortUtils.cuh
           THCAllocator.h
diff --git a/lib/THC/THCAllocator.c b/lib/THC/THCAllocator.c
index e2cd09a..1bed0fb 100644
--- a/lib/THC/THCAllocator.c
+++ b/lib/THC/THCAllocator.c
@@ -1,6 +1,6 @@
 #include "THCAllocator.h"
 
-static void *THCudaHostAllocator_alloc(void* ctx, long size) {
+static void *THCudaHostAllocator_alloc(void* ctx, ptrdiff_t size) {
   void* ptr;
 
   if (size < 0) THError("Invalid memory size: %ld", size);
@@ -18,7 +18,7 @@ static void THCudaHostAllocator_free(void* ctx, void* ptr) {
   THCudaCheck(cudaFreeHost(ptr));
 }
 
-static void *THCudaHostAllocator_realloc(void* ctx, void* ptr, long size) {
+static void *THCudaHostAllocator_realloc(void* ctx, void* ptr, ptrdiff_t size) {
   if (size < 0) THError("Invalid memory size: %ld", size);
 
   THCudaHostAllocator_free(ctx, ptr);
diff --git a/lib/THC/THCApply.cuh b/lib/THC/THCApply.cuh
index dd6d32a..a47e303 100644
--- a/lib/THC/THCApply.cuh
+++ b/lib/THC/THCApply.cuh
@@ -101,7 +101,7 @@ inline dim3 getApplyBlock() {
   return dim3(THC_APPLY_THREADS_PER_BLOCK);
 }
 
-inline bool getApplyGrid(THCState* state, long totalElements, dim3& grid) {
+inline bool getApplyGrid(THCState* state, ptrdiff_t totalElements, dim3& grid) {
   int curDevice = -1;
   cudaGetDevice(&curDevice);
 
@@ -116,7 +116,7 @@ inline bool getApplyGrid(THCState* state, long totalElements, dim3& grid) {
   // 16 warps per block * 4 per SM gives 64 warps per SM at maximum,
   // which seems to be a good sweetspot for latency hiding
   grid = dim3(min((long long) THCCeilDiv(totalElements,
-                                         (long) THC_APPLY_THREADS_PER_BLOCK),
+                                         (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK),
                   4LL * numSM));
   return true;
 }
@@ -139,7 +139,7 @@ bool THC_pointwiseApply1(THCState* state,
   const dim3 block = getApplyBlock();
 
   dim3 grid;
-  long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
+  ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
 
   if (!getApplyGrid(state, totalElements, grid)) {
     return false;
@@ -253,7 +253,7 @@ bool THC_pointwiseApply2(THCState* state,
                          const Op& op,
                          TensorArgType aType = ReadWrite,
                          TensorArgType bType = ReadOnly) {
-  long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
+  ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
 
   if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b)) {
     return false;
@@ -431,7 +431,7 @@ bool THC_pointwiseApply3(THCState* state,
                          TensorArgType aType = ReadWrite,
                          TensorArgType bType = ReadOnly,
                          TensorArgType cType = ReadOnly) {
-  long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
+  ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a);
 
   if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b) ||
       totalElements != TensorUtils<TensorTypeC>::getNumElements(state, c)) {
diff --git a/lib/THC/THCAtomics.cuh b/lib/THC/THCAtomics.cuh
new file mode 100644
index 0000000..0586edf
--- /dev/null
+++ b/lib/THC/THCAtomics.cuh
@@ -0,0 +1,131 @@
+#ifndef THC_ATOMICS_INC
+#define THC_ATOMICS_INC
+
+#include "THCHalf.h"
+
+// Size-dispatched atomic add for integer types; specialised below on the byte width n.
+template <typename T, size_t n>
+struct AtomicAddIntegerImpl;
+// 1-byte case: CAS loop on the aligned 32-bit word that contains *address.
+template<typename T>
+struct AtomicAddIntegerImpl<T, 1> {
+  __device__ void operator()(T *address, T val) {
+    const unsigned int byteOffset = (size_t)address & 3;
+    unsigned int * address_as_ui = (unsigned int *) (address - byteOffset);
+    unsigned int old = *address_as_ui;
+    const unsigned int shift = byteOffset * 8;
+    unsigned int sum, assumed;
+    do {
+      assumed = old;
+      // Keep only 8 bits: a carry or sign extension must not leak into the neighbouring bytes.
+      sum = (unsigned int)(val + T((old >> shift) & 0xff)) & 0xffu;
+      old = (old & ~(0xffu << shift)) | (sum << shift);
+      old = atomicCAS(address_as_ui, assumed, old);
+    } while (assumed != old);
+  }
+};
+
+// 2-byte case: CAS loop on the aligned 32-bit word that contains *address.
+template<typename T>
+struct AtomicAddIntegerImpl<T, 2> {
+  __device__ void operator()(T *address, T val) {
+    const bool upperHalf = ((size_t)address & 2) != 0;
+    unsigned int * address_as_ui =
+        (unsigned int *) ((char *)address - ((size_t)address & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int sum, newval, assumed;
+    do {
+      assumed = old;
+      // Select first, then add: `val + x & 2 ? a : b` parses as `((val + x) & 2) ? a : b` and drops val.
+      sum = (unsigned int)(val + (upperHalf ? T(old >> 16) : T(old & 0xffff))) & 0xffffu;
+      newval = upperHalf ? (old & 0xffff) | (sum << 16) : (old & 0xffff0000) | sum;
+      old = atomicCAS(address_as_ui, assumed, newval);
+    } while (assumed != old);
+  }
+};
+
+// 4-byte case: compare-and-swap retry loop on the 32-bit word at address.
+template<typename T>
+struct AtomicAddIntegerImpl<T, 4> {
+  __device__ void operator()(T *address, T val) {
+    unsigned int *word = (unsigned int *) (address);
+    unsigned int seen = *word;
+    unsigned int expected;
+    do {
+      // Loop again whenever atomicCAS reports a value other than the one the sum was based on.
+      expected = seen;
+      const unsigned int updated = val + (T)seen;
+      seen = atomicCAS(word, expected, updated);
+    } while (expected != seen);
+  }
+};
+
+// 8-byte case: compare-and-swap retry loop on the 64-bit word at address.
+template<typename T>
+struct AtomicAddIntegerImpl<T, 8> {
+  __device__ void operator()(T *address, T val) {
+    unsigned long long *word = (unsigned long long *) (address);
+    unsigned long long seen = *word;
+    unsigned long long expected;
+    do {
+      // Loop again whenever atomicCAS reports a value other than the one the sum was based on.
+      expected = seen;
+      const unsigned long long updated = val + (T)seen;
+      seen = atomicCAS(word, expected, updated);
+    } while (expected != seen);
+  }
+};
+
+// atomicAdd overloads for unsigned char, char, short and long; each forwards to the
+// AtomicAddIntegerImpl specialisation selected by sizeof. `static inline` gives every
+// including .cu file its own copy, so this shared header cannot cause duplicate definitions.
+static inline __device__ void atomicAdd(unsigned char *address, unsigned char val) {
+  AtomicAddIntegerImpl<unsigned char, sizeof(unsigned char)>()(address, val);
+}
+static inline __device__ void atomicAdd(char *address, char val) {
+  AtomicAddIntegerImpl<char, sizeof(char)>()(address, val);
+}
+static inline __device__ void atomicAdd(short *address, short val) {
+  AtomicAddIntegerImpl<short, sizeof(short)>()(address, val);
+}
+static inline __device__ void atomicAdd(long *address, long val) {
+  AtomicAddIntegerImpl<long, sizeof(long)>()(address, val);
+}
+
+#ifdef CUDA_HALF_TENSOR
+// half atomicAdd: CAS loop on the aligned 32-bit word that contains *address.
+// NOTE(review): uses THCNumerics<half>, but only THCHalf.h is included by this header -- confirm THCNumerics.cuh always precedes it.
+static inline __device__ void atomicAdd(half *address, half val) {
+  unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
+  unsigned int old = *address_as_ui;
+  unsigned int assumed;
+  do {
+    assumed = old;
+    half hsum;
+    hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+    hsum = THCNumerics<half>::add(hsum, val);
+    old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
+    old = atomicCAS(address_as_ui, assumed, old);
+  } while (assumed != old);
+}
+#endif
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
+// Software atomicAdd for double, compiled only when __CUDA_ARCH__ is defined and below 600.
+// Adapted from the CUDA C Programming Guide.
+static inline __device__ void atomicAdd(double *address, double val) {
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull;
+  unsigned long long int assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val +
+                    __longlong_as_double(assumed)));
+
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+  } while (assumed != old);
+}
+#endif
+
+#endif // THC_ATOMICS_INC
diff --git a/lib/THC/THCDeviceTensor-inl.cuh b/lib/THC/THCDeviceTensor-inl.cuh
index a0058ad..9237d07 100644
--- a/lib/THC/THCDeviceTensor-inl.cuh
+++ b/lib/THC/THCDeviceTensor-inl.cuh
@@ -103,9 +103,9 @@ THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() const {
 
 template <typename T, int Dim,
           typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ long
+__host__ __device__ ptrdiff_t
 THCDeviceTensor<T, Dim, IndexT, PtrTraits>::numElements() const {
-  long size = getSize(0);
+  ptrdiff_t size = getSize(0);
 
   for (int i = 1; i < Dim; ++i) {
     size *= getSize(i);
diff --git a/lib/THC/THCDeviceTensor.cuh b/lib/THC/THCDeviceTensor.cuh
index 14d56c6..c6b7899 100644
--- a/lib/THC/THCDeviceTensor.cuh
+++ b/lib/THC/THCDeviceTensor.cuh
@@ -142,7 +142,7 @@ class THCDeviceTensor {
 
   /// Returns the total number of elements contained within our data
   /// (product of `getSize(i)`)
-  __host__ __device__ long numElements() const;
+  __host__ __device__ ptrdiff_t numElements() const;
 
   /// Returns the size array.
   __host__ __device__ __forceinline__ const IndexT* sizes() const {
diff --git a/lib/THC/THCDeviceTensorUtils-inl.cuh b/lib/THC/THCDeviceTensorUtils-inl.cuh
index f954772..26c1bb8 100644
--- a/lib/THC/THCDeviceTensorUtils-inl.cuh
+++ b/lib/THC/THCDeviceTensorUtils-inl.cuh
@@ -10,7 +10,7 @@ toDeviceTensor(THCState* state, THCudaTensor* t) {
 
   // Determine the maximum offset into the tensor achievable; `IndexT`
   // must be smaller than this type in order to use it.
-  long maxOffset = 0;
+  ptrdiff_t maxOffset = 0;
   IndexT sizes[Dim];
   IndexT strides[Dim];
 
diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c
index 07dbf2c..5bcce19 100644
--- a/lib/THC/THCGeneral.c
+++ b/lib/THC/THCGeneral.c
@@ -60,8 +60,8 @@ struct THCState {
 
   void (*cutorchGCFunction)(void *data);
   void *cutorchGCData;
-  long heapSoftmax;
-  long heapDelta;
+  ptrdiff_t heapSoftmax;
+  ptrdiff_t heapDelta;
 };
 
 THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
@@ -640,8 +640,9 @@ void __THCublasCheck(cublasStatus_t status, const char *file, const int line)
   }
 }
 
-static long heapSize = 0; // not thread-local
-static const long heapMaxDelta = 1e6;
+static ptrdiff_t heapSize = 0; // not thread-local
+static const ptrdiff_t heapMaxDelta = (ptrdiff_t)1e6;
+static const ptrdiff_t heapMinDelta = (ptrdiff_t)-1e6;
 static const double heapSoftmaxGrowthThresh = 0.8; // grow softmax if >80% max after GC
 static const double heapSoftmaxGrowthFactor = 1.4; // grow softmax by 40%
 
@@ -691,8 +692,8 @@ cudaError_t THCudaFree(THCState *state, void *ptr)
   return allocator->free(allocator->state, ptr);
 }
 
-static long applyHeapDelta(THCState *state) {
-  long newHeapSize = THAtomicAddLong(&heapSize, state->heapDelta) + state->heapDelta;
+static ptrdiff_t applyHeapDelta(THCState *state) {
+  ptrdiff_t newHeapSize = THAtomicAddPtrdiff(&heapSize, state->heapDelta) + state->heapDelta;
   state->heapDelta = 0;
   return newHeapSize;
 }
@@ -701,27 +702,27 @@ static long applyHeapDelta(THCState *state) {
 // When THC heap size goes above this softmax, the GC hook is triggered.
 // If heap size is above 80% of the softmax after GC, then the softmax is
 // increased.
-static void maybeTriggerGC(THCState *state, long curHeapSize) {
+static void maybeTriggerGC(THCState *state, ptrdiff_t curHeapSize) {
   if (state->cutorchGCFunction != NULL && curHeapSize > state->heapSoftmax) {
     (state->cutorchGCFunction)(state->cutorchGCData);
 
     // ensure heapSize is accurate before updating heapSoftmax
-    long newHeapSize = applyHeapDelta(state);
+    ptrdiff_t newHeapSize = applyHeapDelta(state);
 
     if (newHeapSize > state->heapSoftmax * heapSoftmaxGrowthThresh) {
-      state->heapSoftmax = state->heapSoftmax * heapSoftmaxGrowthFactor;
+      state->heapSoftmax = (ptrdiff_t)(state->heapSoftmax * heapSoftmaxGrowthFactor); // cast the scaled double, not the operand
     }
   }
 }
 
-void THCHeapUpdate(THCState *state, long size) {
+void THCHeapUpdate(THCState *state, ptrdiff_t size) {
   state->heapDelta += size;
   // batch updates to global heapSize to minimize thread contention
-  if (labs(state->heapDelta) < heapMaxDelta) {
+  if (state->heapDelta < heapMaxDelta && state->heapDelta > heapMinDelta) {
     return;
   }
 
-  long newHeapSize = applyHeapDelta(state);
+  ptrdiff_t newHeapSize = applyHeapDelta(state);
   if (size > 0) {
     maybeTriggerGC(state, newHeapSize);
   }
diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in
index 8837f42..ce3ffc5 100644
--- a/lib/THC/THCGeneral.h.in
+++ b/lib/THC/THCGeneral.h.in
@@ -114,6 +114,6 @@ THC_API cudaError_t THCudaFree(THCState *state, void *ptr);
 THC_API void THCSetGCHandler(THCState *state,
                              void (*torchGCHandlerFunction)(void *data),
                              void *data );
-THC_API void THCHeapUpdate(THCState *state, long size);
+THC_API void THCHeapUpdate(THCState *state, ptrdiff_t size);
 
 #endif
diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu
index 7777bf7..5a77293 100644
--- a/lib/THC/THCHalf.cu
+++ b/lib/THC/THCHalf.cu
@@ -10,7 +10,7 @@ struct __float2halfOp {
   __device__ half operator()(float v) { return __float2half(v); }
 };
 
-void THCFloat2Half(THCState *state, half *out, float *in, long len) {
+void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len) {
   thrust::transform(
 #if CUDA_VERSION >= 7000
     thrust::cuda::par.on(THCState_getCurrentStream(state)),
@@ -20,7 +20,7 @@ void THCFloat2Half(THCState *state, half *out, float *in, long len) {
     in, in + len, out, __float2halfOp());
 }
 
-void THCHalf2Float(THCState *state, float *out, half *in, long len) {
+void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) {
   thrust::transform(
 #if CUDA_VERSION >= 7000
     thrust::cuda::par.on(THCState_getCurrentStream(state)),
diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h
index ec6d83f..795874e 100644
--- a/lib/THC/THCHalf.h
+++ b/lib/THC/THCHalf.h
@@ -18,8 +18,8 @@
 #include <cuda_fp16.h>
 #include <stdint.h>
 
-THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, long len);
-THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len);
+THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len);
+THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len);
 THC_API half THC_float2half(float a);
 THC_API float THC_half2float(half a);
 
diff --git a/lib/THC/THCReduce.cuh b/lib/THC/THCReduce.cuh
index a7135de..7f276a2 100644
--- a/lib/THC/THCReduce.cuh
+++ b/lib/THC/THCReduce.cuh
@@ -123,7 +123,7 @@ inline dim3 getNoncontigReduceBlock() {
   return dim3(THC_NONCONTIG_REDUCE_BLOCK_SIZE);
 }
 
-inline dim3 getContigReduceBlock(long numSlices, long reductionSize) {
+inline dim3 getContigReduceBlock(ptrdiff_t numSlices, long reductionSize) {
   // If the number of slices is low but the reduction dimension size
   // is high, then we should increase block size for greater parallelism.
   // Aim for at least 32 warps per SM (assume 15 SMs; don't bother
@@ -148,13 +148,13 @@ inline dim3 getContigReduceBlock(long numSlices, long reductionSize) {
   return dim3(numWarps * 32);
 }
 
-inline bool getNoncontigReduceGrid(long elements, dim3& grid) {
+inline bool getNoncontigReduceGrid(ptrdiff_t elements, dim3& grid) {
   // One output point per thread
   return THC_getGridFromTiles(THCCeilDiv(elements,
-                                         (long) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid);
+                                         (ptrdiff_t) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid);
 }
 
-inline bool getContigReduceGrid(long elements, dim3& grid) {
+inline bool getContigReduceGrid(ptrdiff_t elements, dim3& grid) {
   // One output point per block
   return THC_getGridFromTiles(elements, grid);
 }
@@ -169,11 +169,11 @@ bool THC_reduceDim(THCState* state,
                    const ReduceOp& reduceOp,
                    typename TensorUtils<TensorType>::DataType init,
                    int dim) {
-  long inElements = TensorUtils<TensorType>::getNumElements(state, in);
+  ptrdiff_t inElements = TensorUtils<TensorType>::getNumElements(state, in);
 
   long reductionSize = TensorUtils<TensorType>::getSize(state, in, dim);
   long reductionStride = TensorUtils<TensorType>::getStride(state, in, dim);
-  long outElements = inElements / reductionSize;
+  ptrdiff_t outElements = inElements / reductionSize;
 
   if (TensorUtils<TensorType>::getDims(state, out) > MAX_CUTORCH_DIMS ||
       TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) {
diff --git a/lib/THC/THCReduceAll.cuh b/lib/THC/THCReduceAll.cuh
index 498fb53..a9cea84 100644
--- a/lib/THC/THCReduceAll.cuh
+++ b/lib/THC/THCReduceAll.cuh
@@ -124,16 +124,16 @@ kernelReduceAllPass2(int numPass1Blocks,
 
 // Perform a two-pass reduction if the tensor is large enough to
 // warrant it.
-inline bool isTwoPassReductionSize(long elements) {
+inline bool isTwoPassReductionSize(ptrdiff_t elements) {
   return (elements > THC_TWO_PASS_REDUCTION_SIZE);
 }
 
 template <typename InT, typename AccT>
-inline long getTwoPassBlocks(THCState* state, long elements) {
-  long numBlocks = THCCeilDiv(elements, THC_REDUCE_ALL_BLOCK_SIZE);
+inline ptrdiff_t getTwoPassBlocks(THCState* state, ptrdiff_t elements) {
+  ptrdiff_t numBlocks = THCCeilDiv(elements, (ptrdiff_t)THC_REDUCE_ALL_BLOCK_SIZE);
 
   // We can only have as many blocks as there is scratch space
-  long scratchSpace =
+  ptrdiff_t scratchSpace =
     THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(AccT);
   THAssert(scratchSpace > 0);
 
@@ -146,14 +146,14 @@ inline long getTwoPassBlocks(THCState* state, long elements) {
 
 // Get the block/grid size that we want
 template <typename InT, typename AccT>
-inline void getPass1ReduceBlockGrid(THCState* state, long elements,
+inline void getPass1ReduceBlockGrid(THCState* state, ptrdiff_t elements,
                                     dim3& grid, dim3& block) {
   grid = dim3(getTwoPassBlocks<InT, AccT>(state, elements));
   block = dim3(THC_REDUCE_ALL_BLOCK_SIZE);
 }
 
 template <typename InT, typename AccT>
-inline void getPass2ReduceBlockGrid(THCState* state, long elements,
+inline void getPass2ReduceBlockGrid(THCState* state, ptrdiff_t elements,
                                     dim3& grid, dim3& block) {
   grid = dim3(1);
   // We only need as many threads as there were blocks originally
@@ -161,7 +161,7 @@ inline void getPass2ReduceBlockGrid(THCState* state, long elements,
 }
 
 template <typename InT, typename AccT>
-inline void getSinglePassReduceBlockGrid(long elements,
+inline void getSinglePassReduceBlockGrid(ptrdiff_t elements,
                                          dim3& grid, dim3& block) {
   grid = dim3(1);
   block = dim3(THC_REDUCE_ALL_BLOCK_SIZE);
@@ -176,7 +176,7 @@ template <typename ModifyOp,
           int ADims>
 void callReduceAll(THCState* state,
                    const TensorInfo<InT, IndexType>& in,
-                   long totalElements,
+                   ptrdiff_t totalElements,
                    AccT init,
                    const ModifyOp& modifyOp,
                    const ReduceOp& reduceOp,
@@ -229,7 +229,7 @@ bool THC_reduceAll(THCState* state,
                    AccT init,
                    AccT* out,
                    int outOnDevice) {
-  long inElements = TensorUtils<TensorType>::getNumElements(state, in);
+  ptrdiff_t inElements = TensorUtils<TensorType>::getNumElements(state, in);
 
   if (TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) {
     return false;
diff --git a/lib/THC/THCReduceApplyUtils.cu b/lib/THC/THCReduceApplyUtils.cu
index b3c6f51..6d4c06e 100644
--- a/lib/THC/THCReduceApplyUtils.cu
+++ b/lib/THC/THCReduceApplyUtils.cu
@@ -11,7 +11,7 @@ void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg) {
   THArgCheck(dims <= MAX_CUTORCH_DIMS, arg, CUTORCH_DIM_WARNING);
 }
 
-bool THC_getGridFromTiles(long gridTiles, dim3& grid) {
+bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid) {
   if (gridTiles > MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE) {
     return false;
   }
@@ -21,11 +21,11 @@ bool THC_getGridFromTiles(long gridTiles, dim3& grid) {
   long gridZ = 1;
 
   if (gridTiles > MAX_GRID_SIZE) {
-    gridTiles = THCCeilDiv(gridTiles, (long) MAX_GRID_SIZE);
+    gridTiles = THCCeilDiv(gridTiles, (ptrdiff_t) MAX_GRID_SIZE);
     gridY = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles;
 
     if (gridTiles > MAX_GRID_SIZE) {
-      gridTiles = THCCeilDiv(gridTiles, (long) MAX_GRID_SIZE);
+      gridTiles = THCCeilDiv(gridTiles, (ptrdiff_t) MAX_GRID_SIZE);
       gridZ = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles;
     }
   }
diff --git a/lib/THC/THCReduceApplyUtils.cuh b/lib/THC/THCReduceApplyUtils.cuh
index 06f969f..e365b3a 100644
--- a/lib/THC/THCReduceApplyUtils.cuh
+++ b/lib/THC/THCReduceApplyUtils.cuh
@@ -76,6 +76,6 @@ __device__ T reduceBlock(T* smem,
 void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg);
 
 // Produces a grid with at least one point per tile
-THC_API bool THC_getGridFromTiles(long gridTiles, dim3& grid);
+THC_API bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid);
 
 #endif // THC_REDUCE_APPLY_UTILS_INC
diff --git a/lib/THC/THCTensorConv.cu b/lib/THC/THCTensorConv.cu
index 2543d26..71aac03 100644
--- a/lib/THC/THCTensorConv.cu
+++ b/lib/THC/THCTensorConv.cu
@@ -354,7 +354,7 @@ THC_API void THCudaTensor_conv2Dmv(THCState *state, THCudaTensor *output, float
     nOutputCols = (nInputCols - nKernelCols) / scol + 1;
   }
 
-  long nelem = THCudaTensor_nElement(state, output);
+  ptrdiff_t nelem = THCudaTensor_nElement(state, output);
   THCudaTensor_resize3d(state, output, nOutputPlane, nOutputRows, nOutputCols);
 
   if (beta == 0 || nelem != THCudaTensor_nElement(state, output)) {
@@ -475,7 +475,7 @@ THC_API void THCudaTensor_conv2Dmm(THCState *state, THCudaTensor *output, float
     nOutputCols = (nInputCols - nKernelCols) / scol + 1;
   }
 
-  long nelem = THCudaTensor_nElement(state, output);
+  ptrdiff_t nelem = THCudaTensor_nElement(state, output);
   THCudaTensor_resize4d(state, output, nbatch, nOutputPlane, nOutputRows, nOutputCols);
 
   if (beta == 0 || nelem != THCudaTensor_nElement(state, output)) {
@@ -576,7 +576,7 @@ THC_API void THCudaTensor_conv2DRevger(THCState *state, THCudaTensor *output, fl
   nOutputRows = nInputRows - (nKernelRows - 1) * srow;
   nOutputCols = nInputCols - (nKernelCols - 1) * scol;
 
-  long nelem = THCudaTensor_nElement(state, output);
+  ptrdiff_t nelem = THCudaTensor_nElement(state, output);
   THCudaTensor_resize4d(state, output, nKernelPlane, nInputPlane, nOutputRows, nOutputCols);
 
   if (nelem == 0 || beta == 0 || nelem != THCudaTensor_nElement(state, output)) {
@@ -649,7 +649,7 @@ THC_API void THCudaTensor_conv2DRevgerm(THCState *state, THCudaTensor *output, f
   nOutputRows = nInputRows - (nKernelRows - 1) * srow;
   nOutputCols = nInputCols - (nKernelCols - 1) * scol;
 
-  long nelem = THCudaTensor_nElement(state, output);
+  ptrdiff_t nelem = THCudaTensor_nElement(state, output);
   THCudaTensor_resize4d(state, output, nKernelPlane, nInputPlane, nOutputRows, nOutputCols);
 
   if (nelem == 0 || beta == 0 || nelem != THCudaTensor_nElement(state, output)) {
@@ -912,7 +912,7 @@ THC_API void THCudaTensor_conv2Dmap(THCState *state, THCudaTensor *output, THCud
   nOutputRows = (nInputRows - nKernelRows) / stride_y + 1;
   nOutputCols = (nInputCols - nKernelCols) / stride_x + 1;
 
-  // long nelem = THCudaTensor_nElement(state, output);
+  // ptrdiff_t nelem = THCudaTensor_nElement(state, output);
   THCudaTensor_resize3d(state, output, nOutputPlane, nOutputRows, nOutputCols);
 
   float *input_data = THCudaTensor_data(state, input);
diff --git a/lib/THC/THCTensorCopy.cu b/lib/THC/THCTensorCopy.cu
index d5aae4f..b0001c5 100644
--- a/lib/THC/THCTensorCopy.cu
+++ b/lib/THC/THCTensorCopy.cu
@@ -24,7 +24,7 @@ struct CopyOp {
 template <typename TensorTypeDst, typename TensorTypeSrc>
 void
 THC_copyTensor(THCState* state, TensorTypeDst* dst, TensorTypeSrc* src) {
-  long totalElements = TensorUtils<TensorTypeDst>::getNumElements(state, dst);
+  ptrdiff_t totalElements = TensorUtils<TensorTypeDst>::getNumElements(state, dst);
 
   THArgCheck(totalElements ==
              TensorUtils<TensorTypeSrc>::getNumElements(state, src),
diff --git a/lib/THC/THCTensorIndex.cu b/lib/THC/THCTensorIndex.cu
index 8bf1a88..415e625 100644
--- a/lib/THC/THCTensorIndex.cu
+++ b/lib/THC/THCTensorIndex.cu
@@ -9,6 +9,7 @@
 #include "THCReduce.cuh"
 #include "THCDeviceUtils.cuh"
 #include "THCNumerics.cuh"
+#include "THCAtomics.cuh"
 #include <algorithm> // for std::min
 
 // We prefer this kernel to avoid reloading index points if the number
@@ -96,131 +97,6 @@ __global__ void indexCopyLargeIndex(TensorInfo<T, IndexType> dst,
   }
 }
 
-template <typename T, size_t n>
-struct AtomicAddIntegerImpl;
-
-template<typename T>
-struct AtomicAddIntegerImpl<T, 1> {
-  __device__ void operator()(T *address, T val) {
-    unsigned int * address_as_ui =
-        (unsigned int *) (address - ((size_t)address & 3));
-    unsigned int old = *address_as_ui;
-    unsigned int shift = (((size_t)address & 3) * 8);
-    unsigned int sum;
-    unsigned int assumed;
-
-    do {
-      assumed = old;
-      sum = val + T((old >> shift) & 0xff);
-      old = (old & ~(0x000000ff << shift)) | (sum << shift);
-      old = atomicCAS(address_as_ui, assumed, old);
-    } while (assumed != old);
-  }
-};
-
-template<typename T>
-struct AtomicAddIntegerImpl<T, 2> {
-  __device__ void operator()(T *address, T val) {
-    unsigned int * address_as_ui =
-        (unsigned int *) ((char *)address - ((size_t)address & 2));
-    unsigned int old = *address_as_ui;
-    unsigned int sum;
-    unsigned int newval;
-    unsigned int assumed;
-
-    do {
-      assumed = old;
-      sum = val + (size_t)address & 2 ? T(old >> 16) : T(old & 0xffff);
-      newval = (size_t)address & 2 ? (old & 0xffff) | (sum << 16) : (old & 0xffff0000) | sum;
-      old = atomicCAS(address_as_ui, assumed, newval);
-    } while (assumed != old);
-  }
-};
-
-template<typename T>
-struct AtomicAddIntegerImpl<T, 4> {
-  __device__ void operator()(T *address, T val) {
-    unsigned int * address_as_ui = (unsigned int *) (address);
-    unsigned int old = *address_as_ui;
-    unsigned int newval;
-    unsigned int assumed;
-
-    do {
-      assumed = old;
-      newval = val +  (T)old;
-      old = atomicCAS(address_as_ui, assumed, newval);
-    } while (assumed != old);
-  }
-};
-
-template<typename T>
-struct AtomicAddIntegerImpl<T, 8> {
-  __device__ void operator()(T *address, T val) {
-    unsigned long long * address_as_ui = (unsigned long long *) (address);
-    unsigned long long old = *address_as_ui;
-    unsigned long long newval;
-    unsigned long long assumed;
-
-    do {
-      assumed = old;
-      newval = val +  (T)old;
-      old = atomicCAS(address_as_ui, assumed, newval);
-    } while (assumed != old);
-  }
-};
-
-__device__ void atomicAdd(unsigned char *address, unsigned char val) {
-  AtomicAddIntegerImpl<unsigned char, sizeof(unsigned char)>()(address, val);
-}
-
-__device__ void atomicAdd(char *address, char val) {
-  AtomicAddIntegerImpl<char, sizeof(char)>()(address, val);
-}
-
-__device__ void atomicAdd(short *address, short val) {
-  AtomicAddIntegerImpl<short, sizeof(short)>()(address, val);
-}
-
-__device__ void atomicAdd(long *address, long val) {
-  AtomicAddIntegerImpl<long, sizeof(long)>()(address, val);
-}
-
-#ifdef CUDA_HALF_TENSOR
-__device__ void atomicAdd(half *address, half val) {
-  unsigned int * address_as_ui =
-      (unsigned int *) ((char *)address - ((size_t)address & 2));
-  unsigned int old = *address_as_ui;
-  unsigned int assumed;
-
-  do {
-    assumed = old;
-    half hsum;
-    hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
-    hsum = THCNumerics<half>::add(hsum, val);
-    old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
-    old = atomicCAS(address_as_ui, assumed, old);
-   } while (assumed != old);
-}
-#endif
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-// from CUDA C Programmic Guide
-__device__  void atomicAdd(double *address, double val) {
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old = *address_as_ull;
-  unsigned long long int assumed;
-
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(val +
-                    __longlong_as_double(assumed)));
-
-    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-  } while (assumed != old);
-}
-#endif
-
 // We prefer this kernel to avoid reloading index points if the number
 // of indices is a small number.
 // This kernel in fact works for all choices of problem size, but if
diff --git a/lib/THC/THCTensorMath2.cu b/lib/THC/THCTensorMath2.cu
index afd262d..d1fe328 100644
--- a/lib/THC/THCTensorMath2.cu
+++ b/lib/THC/THCTensorMath2.cu
@@ -72,7 +72,7 @@ float THCudaTensor_dist(THCState *state, THCudaTensor *self, THCudaTensor *src,
 {
   THAssert(THCudaTensor_checkGPU(state, 2, self, src));
   self = THCudaTensor_newContiguous(state, self);
-  long size = THCudaTensor_nElement(state, self);
+  ptrdiff_t size = THCudaTensor_nElement(state, self);
   src = THCudaTensor_newContiguous(state, src);
   thrust::device_ptr<float> self_data(THCudaTensor_data(state, self));
   thrust::device_ptr<float> src_data(THCudaTensor_data(state, src));
@@ -103,4 +103,3 @@ void THCudaTensor_randn(THCState *state, THCudaTensor *r_, THLongStorage *size)
   THCudaTensor_resize(state, r_, size, NULL);
   THCudaTensor_normal(state, r_, 0, 1);
 }
-
diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu
index e7d0573..2695f2d 100644
--- a/lib/THC/THCTensorMathPairwise.cu
+++ b/lib/THC/THCTensorMathPairwise.cu
@@ -344,21 +344,21 @@ void THCudaTensor_triu(THCState *state, THCudaTensor *self_, THCudaTensor *src_,
 #include "THCGenerateAllTypes.h"
 
 // Copy the kth diagonal of a matrix B to a vector A.
-__global__ void THCudaTensor_copyFromDiagonal(float* a, float* b, long start, long size, long strideSum, long strideA) {
-  for (long linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+__global__ void THCudaTensor_copyFromDiagonal(float* a, float* b, ptrdiff_t start, ptrdiff_t size, ptrdiff_t strideSum, ptrdiff_t strideA) {
+  for (ptrdiff_t linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
        linearIndex < size;
        linearIndex += gridDim.x * blockDim.x) {
-    const long bOffset = start + strideSum * linearIndex;
+    const ptrdiff_t bOffset = start + strideSum * linearIndex;
     a[strideA * linearIndex] = b[bOffset];
   }
 }
 
 // Copy vector B to the kth diagonal of a matrix A
-__global__ void THCudaTensor_copyToDiagonal(float* a, float* b, long start, long size, long strideSum, long strideB) {
-  for (long linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
+__global__ void THCudaTensor_copyToDiagonal(float* a, float* b, ptrdiff_t start, ptrdiff_t size, ptrdiff_t strideSum, ptrdiff_t strideB) {
+  for (ptrdiff_t linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
        linearIndex < size;
        linearIndex += gridDim.x * blockDim.x) {
-    const long aOffset = start + strideSum * linearIndex;
+    const ptrdiff_t aOffset = start + strideSum * linearIndex;
     a[aOffset] = b[strideB * linearIndex];
   }
 }
@@ -381,16 +381,16 @@ void THCudaTensor_diag(THCState *state, THCudaTensor *self_, THCudaTensor *src_,
     THCudaTensor_copyFromDiagonal<<<grid, threads, 0, THCState_getCurrentStream(state)>>>
     (THCudaTensor_data(state, self_), THCudaTensor_data(state, src_), start, size, stride0 + stride1, strideSelf);
   } else {
-    long totalElements = THCudaTensor_nElement(state, src_);
-    long size = (k > 0) ? totalElements + k : totalElements - k;
+    ptrdiff_t totalElements = THCudaTensor_nElement(state, src_);
+    ptrdiff_t size = (k > 0) ? totalElements + k : totalElements - k;
     long strideSrc = THCudaTensor_stride(state, src_, 0);
     THCudaTensor_resize2d(state, self_, size, size);
     THCudaTensor_zero(state, self_);
     long stride0 = THCudaTensor_stride(state, self_, 0);
     long stride1 = THCudaTensor_stride(state, self_, 1);
     const dim3 threads(min((long long)THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock, (long long)size));
-    dim3 grid(min((long long)1024, (long long)THCCeilDiv(size, (long)threads.x)));
-    long start = (k >= 0 ? k * stride1 : -k * stride0);
+    dim3 grid(min((long long)1024, (long long)THCCeilDiv(size, (ptrdiff_t)threads.x)));
+    ptrdiff_t start = (k >= 0 ? k * stride1 : -k * stride0);
     THCudaTensor_copyToDiagonal<<<grid, threads, 0, THCState_getCurrentStream(state)>>>
     (THCudaTensor_data(state, self_), THCudaTensor_data(state, src_), start, totalElements, stride0 + stride1, strideSrc);
   }
diff --git a/lib/THC/THCTensorMathReduce.cuh b/lib/THC/THCTensorMathReduce.cuh
index 8e368be..db2e424 100644
--- a/lib/THC/THCTensorMathReduce.cuh
+++ b/lib/THC/THCTensorMathReduce.cuh
@@ -123,7 +123,7 @@ struct LogicalAny {
 };
 
 template<typename Real>
-__global__ void THCTensor_kernel_renorm(Real *data, const Real value, const long size, const Real maxnorm)
+__global__ void THCTensor_kernel_renorm(Real *data, const Real value, const ptrdiff_t size, const Real maxnorm)
 {
   __shared__ Real buffer[32];
   long tx = threadIdx.x;
@@ -134,7 +134,7 @@ __global__ void THCTensor_kernel_renorm(Real *data, const Real value, const long
   buffer[tx] = ScalarConvert<int, Real>::to(0);
 
   // get norm of axis
-  for (long i=tx; i<size; i+=step)
+  for (ptrdiff_t i=tx; i<size; i+=step)
   {
     buffer[tx] = THCNumerics<Real>::add(
       buffer[tx],
@@ -163,7 +163,7 @@ __global__ void THCTensor_kernel_renorm(Real *data, const Real value, const long
       )
     );
     // renormalize
-    for (long i=tx; i<size; i+=step)
+    for (ptrdiff_t i=tx; i<size; i+=step)
     {
       row[i] = THCNumerics<Real>::mul(row[i], norm);
     }
@@ -326,7 +326,7 @@ __host__ void THCTensor_varOuterDim(THCState *state, TensorTypeK *tgt, TensorTyp
   unsigned ndim = TensorUtils<TensorTypeK>::getDims(state, src);
   // Treat all outer dimensions (i.e. dim < dimension) as one.
   unsigned num_orows = 1;
-  for (unsigned dim = 0; dim < dimension; dim++) {
+  for (long dim = 0; dim < dimension; dim++) {
     num_orows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
   }
   unsigned row_size = TensorUtils<TensorTypeK>::getSize(state, src, dimension);
diff --git a/lib/THC/THCTensorRandom.cu b/lib/THC/THCTensorRandom.cu
index 769a8ba..05b41b3 100644
--- a/lib/THC/THCTensorRandom.cu
+++ b/lib/THC/THCTensorRandom.cu
@@ -238,13 +238,13 @@ __global__ void generate_log_normal(curandStateMtgp32 *state, int size, float *r
   }
 }
 
-#define NUM_BLOCKS min((int)THCCeilDiv(size, (long) BLOCK_SIZE), MAX_NUM_BLOCKS)
+#define NUM_BLOCKS min(THCCeilDiv(size, (ptrdiff_t) BLOCK_SIZE), (ptrdiff_t) MAX_NUM_BLOCKS)
 THC_API void THCudaTensor_uniform(THCState* state, THCudaTensor *self_, double a, double b)
 {
   THAssert(THCudaTensor_checkGPU(state, 1, self_));
   Generator* gen = THCRandom_getGenerator(state);
   THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
-  long size = THCudaTensor_nElement(state, self);
+  ptrdiff_t size = THCudaTensor_nElement(state, self);
   float *data = THCudaTensor_data(state, self);
 
   generate_uniform<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
@@ -258,7 +258,7 @@ THC_API void THCudaTensor_bernoulli(THCState* state, THCudaTensor *self_, double
   THAssert(THCudaTensor_checkGPU(state, 1, self_));
   Generator* gen = THCRandom_getGenerator(state);
   THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
-  long size = THCudaTensor_nElement(state, self);
+  ptrdiff_t size = THCudaTensor_nElement(state, self);
   float *data = THCudaTensor_data(state, self);
 
   generate_bernoulli<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
@@ -272,7 +272,7 @@ THC_API void THCudaTensor_normal(THCState* state, THCudaTensor *self_, double me
   THAssert(THCudaTensor_checkGPU(state, 1, self_));
   Generator* gen = THCRandom_getGenerator(state);
   THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
-  long size = THCudaTensor_nElement(state, self);
+  ptrdiff_t size = THCudaTensor_nElement(state, self);
   float *data = THCudaTensor_data(state, self);
 
   generate_normal<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
@@ -287,7 +287,7 @@ THC_API void THCudaTensor_logNormal(THCState* state, THCudaTensor *self_, double
   Generator* gen = THCRandom_getGenerator(state);
 
   THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
-  long size = THCudaTensor_nElement(state, self);
+  ptrdiff_t size = THCudaTensor_nElement(state, self);
   float *data = THCudaTensor_data(state, self);
 
   generate_log_normal<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
@@ -302,7 +302,7 @@ THC_API void THCudaTensor_geometric(THCState* state, THCudaTensor *self_, double
   Generator* gen = THCRandom_getGenerator(state);
 
   THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
-  long size = THCudaTensor_nElement(state, self);
+  ptrdiff_t size = THCudaTensor_nElement(state, self);
   float *data = THCudaTensor_data(state, self);
 
   generate_geometric<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
@@ -317,7 +317,7 @@ THC_API void THCudaTensor_exponential(THCState* state, THCudaTensor *self_, doub
   Generator* gen = THCRandom_getGenerator(state);
 
   THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
-  long size = THCudaTensor_nElement(state, self);
+  ptrdiff_t size = THCudaTensor_nElement(state, self);
   float *data = THCudaTensor_data(state, self);
 
   generate_exponential<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
@@ -332,7 +332,7 @@ THC_API void THCudaTensor_cauchy(THCState* state, THCudaTensor *self_, double me
   Generator* gen = THCRandom_getGenerator(state);
 
   THCudaTensor *self = THCudaTensor_newContiguous(state, self_);
-  long size = THCudaTensor_nElement(state, self);
+  ptrdiff_t size = THCudaTensor_nElement(state, self);
   float *data = THCudaTensor_data(state, self);
 
   generate_cauchy<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
@@ -704,7 +704,7 @@ THC_API void THCudaTensor_multinomial(struct THCState *state,
 
       // Each warp in a block will generate a sample from a different
       // distribution concurrently.
-      long numBlocks = THCCeilDiv(numDist, 4L);
+      ptrdiff_t numBlocks = THCCeilDiv(numDist, 4L);
       dim3 grid(numBlocks < MAX_NUM_BLOCKS ? numBlocks : MAX_NUM_BLOCKS);
 
       for (int sample = 0; sample < n_sample; ++sample) {
diff --git a/lib/THC/THCTensorSort.cu b/lib/THC/THCTensorSort.cu
index f2dd0e9..589d3e9 100644
--- a/lib/THC/THCTensorSort.cu
+++ b/lib/THC/THCTensorSort.cu
@@ -22,9 +22,9 @@ void THCudaLongTensor_fillSliceWithIndex(THCState* state,
   long dims = THCudaLongTensor_nDimension(state, t);
   THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
 
-  long inElements = THCudaLongTensor_nElement(state, t);
+  ptrdiff_t inElements = THCudaLongTensor_nElement(state, t);
   long sliceSize = THCudaLongTensor_size(state, t, dim);
-  long numSlices = inElements / sliceSize;
+  ptrdiff_t numSlices = inElements / sliceSize;
 
   dim3 grid;
   if (!THC_getGridFromTiles(numSlices, grid)) {
diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu
index 96cd3bc..a273a72 100644
--- a/lib/THC/THCTensorTypeUtils.cu
+++ b/lib/THC/THCTensorTypeUtils.cu
@@ -80,7 +80,7 @@ TensorUtils<TENSOR_TYPE>::getData(THCState* state,                      \
   return (DATA_TYPE*) TENSOR_TYPE##_data(state, t);                     \
 }                                                                       \
                                                                         \
-long                                                                    \
+ptrdiff_t                                                               \
 TensorUtils<TENSOR_TYPE>::getNumElements(THCState* state,               \
                                          TENSOR_TYPE* t) {              \
   return TENSOR_TYPE##_nElement(state, t);                              \
@@ -185,18 +185,18 @@ TensorUtils<TENSOR_TYPE>::overlappingIndices(THCState* state,           \
 bool                                                                    \
 TensorUtils<TENSOR_TYPE>::canUse32BitIndexMath(THCState* state,         \
                                                TENSOR_TYPE* t) {        \
-  long elements = TensorUtils<TENSOR_TYPE>::getNumElements(state, t);   \
+  ptrdiff_t elements = TensorUtils<TENSOR_TYPE>::getNumElements(state, t);   \
   if (elements >= UINT_MAX) {                                           \
     return false;                                                       \
   }                                                                     \
                                                                         \
-  long offset = 0;                                                      \
-  long linearId = elements - 1;                                         \
+  ptrdiff_t offset = 0;                                                 \
+  ptrdiff_t linearId = elements - 1;                                    \
                                                                         \
   for (int i = TensorUtils<TENSOR_TYPE>::getDims(state, t) - 1; i >= 0; --i) { \
-    long curDimIndex =                                                  \
+    ptrdiff_t curDimIndex =                                             \
       linearId % TensorUtils<TENSOR_TYPE>::getSize(state, t, i);        \
-    long curDimOffset = curDimIndex *                                   \
+    ptrdiff_t curDimOffset = curDimIndex *                              \
       TensorUtils<TENSOR_TYPE>::getStride(state, t, i);                 \
     offset += curDimOffset;                                             \
     linearId /= TensorUtils<TENSOR_TYPE>::getSize(state, t, i);         \
diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh
index 22a2f92..4f5d516 100644
--- a/lib/THC/THCTensorTypeUtils.cuh
+++ b/lib/THC/THCTensorTypeUtils.cuh
@@ -50,7 +50,7 @@ struct TensorUtils {
     static void resizeAs(THCState* state, TENSOR_TYPE* dst,             \
                          TENSOR_TYPE* src);                             \
     static DATA_TYPE* getData(THCState* state, TENSOR_TYPE* t);         \
-    static long getNumElements(THCState* state, TENSOR_TYPE* t);        \
+    static ptrdiff_t getNumElements(THCState* state, TENSOR_TYPE* t);        \
     static long getSize(THCState* state, TENSOR_TYPE* t, int dim);      \
     static long getStride(THCState* state, TENSOR_TYPE* t, int dim);    \
     static int getDims(THCState* state, TENSOR_TYPE* t);                \
diff --git a/lib/THC/generic/THCStorage.c b/lib/THC/generic/THCStorage.c
index 61ba125..ad68526 100644
--- a/lib/THC/generic/THCStorage.c
+++ b/lib/THC/generic/THCStorage.c
@@ -7,7 +7,7 @@ real* THCStorage_(data)(THCState *state, const THCStorage *self)
   return self->data;
 }
 
-long THCStorage_(size)(THCState *state, const THCStorage *self)
+ptrdiff_t THCStorage_(size)(THCState *state, const THCStorage *self)
 {
   return self->size;
 }
@@ -17,13 +17,13 @@ int THCStorage_(elementSize)(THCState *state)
   return sizeof(real);
 }
 
-void THCStorage_(set)(THCState *state, THCStorage *self, long index, real value)
+void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, real value)
 {
   THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
   THCudaCheck(cudaMemcpy(self->data + index, &value, sizeof(real), cudaMemcpyHostToDevice));
 }
 
-real THCStorage_(get)(THCState *state, const THCStorage *self, long index)
+real THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index)
 {
   THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
   real value;
@@ -41,7 +41,7 @@ THCStorage* THCStorage_(new)(THCState *state)
   return storage;
 }
 
-THCStorage* THCStorage_(newWithSize)(THCState *state, long size)
+THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size)
 {
   THArgCheck(size >= 0, 2, "invalid size");
 
@@ -103,13 +103,13 @@ THCStorage* THCStorage_(newWithSize4)(THCState *state, real data0, real data1, r
   return self;
 }
 
-THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *fileName, long size, int isShared)
+THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *fileName, ptrdiff_t size, int isShared)
 {
   THError("not available yet for THCStorage");
   return NULL;
 }
 
-THCStorage* THCStorage_(newWithData)(THCState *state, real *data, long size)
+THCStorage* THCStorage_(newWithData)(THCState *state, real *data, ptrdiff_t size)
 {
   THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage));
   storage->data = data;
diff --git a/lib/THC/generic/THCStorage.cu b/lib/THC/generic/THCStorage.cu
index a6bb29a..63bccd7 100644
--- a/lib/THC/generic/THCStorage.cu
+++ b/lib/THC/generic/THCStorage.cu
@@ -12,7 +12,7 @@ void THCStorage_(fill)(THCState *state, THCStorage *self, real value)
     self_data, self_data+self->size, value);
 }
 
-void THCStorage_(resize)(THCState *state, THCStorage *self, long size)
+void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size)
 {
   THArgCheck(size >= 0, 2, "invalid size");
 
diff --git a/lib/THC/generic/THCStorage.h b/lib/THC/generic/THCStorage.h
index 194ad62..a46caad 100644
--- a/lib/THC/generic/THCStorage.h
+++ b/lib/THC/generic/THCStorage.h
@@ -9,7 +9,7 @@
 typedef struct THCStorage
 {
     real *data;
-    long size;
+    ptrdiff_t size;
     int refcount;
     char flag;
     THAllocator *allocator;
@@ -19,36 +19,36 @@ typedef struct THCStorage
 
 
 THC_API real* THCStorage_(data)(THCState *state, const THCStorage*);
-THC_API long THCStorage_(size)(THCState *state, const THCStorage*);
+THC_API ptrdiff_t THCStorage_(size)(THCState *state, const THCStorage*);
 THC_API int THCStorage_(elementSize)(THCState *state);
 
 /* slow access -- checks everything */
-THC_API void THCStorage_(set)(THCState *state, THCStorage*, long, real);
-THC_API real THCStorage_(get)(THCState *state, const THCStorage*, long);
+THC_API void THCStorage_(set)(THCState *state, THCStorage*, ptrdiff_t, real);
+THC_API real THCStorage_(get)(THCState *state, const THCStorage*, ptrdiff_t);
 
 THC_API THCStorage* THCStorage_(new)(THCState *state);
-THC_API THCStorage* THCStorage_(newWithSize)(THCState *state, long size);
+THC_API THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size);
 THC_API THCStorage* THCStorage_(newWithSize1)(THCState *state, real);
 THC_API THCStorage* THCStorage_(newWithSize2)(THCState *state, real, real);
 THC_API THCStorage* THCStorage_(newWithSize3)(THCState *state, real, real, real);
 THC_API THCStorage* THCStorage_(newWithSize4)(THCState *state, real, real, real, real);
-THC_API THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *filename, long size, int shared);
+THC_API THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *filename, ptrdiff_t size, int shared);
 
 /* takes ownership of data */
-THC_API THCStorage* THCStorage_(newWithData)(THCState *state, real *data, long size);
+THC_API THCStorage* THCStorage_(newWithData)(THCState *state, real *data, ptrdiff_t size);
 
-THC_API THCStorage* THCStorage_(newWithAllocator)(THCState *state, long size,
+THC_API THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size,
                                                       THAllocator* allocator,
                                                       void *allocatorContext);
 THC_API THCStorage* THCStorage_(newWithDataAndAllocator)(
-    THCState *state, real* data, long size, THAllocator* allocator, void *allocatorContext);
+    THCState *state, real* data, ptrdiff_t size, THAllocator* allocator, void *allocatorContext);
 
 THC_API void THCStorage_(setFlag)(THCState *state, THCStorage *storage, const char flag);
 THC_API void THCStorage_(clearFlag)(THCState *state, THCStorage *storage, const char flag);
 THC_API void THCStorage_(retain)(THCState *state, THCStorage *storage);
 
 THC_API void THCStorage_(free)(THCState *state, THCStorage *storage);
-THC_API void THCStorage_(resize)(THCState *state, THCStorage *storage, long size);
+THC_API void THCStorage_(resize)(THCState *state, THCStorage *storage, ptrdiff_t size);
 THC_API void THCStorage_(fill)(THCState *state, THCStorage *storage, real value);
 
 THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage);
diff --git a/lib/THC/generic/THCTensor.c b/lib/THC/generic/THCTensor.c
index e18044d..f6c82b5 100644
--- a/lib/THC/generic/THCTensor.c
+++ b/lib/THC/generic/THCTensor.c
@@ -8,7 +8,7 @@ THCStorage *THCTensor_(storage)(THCState *state, const THCTensor *self)
   return self->storage;
 }
 
-long THCTensor_(storageOffset)(THCState *state, const THCTensor *self)
+ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self)
 {
   return self->storageOffset;
 }
@@ -65,7 +65,7 @@ void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char flag)
 /**** creation methods ****/
 
 static void THCTensor_(rawInit)(THCState *state, THCTensor *self);
-static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, long storageOffset, int nDimension, long *size, long *stride);
+static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride);
 
 
 /* Empty init */
@@ -92,7 +92,7 @@ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor)
 }
 
 /* Storage init */
-THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, long storageOffset, THLongStorage *size, THLongStorage *stride)
+THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, THLongStorage *size, THLongStorage *stride)
 {
   THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor));
   if(size && stride)
@@ -109,20 +109,20 @@ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, long
 
   return self;
 }
-THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage, long storageOffset,
+THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset,
                                long size0, long stride0)
 {
   return THCTensor_(newWithStorage4d)(state, storage, storageOffset, size0, stride0, -1, -1,  -1, -1,  -1, -1);
 }
 
-THCTensor *THCTensor_(newWithStorage2d)(THCState *state, THCStorage *storage, long storageOffset,
+THCTensor *THCTensor_(newWithStorage2d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset,
                                long size0, long stride0,
                                long size1, long stride1)
 {
   return THCTensor_(newWithStorage4d)(state, storage, storageOffset, size0, stride0, size1, stride1,  -1, -1,  -1, -1);
 }
 
-THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage, long storageOffset,
+THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset,
                                long size0, long stride0,
                                long size1, long stride1,
                                long size2, long stride2)
@@ -130,7 +130,7 @@ THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage, lo
   return THCTensor_(newWithStorage4d)(state, storage, storageOffset, size0, stride0, size1, stride1,  size2, stride2,  -1, -1);
 }
 
-THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage, long storageOffset,
+THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset,
                                long size0, long stride0,
                                long size1, long stride1,
                                long size2, long stride2,
@@ -296,7 +296,7 @@ void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src)
                         src->stride);
 }
 
-void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_, THLongStorage *size_, THLongStorage *stride_)
+void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_)
 {
   if(size_ && stride_)
     THArgCheck(size_->size == stride_->size, 5, "inconsistent size/stride sizes");
@@ -310,7 +310,7 @@ void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storag
                       (stride_ ? stride_->data : NULL));
 }
 
-void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_,
+void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
                              long size0_, long stride0_)
 {
   THCTensor_(setStorage4d)(state, self, storage_, storageOffset_,
@@ -320,7 +320,7 @@ void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *stor
                             -1, -1);
 }
 
-void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_,
+void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
                              long size0_, long stride0_,
                              long size1_, long stride1_)
 {
@@ -331,7 +331,7 @@ void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *stor
                             -1, -1);
 }
 
-void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_,
+void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
                              long size0_, long stride0_,
                              long size1_, long stride1_,
                              long size2_, long stride2_)
@@ -343,7 +343,7 @@ void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *stor
                             -1, -1);
 }
 
-void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_,
+void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
                              long size0_, long stride0_,
                              long size1_, long stride1_,
                              long size2_, long stride2_,
@@ -578,13 +578,13 @@ int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTe
   return 1;
 }
 
-long THCTensor_(nElement)(THCState *state, const THCTensor *self)
+ptrdiff_t THCTensor_(nElement)(THCState *state, const THCTensor *self)
 {
   if(self->nDimension == 0)
     return 0;
   else
   {
-    long nElement = 1;
+    ptrdiff_t nElement = 1;
     int d;
     for(d = 0; d < self->nDimension; d++)
       nElement *= self->size[d];
@@ -637,7 +637,7 @@ static void THCTensor_(rawInit)(THCState *state, THCTensor *self)
   self->flag = TH_TENSOR_REFCOUNTED;
 }
 
-static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, long storageOffset, int nDimension, long *size, long *stride)
+static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride)
 {
   /* storage */
   if(self->storage != storage)
@@ -667,7 +667,7 @@ void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, lon
 {
   int d;
   int nDimension_;
-  long totalSize;
+  ptrdiff_t totalSize;
   int hascorrectsize = 1;
 
   nDimension_ = 0;
diff --git a/lib/THC/generic/THCTensor.h b/lib/THC/generic/THCTensor.h
index 175eaee..3c5291e 100644
--- a/lib/THC/generic/THCTensor.h
+++ b/lib/THC/generic/THCTensor.h
@@ -11,7 +11,7 @@ typedef struct THCTensor
     int nDimension;
 
     THCStorage *storage;
-    long storageOffset;
+    ptrdiff_t storageOffset;
     int refcount;
 
     char flag;
@@ -21,7 +21,7 @@ typedef struct THCTensor
 
 /**** access methods ****/
 THC_API THCStorage* THCTensor_(storage)(THCState *state, const THCTensor *self);
-THC_API long THCTensor_(storageOffset)(THCState *state, const THCTensor *self);
+THC_API ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self);
 THC_API int THCTensor_(nDimension)(THCState *state, const THCTensor *self);
 THC_API long THCTensor_(size)(THCState *state, const THCTensor *self, int dim);
 THC_API long THCTensor_(stride)(THCState *state, const THCTensor *self, int dim);
@@ -37,17 +37,17 @@ THC_API void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char
 THC_API THCTensor *THCTensor_(new)(THCState *state);
 THC_API THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor);
 /* stride might be NULL */
-THC_API THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage_, long storageOffset_, THLongStorage *size_, THLongStorage *stride_);
-THC_API THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage_, long storageOffset_,
+THC_API THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_);
+THC_API THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_,
                                 long size0_, long stride0_);
-THC_API THCTensor *THCTensor_(newWithStorage2d)(THCState *state, THCStorage *storage_, long storageOffset_,
+THC_API THCTensor *THCTensor_(newWithStorage2d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_,
                                 long size0_, long stride0_,
                                 long size1_, long stride1_);
-THC_API THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage_, long storageOffset_,
+THC_API THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_,
                                 long size0_, long stride0_,
                                 long size1_, long stride1_,
                                 long size2_, long stride2_);
-THC_API THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage_, long storageOffset_,
+THC_API THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_,
                                 long size0_, long stride0_,
                                 long size1_, long stride1_,
                                 long size2_, long stride2_,
@@ -77,17 +77,17 @@ THC_API void THCTensor_(resize5d)(THCState *state, THCTensor *tensor, long size0
 THC_API void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, long *size, long *stride);
 
 THC_API void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src);
-THC_API void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_, THLongStorage *size_, THLongStorage *stride_);
-THC_API void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_,
+THC_API void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_);
+THC_API void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
                                     long size0_, long stride0_);
-THC_API void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_,
+THC_API void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
                                     long size0_, long stride0_,
                                     long size1_, long stride1_);
-THC_API void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_,
+THC_API void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
                                     long size0_, long stride0_,
                                     long size1_, long stride1_,
                                     long size2_, long stride2_);
-THC_API void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_,
+THC_API void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_,
                                     long size0_, long stride0_,
                                     long size1_, long stride1_,
                                     long size2_, long stride2_,
@@ -105,7 +105,7 @@ THC_API int THCTensor_(isContiguous)(THCState *state, const THCTensor *self);
 THC_API int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTensor *src);
 THC_API int THCTensor_(isSetTo)(THCState *state, const THCTensor *self, const THCTensor *src);
 THC_API int THCTensor_(isSize)(THCState *state, const THCTensor *self, const THLongStorage *dims);
-THC_API long THCTensor_(nElement)(THCState *state, const THCTensor *self);
+THC_API ptrdiff_t THCTensor_(nElement)(THCState *state, const THCTensor *self);
 
 THC_API void THCTensor_(retain)(THCState *state, THCTensor *self);
 THC_API void THCTensor_(free)(THCState *state, THCTensor *self);
diff --git a/lib/THC/generic/THCTensorIndex.cu b/lib/THC/generic/THCTensorIndex.cu
index e17cebf..ce4c790 100644
--- a/lib/THC/generic/THCTensorIndex.cu
+++ b/lib/THC/generic/THCTensorIndex.cu
@@ -26,7 +26,7 @@ void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongT
   dims = THCudaLongTensor_nDimension(state, indices);
   THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING);
 
-  long numIndices = THCudaLongTensor_nElement(state, indices);
+  ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices);
 
   long srcDims = THCTensor_(nDimension)(state, src);
   cudaStream_t stream = THCState_getCurrentStream(state);
@@ -44,9 +44,9 @@ void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongT
   // total size of the tensor ignoring dimension `dim`;
   // -the number of indices we are choosing, which is the total size
   // of the tensor `indices`.
-  long srcTotalSize = THCTensor_(nElement)(state, src);
+  ptrdiff_t srcTotalSize = THCTensor_(nElement)(state, src);
   long dstCopyDimSize = THCTensor_(size)(state, dst, dim);
-  long sliceSize = srcTotalSize / numIndices;
+  ptrdiff_t sliceSize = srcTotalSize / numIndices;
 
   int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount;
 
@@ -62,11 +62,11 @@ void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongT
       dstInfo, srcInfo, indicesInfo,                            \
       dstCopyDim, srcCopyDim, sliceSize, dstCopyDimSize);
 
-  dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8)));
-  dim3 smallIndexBlock(std::min(sliceSize, 128L));
+  dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128));
 
-  dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, 128L), (long)(mpc * 8)));
-  dim3 largeIndexBlock(std::min(srcTotalSize, 128L));
+  dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 largeIndexBlock(std::min(srcTotalSize, (ptrdiff_t)128));
 
   if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, dst) &&
       TensorUtils<THCTensor>::canUse32BitIndexMath(state, src) &&
@@ -154,7 +154,7 @@ void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTe
   dims = THCudaLongTensor_nDimension(state, indices);
   THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING);
 
-  long numIndices = THCudaLongTensor_nElement(state, indices);
+  ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices);
 
   long srcDims = THCTensor_(nDimension)(state, src);
   cudaStream_t stream = THCState_getCurrentStream(state);
@@ -172,9 +172,9 @@ void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTe
   // total size of the tensor ignoring dimension `dim`;
   // -the number of indices we are choosing, which is the total size
   // of the tensor `indices`.
-  long srcTotalSize = THCTensor_(nElement)(state, src);
+  ptrdiff_t srcTotalSize = THCTensor_(nElement)(state, src);
   long dstAddDimSize = THCTensor_(size)(state, dst, dim);
-  long sliceSize = srcTotalSize / numIndices;
+  ptrdiff_t sliceSize = srcTotalSize / numIndices;
 
   int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount;
 
@@ -190,11 +190,11 @@ void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTe
       dstInfo, srcInfo, indicesInfo,                    \
       dstAddDim, srcAddDim, sliceSize, dstAddDimSize);
 
-  dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8)));
-  dim3 smallIndexBlock(std::min(sliceSize, 128L));
+  dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128));
 
-  dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, 128L), (long)(mpc * 8)));
-  dim3 largeIndexBlock(std::min(srcTotalSize, 128L));
+  dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 largeIndexBlock(std::min(srcTotalSize, (ptrdiff_t)128));
 
   if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, dst) &&
       TensorUtils<THCTensor>::canUse32BitIndexMath(state, src) &&
@@ -279,7 +279,7 @@ void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongT
   dims = THCudaLongTensor_nDimension(state, indices);
   THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING);
 
-  long numIndices = THCudaLongTensor_nElement(state, indices);
+  ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices);
 
   long srcDims = THCTensor_(nDimension)(state, dst);
   cudaStream_t stream = THCState_getCurrentStream(state);
@@ -296,9 +296,9 @@ void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongT
   // total size of the tensor ignoring dimension `dim`;
   // -the number of indices we are choosing, which is the total size
   // of the tensor `indices`.
-  long dstTotalSize = THCTensor_(nElement)(state, dst);
+  ptrdiff_t dstTotalSize = THCTensor_(nElement)(state, dst);
   long dstFillDimSize = THCTensor_(size)(state, dst, dim);
-  long sliceSize = dstTotalSize / dstFillDimSize;
+  ptrdiff_t sliceSize = dstTotalSize / dstFillDimSize;
 
   int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount;
 
@@ -314,11 +314,11 @@ void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongT
       dstInfo, indicesInfo,                             \
       dstFillDim, sliceSize, dstFillDimSize, val);
 
-  dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8)));
-  dim3 smallIndexBlock(std::min(sliceSize, 128L));
+  dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128));
 
-  dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, 128L), (long)(mpc * 8)));
-  dim3 largeIndexBlock(std::min(dstTotalSize, 128L));
+  dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 largeIndexBlock(std::min(dstTotalSize, (ptrdiff_t)128));
 
   if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, dst) &&
       TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, indices)) {
@@ -396,7 +396,7 @@ void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, in
   dims = THCudaLongTensor_nDimension(state, indices);
   THArgCheck(dims <= MAX_CUTORCH_DIMS, 5, CUTORCH_DIM_WARNING);
 
-  long numIndices = THCudaLongTensor_nElement(state, indices);
+  ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices);
 
   long srcDims = THCTensor_(nDimension)(state, src);
   cudaStream_t stream = THCState_getCurrentStream(state);
@@ -418,9 +418,9 @@ void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, in
   // total size of the tensor ignoring dimension `dim`;
   // -the number of indices we are choosing, which is the total size
   // of the tensor `indices`.
-  long dstTotalSize = THCTensor_(nElement)(state, dst);
+  ptrdiff_t dstTotalSize = THCTensor_(nElement)(state, dst);
   long srcSelectDimSize = THCTensor_(size)(state, src, dim);
-  long sliceSize = dstTotalSize / numIndices;
+  ptrdiff_t sliceSize = dstTotalSize / numIndices;
 
   int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount;
 
@@ -436,11 +436,11 @@ void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, in
       dstInfo, srcInfo, indicesInfo,                                    \
       dstSelectDim, srcSelectDim, dstTotalSize, sliceSize, srcSelectDimSize);
 
-  dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8)));
-  dim3 smallIndexBlock(std::min(sliceSize, 128L));
+  dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128));
 
-  dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, 128L), (long)(mpc * 8)));
-  dim3 largeIndexBlock(std::min(dstTotalSize, 128L));
+  dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8)));
+  dim3 largeIndexBlock(std::min(dstTotalSize, (ptrdiff_t)128));
 
   if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, dst) &&
       TensorUtils<THCTensor>::canUse32BitIndexMath(state, src) &&
diff --git a/lib/THC/generic/THCTensorMasked.cu b/lib/THC/generic/THCTensorMasked.cu
index e6a5704..333af29 100644
--- a/lib/THC/generic/THCTensorMasked.cu
+++ b/lib/THC/generic/THCTensorMasked.cu
@@ -38,16 +38,16 @@ THCTensor_(maskedCopy)(THCState* state,
                        THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src)
 {
   THAssert(THCTensor_(checkGPU)(state, 3, tensor, src, mask));
-  long maskSize = THCudaByteTensor_nElement(state, mask);
-  long tensorSize = THCTensor_(nElement)(state, tensor);
-  long srcSize = THCTensor_(nElement)(state, src);
+  ptrdiff_t maskSize = THCudaByteTensor_nElement(state, mask);
+  ptrdiff_t tensorSize = THCTensor_(nElement)(state, tensor);
+  ptrdiff_t srcSize = THCTensor_(nElement)(state, src);
 
   // `mask` and `tensor` must have the same number of elements
   THArgCheck(maskSize == tensorSize, 2,
              "mask and tensor must have the same number of elements");
 
   // Determine our output size
-  long totalElements = THCudaByteTensor_sumall(state, mask);
+  ptrdiff_t totalElements = THCudaByteTensor_sumall(state, mask);
 
   // The number of `1` elements present in the mask must be <= the
   // number of elements available in `src`
@@ -121,7 +121,7 @@ THCTensor_(maskedSelect)(THCState* state,
              2, "sizes do not match");
 
   // Determine our output size
-  long totalElements = THCudaByteTensor_sumall(state, mask);
+  ptrdiff_t totalElements = THCudaByteTensor_sumall(state, mask);
   THCTensor* tensorContig = THCTensor_(newContiguous)(state, tensor);
 
   THCTensor_(resize1d)(state, tensorContig, totalElements);
diff --git a/lib/THC/generic/THCTensorMath.cu b/lib/THC/generic/THCTensorMath.cu
index a0e550a..557f8f5 100644
--- a/lib/THC/generic/THCTensorMath.cu
+++ b/lib/THC/generic/THCTensorMath.cu
@@ -59,7 +59,7 @@ THCTensor_(reshape)(THCState *state, THCTensor *r_, THCTensor *t, THLongStorage
   THCTensor_(copy)(state, r_, t);
 }
 
-long
+ptrdiff_t
 THCTensor_(numel)(THCState *state, THCTensor *t)
 {
   return THCTensor_(nElement)(state, t);
diff --git a/lib/THC/generic/THCTensorMath.h b/lib/THC/generic/THCTensorMath.h
index 5c9e66d..cfc706a 100644
--- a/lib/THC/generic/THCTensorMath.h
+++ b/lib/THC/generic/THCTensorMath.h
@@ -8,6 +8,6 @@ THC_API void THCTensor_(zero)(THCState *state, THCTensor *self);
 THC_API void THCTensor_(zeros)(THCState *state, THCTensor *r_, THLongStorage *size);
 THC_API void THCTensor_(ones)(THCState *state, THCTensor *r_, THLongStorage *size);
 THC_API void THCTensor_(reshape)(THCState *state, THCTensor *r_, THCTensor *t, THLongStorage *size);
-THC_API long THCTensor_(numel)(THCState *state, THCTensor *t);
+THC_API ptrdiff_t THCTensor_(numel)(THCState *state, THCTensor *t);
 
 #endif
diff --git a/lib/THC/generic/THCTensorMathPointwise.cu b/lib/THC/generic/THCTensorMathPointwise.cu
index 90ddfbc..2638504 100644
--- a/lib/THC/generic/THCTensorMathPointwise.cu
+++ b/lib/THC/generic/THCTensorMathPointwise.cu
@@ -108,7 +108,7 @@ THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y,
 
   int i;
   long nd = THCTensor_(nDimension)(state, x);
-  long nelem = THCTensor_(nElement)(state, x);
+  ptrdiff_t nelem = THCTensor_(nElement)(state, x);
   THArgCheck(nd == THCTensor_(nDimension)(state, y), 1, "tensors must have same number of dimensions");
   for (i = 0; i < nd; i++) {
     THArgCheck(THCTensor_(size)(state, x, i) == THCTensor_(size)(state, y, i), 1, "dimension %i of x and y does not match", i);
diff --git a/lib/THC/generic/THCTensorMathReduce.cu b/lib/THC/generic/THCTensorMathReduce.cu
index 502fa75..1e21d03 100644
--- a/lib/THC/generic/THCTensorMathReduce.cu
+++ b/lib/THC/generic/THCTensorMathReduce.cu
@@ -47,7 +47,7 @@ THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value,
   THCTensor *self_;
   THCTensor *src_ = THCTensor_(newTranspose)(state, src, dimension, 0);
   THCTensor *data = THCTensor_(newClone)(state, src_);
-  long size = THCTensor_(nElement)(state, data)/data->size[0];
+  ptrdiff_t size = THCTensor_(nElement)(state, data)/data->size[0];
 
   THArgCheck(dimension >= 0 && dimension < THCTensor_(nDimension)(state, src), 3, "invalid dimension");
   THArgCheck(THCNumerics<real>::gt(value, ScalarConvert<int, real>::to(0)), 2, "non-positive-norm not supported");
@@ -138,7 +138,7 @@ THCTensor_(varall)(THCState *state, THCTensor *self)
 
   val = THCNumerics<accreal>::div(
     val,
-    ScalarConvert<int, accreal>::to(THCTensor_(nElement)(state, self) - 1)
+    ScalarConvert<ptrdiff_t, accreal>::to(THCTensor_(nElement)(state, self) - 1)
   );
 
   THCudaCheck(cudaGetLastError());
diff --git a/lib/THC/generic/THCTensorScatterGather.cu b/lib/THC/generic/THCTensorScatterGather.cu
index 7842d3d..c120f88 100644
--- a/lib/THC/generic/THCTensorScatterGather.cu
+++ b/lib/THC/generic/THCTensorScatterGather.cu
@@ -34,7 +34,7 @@ void THCTensor_(gather)(THCState* state, THCTensor *tensor,
              1, CUTORCH_DIM_WARNING);
 
 
-  const long totalElements = THCudaLongTensor_nElement(state, index);
+  const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index);
   const dim3 block = getApplyBlock();
   dim3 grid;
   THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING);
@@ -126,7 +126,7 @@ void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLong
   THArgCheck(THCTensor_(nDimension)(state, tensor) <= MAX_CUTORCH_DIMS,
              1, CUTORCH_DIM_WARNING);
 
-  const long totalElements = THCudaLongTensor_nElement(state, index);
+  const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index);
   const dim3 block = getApplyBlock();
   dim3 grid;
   THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING);
@@ -211,7 +211,7 @@ THCTensor_(scatterFill)(THCState* state, THCTensor *tensor,
   THArgCheck(THCTensor_(nDimension)(state, tensor) <= MAX_CUTORCH_DIMS,
              1, CUTORCH_DIM_WARNING);
 
-  const long totalElements = THCudaLongTensor_nElement(state, index);
+  const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index);
   const dim3 block = getApplyBlock();
   dim3 grid;
   THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING);
diff --git a/lib/THC/generic/THCTensorSort.cu b/lib/THC/generic/THCTensorSort.cu
index 0116522..046c62b 100644
--- a/lib/THC/generic/THCTensorSort.cu
+++ b/lib/THC/generic/THCTensorSort.cu
@@ -18,9 +18,9 @@ THC_API void THCTensor_(sortKeyValueInplace)(THCState* state,
   dims = THCTensor_(nDimension)(state, key);
   THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING);
 
-  long inElements = THCTensor_(nElement)(state, key);
+  ptrdiff_t inElements = THCTensor_(nElement)(state, key);
   long keySliceSize = THCTensor_(size)(state, key, dim);
-  long keySlices = inElements / keySliceSize;
+  ptrdiff_t keySlices = inElements / keySliceSize;
 
   if (THCTensor_(nDimension)(state, key) == 0) {
     // Zero-dim tensor; do nothing
@@ -160,7 +160,7 @@ void sortViaThrust(THCState* state,
                    int dim, bool dir) {
   long nDims = THCTensor_(nDimension)(state, input);
 
-  long totalElements = THCTensor_(nElement)(state, input);
+  ptrdiff_t totalElements = THCTensor_(nElement)(state, input);
   long sliceSize = THCTensor_(size)(state, input, dim);
   long sliceStride = THCTensor_(stride)(state, input, dim);
 
diff --git a/torch/generic/Storage.c b/torch/generic/Storage.c
index c57c416..e5c0836 100644
--- a/torch/generic/Storage.c
+++ b/torch/generic/Storage.c
@@ -10,13 +10,13 @@ static int torch_Storage_(new)(lua_State *L)
   {
     const char *fileName = luaL_checkstring(L, 1);
     int isShared = luaT_optboolean(L, 2, 0);
-    long size = luaL_optlong(L, 3, 0);
+    ptrdiff_t size = luaL_optinteger(L, 3, 0);
     storage = THCStorage_(newWithMapping)(state, fileName, size, isShared);
   }
   else if(lua_type(L, 1) == LUA_TTABLE)
   {
-    long size = lua_objlen(L, 1);
-    long i;
+    ptrdiff_t size = lua_objlen(L, 1);
+    ptrdiff_t i;
     storage = THCStorage_(newWithSize)(state, size);
     for(i = 1; i <= size; i++)
     {
@@ -39,11 +39,11 @@ static int torch_Storage_(new)(lua_State *L)
   {
     THCStorage *src = luaT_checkudata(L, 1, torch_Storage);
     real *ptr = src->data;
-    long offset = luaL_optlong(L, 2, 1) - 1;
+    ptrdiff_t offset = luaL_optinteger(L, 2, 1) - 1;
     if (offset < 0 || offset >= src->size) {
       luaL_error(L, "offset out of bounds");
     }
-    long size = luaL_optlong(L, 3, src->size - offset);
+    ptrdiff_t size = luaL_optinteger(L, 3, src->size - offset);
     if (size < 1 || size > (src->size - offset)) {
       luaL_error(L, "size out of bounds");
     }
@@ -54,14 +54,14 @@ static int torch_Storage_(new)(lua_State *L)
   }
   else if(lua_type(L, 2) == LUA_TNUMBER)
   {
-    long size = luaL_optlong(L, 1, 0);
+    ptrdiff_t size = luaL_optinteger(L, 1, 0);
     real *ptr = (real *)luaL_optinteger(L, 2, 0);
     storage = THCStorage_(newWithData)(state, ptr, size);
     storage->flag = TH_STORAGE_REFCOUNTED;
   }
   else
   {
-    long size = luaL_optlong(L, 1, 0);
+    ptrdiff_t size = luaL_optinteger(L, 1, 0);
     storage = THCStorage_(newWithSize)(state, size);
   }
   luaT_pushudata(L, storage, torch_Storage);
@@ -85,7 +85,7 @@ static int torch_Storage_(free)(lua_State *L)
 static int torch_Storage_(resize)(lua_State *L)
 {
   THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
-  long size = luaL_checklong(L, 2);
+  ptrdiff_t size = luaL_checkinteger(L, 2);
 /*  int keepContent = luaT_optboolean(L, 3, 0); */
   THCStorage_(resize)(cutorch_getstate(L), storage, size);/*, keepContent); */
   lua_settop(L, 1);
@@ -141,7 +141,7 @@ static int torch_Storage_(elementSize)(lua_State *L)
 static int torch_Storage_(__len__)(lua_State *L)
 {
   THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
-  lua_pushnumber(L, storage->size);
+  lua_pushinteger(L, storage->size);
   return 1;
 }
 
@@ -150,7 +150,7 @@ static int torch_Storage_(__newindex__)(lua_State *L)
   if(lua_isnumber(L, 2))
   {
     THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
-    long index = luaL_checklong(L, 2) - 1;
+    ptrdiff_t index = luaL_checkinteger(L, 2) - 1;
     double number = luaL_checknumber(L, 3);
 
 #ifdef THC_REAL_IS_HALF
@@ -172,7 +172,7 @@ static int torch_Storage_(__index__)(lua_State *L)
   if(lua_isnumber(L, 2))
   {
     THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
-    long index = luaL_checklong(L, 2) - 1;
+    ptrdiff_t index = luaL_checkinteger(L, 2) - 1;
     real v = THCStorage_(get)(cutorch_getstate(L), storage, index);
 
 #ifdef THC_REAL_IS_HALF
@@ -196,7 +196,7 @@ static int torch_Storage_(totable)(lua_State *L)
 {
   THCState *state = cutorch_getstate(L);
   THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
-  long i;
+  ptrdiff_t i;
 
   /* Copy storage from device to host. */
 #ifndef THC_REAL_IS_HALF
@@ -235,6 +235,9 @@ static int torch_Storage_(write)(lua_State *L)
   THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
   THFile *file = luaT_checkudata(L, 2, "torch.File");
 
+#ifdef _MSC_VER
+  THAssert(storage->size < LONG_MAX);
+#endif
   THFile_writeLongScalar(file, storage->size);
   THFile_writeRealRaw(file, storage->data, storage->size);
 
diff --git a/torch/generic/Tensor.c b/torch/generic/Tensor.c
index b3c85fe..d7dcd53 100644
--- a/torch/generic/Tensor.c
+++ b/torch/generic/Tensor.c
@@ -5,7 +5,7 @@
 #include "THCHalf.h"
 
 static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index, int allowNone, int allowTensor, int allowStorage, int allowStride,
-                                                         THCStorage **storage_, long *storageOffset_, THLongStorage **size_, THLongStorage **stride_);
+                                                         THCStorage **storage_, ptrdiff_t *storageOffset_, THLongStorage **size_, THLongStorage **stride_);
 
 static void torch_Tensor_(c_readSizeStride)(lua_State *L, int index, int allowStride, THLongStorage **size_, THLongStorage **stride_);
 
@@ -75,7 +75,7 @@ static int torch_Tensor_(storage)(lua_State *L)
 static int torch_Tensor_(storageOffset)(lua_State *L)
 {
   THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
-  lua_pushnumber(L, tensor->storageOffset+1);
+  lua_pushinteger(L, tensor->storageOffset+1);
   return 1;
 }
 
@@ -83,14 +83,14 @@ static int torch_Tensor_(new)(lua_State *L)
 {
   THCState *state = cutorch_getstate(L);
   THCTensor *tensor;
-  long storageOffset;
+  ptrdiff_t storageOffset;
   THLongStorage *size, *stride;
 
   if(lua_type(L, 1) == LUA_TTABLE)
   {
-    long i, j;
+    ptrdiff_t i, j;
     THLongStorage *counter;
-    long si = 0;
+    ptrdiff_t si = 0;
     int dimension = 0;
     int is_finished = 0;
 
@@ -221,7 +221,7 @@ static int torch_Tensor_(set)(lua_State *L)
 {
   THCTensor *self = luaT_checkudata(L, 1, torch_Tensor);
   THCStorage *storage;
-  long storageOffset;
+  ptrdiff_t storageOffset;
   THLongStorage *size, *stride;
 
   torch_Tensor_(c_readTensorStorageSizeStride)(L, 2, 1, 1, 1, 1,
@@ -691,7 +691,7 @@ static int torch_Tensor_(isSameSizeAs)(lua_State *L)
 static int torch_Tensor_(nElement)(lua_State *L)
 {
   THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
-  lua_pushnumber(L, THCTensor_(nElement)(cutorch_getstate(L), tensor));
+  lua_pushinteger(L, THCTensor_(nElement)(cutorch_getstate(L), tensor));
   return 1;
 }
 
@@ -801,7 +801,7 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
   }
   else if((idx = luaT_toudata(L, 2, "torch.LongStorage")))
   {
-    long index = THCTensor_(storageOffset)(state, tensor);
+    ptrdiff_t index = THCTensor_(storageOffset)(state, tensor);
 
 #ifdef THC_REAL_IS_HALF
     real value = THC_float2half((float) luaL_checknumber(L,3));
@@ -809,7 +809,7 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
     real value = (real)luaL_checknumber(L,3);
 #endif
 
-    int dim;
+    ptrdiff_t dim;
 
     luaL_argcheck(L, idx->size == tensor->nDimension, 2, "invalid size");
 
@@ -1014,7 +1014,7 @@ static int torch_Tensor_(__index__)(lua_State *L)
 
   if(lua_isnumber(L, 2))
   {
-    long index = luaL_checklong(L,2)-1;
+    ptrdiff_t index = luaL_checkinteger(L,2)-1;
 
     luaL_argcheck(L, tensor->nDimension > 0, 1, "empty tensor");
     if (index < 0) index = tensor->size[0] + index + 1;
@@ -1045,8 +1045,8 @@ static int torch_Tensor_(__index__)(lua_State *L)
   }
   else if((idx = luaT_toudata(L, 2, "torch.LongStorage")))
   {
-    long index = THCTensor_(storageOffset)(state, tensor);
-    int dim;
+    ptrdiff_t index = THCTensor_(storageOffset)(state, tensor);
+    ptrdiff_t dim;
 
     luaL_argcheck(L, idx->size == tensor->nDimension, 2, "invalid size");
 
@@ -1258,7 +1258,7 @@ static void torch_Tensor_(c_readSizeStride)(lua_State *L, int index, int allowSt
 }
 
 static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index, int allowNone, int allowTensor, int allowStorage, int allowStride,
-                                                         THCStorage **storage_, long *storageOffset_, THLongStorage **size_, THLongStorage **stride_)
+                                                         THCStorage **storage_, ptrdiff_t *storageOffset_, THLongStorage **size_, THLongStorage **stride_)
 {
   THCState *state = cutorch_getstate(L);
   THCTensor *src = NULL;
@@ -1293,7 +1293,7 @@ static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index
     }
     else
     {
-      *storageOffset_ = luaL_checklong(L, index+1)-1;
+      *storageOffset_ = luaL_checkinteger(L, index+1)-1;
       torch_Tensor_(c_readSizeStride)(L, index+2, allowStride, size_, stride_);
     }
     return;
@@ -1335,6 +1335,9 @@ static int torch_Tensor_(write)(lua_State *L)
   THFile_writeIntScalar(file, tensor->nDimension);
   THFile_writeLongRaw(file, tensor->size, tensor->nDimension);
   THFile_writeLongRaw(file, tensor->stride, tensor->nDimension);
+#ifdef _MSC_VER
+  THAssert(tensor->storageOffset+1 < LONG_MAX);
+#endif
   THFile_writeLongScalar(file, tensor->storageOffset+1); /* to respect Lua convention */
 
   lua_getfield(L, 2, "writeObject"); /* the method */
author	Trevor Killeen <killeentm@gmail.com>	2016-10-10 20:09:43 +0300
committer	Trevor Killeen <killeentm@gmail.com>	2016-10-10 20:09:43 +0300
commit	e4ebbd3a2d668ae51d49d9deaf2c62fc6de81a8e (patch)
tree	c0c3cbf24139a1504ff232476dd7b64987605821
parent	7860a76e1cc50e5c679a965c95cdca2501cac9bc (diff)
parent	9efd392d4974e3fe3b1469809971d5d4f360ec7b (diff)