diff options
author | Trevor Killeen <killeentm@gmail.com> | 2016-10-10 20:09:43 +0300 |
---|---|---|
committer | Trevor Killeen <killeentm@gmail.com> | 2016-10-10 20:09:43 +0300 |
commit | e4ebbd3a2d668ae51d49d9deaf2c62fc6de81a8e (patch) | |
tree | c0c3cbf24139a1504ff232476dd7b64987605821 | |
parent | 7860a76e1cc50e5c679a965c95cdca2501cac9bc (diff) | |
parent | 9efd392d4974e3fe3b1469809971d5d4f360ec7b (diff) |
Merge remote-tracking branch 'upstream/master' into more-generic-functions
43 files changed, 404 insertions, 338 deletions
@@ -50,7 +50,7 @@ cudaStream_t THCState_getCurrentStream(THCState *state); typedef struct THCStorage { real *data; - long size; + ptrdiff_t size; int refcount; char flag; THAllocator *allocator; @@ -65,7 +65,7 @@ typedef struct THCTensor int nDimension; THCStorage *storage; - long storageOffset; + ptrdiff_t storageOffset; int refcount; char flag; diff --git a/TensorMath.lua b/TensorMath.lua index abdda0b..e917f8c 100644 --- a/TensorMath.lua +++ b/TensorMath.lua @@ -2,6 +2,58 @@ local wrap = require 'cwrap' local interface = wrap.CInterface.new() local method = wrap.CInterface.new() +local argtypes = wrap.CInterface.argtypes + +argtypes['ptrdiff_t'] = { + + helpname = function(arg) + return 'ptrdiff_t' + end, + + declare = function(arg) + -- if it is a number we initialize here + local default = tonumber(tostring(arg.default)) or 0 + return string.format("%s arg%d = %g;", 'ptrdiff_t', arg.i, default) + end, + + check = function(arg, idx) + return string.format("lua_isinteger(L, %d)", idx) + end, + + read = function(arg, idx) + return string.format("arg%d = (%s)lua_tointeger(L, %d);", arg.i, 'ptrdiff_t', idx) + end, + + init = function(arg) + -- otherwise do it here + if arg.default then + local default = tostring(arg.default) + if not tonumber(default) then + return string.format("arg%d = %s;", arg.i, default) + end + end + end, + + carg = function(arg) + return string.format('arg%d', arg.i) + end, + + creturn = function(arg) + return string.format('arg%d', arg.i) + end, + + precall = function(arg) + if arg.returned then + return string.format('lua_pushinteger(L, (lua_Integer)arg%d);', arg.i) + end + end, + + postcall = function(arg) + if arg.creturned then + return string.format('lua_pushinteger(L, (lua_Integer)arg%d);', arg.i) + end + end +} interface:print('/* WARNING: autogenerated file */') interface:print('') @@ -559,7 +611,7 @@ for k, Tensor_ in pairs(handledTypenames) do wrap("numel", cname("numel"), {{name=Tensor}, - {name="long", creturned=true}}) + {name="ptrdiff_t", creturned=true}}) wrap("add", cname("add"), diff --git a/generic/CTensor.c b/generic/CTensor.c index 5569c02..a9663ff 100644 --- a/generic/CTensor.c +++ b/generic/CTensor.c @@ -173,16 +173,16 @@ void THFloatTensor_kernel_copy(float *dst, long *dst_sz, long *dst_st, int dst_dim, float *src, long *src_sz, long *src_st, int src_dim, - long n_elem) + ptrdiff_t n_elem) { - long k; + ptrdiff_t k; for(k = 0; k < n_elem; k++) { - long src_idx = 0; - long src_rest = k; - long dst_idx = 0; - long dst_rest = k; + ptrdiff_t src_idx = 0; + ptrdiff_t src_rest = k; + ptrdiff_t dst_idx = 0; + ptrdiff_t dst_rest = k; int dim; for(dim = 0; dim < dst_dim; dim++) @@ -206,7 +206,7 @@ static int cuda_FloatTensor_fakecopy(lua_State *L) THFloatTensor *self = luaT_checkudata(L, 1, "torch.FloatTensor"); THFloatTensor *src = luaT_checkudata(L, 2, "torch.FloatTensor"); long *d_self_sz, *d_self_st, *d_src_sz, *d_src_st; - long nElement = THFloatTensor_nElement(self); + ptrdiff_t nElement = THFloatTensor_nElement(self); THArgCheck(THFloatTensor_nElement(self) == THFloatTensor_nElement(src), 2, "sizes do not match"); diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt index 181bc9d..b9ddfbe 100644 --- a/lib/THC/CMakeLists.txt +++ b/lib/THC/CMakeLists.txt @@ -211,6 +211,7 @@ INSTALL(FILES THCReduceAll.cuh THCReduceApplyUtils.cuh THCAsmUtils.cuh + THCAtomics.cuh THCScanUtils.cuh THCSortUtils.cuh THCAllocator.h diff --git a/lib/THC/THCAllocator.c b/lib/THC/THCAllocator.c index e2cd09a..1bed0fb 100644 --- a/lib/THC/THCAllocator.c +++ b/lib/THC/THCAllocator.c @@ -1,6 +1,6 @@ #include "THCAllocator.h" -static void *THCudaHostAllocator_alloc(void* ctx, long size) { +static void *THCudaHostAllocator_alloc(void* ctx, ptrdiff_t size) { void* ptr; if (size < 0) THError("Invalid memory size: %ld", size); @@ -18,7 +18,7 @@ static void THCudaHostAllocator_free(void* ctx, void* ptr) { THCudaCheck(cudaFreeHost(ptr)); } -static void *THCudaHostAllocator_realloc(void* ctx, void* ptr, long size) { +static void *THCudaHostAllocator_realloc(void* ctx, void* ptr, ptrdiff_t size) { if (size < 0) THError("Invalid memory size: %ld", size); THCudaHostAllocator_free(ctx, ptr); diff --git a/lib/THC/THCApply.cuh b/lib/THC/THCApply.cuh index dd6d32a..a47e303 100644 --- a/lib/THC/THCApply.cuh +++ b/lib/THC/THCApply.cuh @@ -101,7 +101,7 @@ inline dim3 getApplyBlock() { return dim3(THC_APPLY_THREADS_PER_BLOCK); } -inline bool getApplyGrid(THCState* state, long totalElements, dim3& grid) { +inline bool getApplyGrid(THCState* state, ptrdiff_t totalElements, dim3& grid) { int curDevice = -1; cudaGetDevice(&curDevice); @@ -116,7 +116,7 @@ inline bool getApplyGrid(THCState* state, long totalElements, dim3& grid) { // 16 warps per block * 4 per SM gives 64 warps per SM at maximum, // which seems to be a good sweetspot for latency hiding grid = dim3(min((long long) THCCeilDiv(totalElements, - (long) THC_APPLY_THREADS_PER_BLOCK), + (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK), 4LL * numSM)); return true; } @@ -139,7 +139,7 @@ bool THC_pointwiseApply1(THCState* state, const dim3 block = getApplyBlock(); dim3 grid; - long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a); + ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a); if (!getApplyGrid(state, totalElements, grid)) { return false; @@ -253,7 +253,7 @@ bool THC_pointwiseApply2(THCState* state, const Op& op, TensorArgType aType = ReadWrite, TensorArgType bType = ReadOnly) { - long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a); + ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a); if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b)) { return false; @@ -431,7 +431,7 @@ bool THC_pointwiseApply3(THCState* state, TensorArgType aType = ReadWrite, TensorArgType bType = ReadOnly, TensorArgType cType = ReadOnly) { - long totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a); + ptrdiff_t totalElements = TensorUtils<TensorTypeA>::getNumElements(state, a); if (totalElements != TensorUtils<TensorTypeB>::getNumElements(state, b) || totalElements != TensorUtils<TensorTypeC>::getNumElements(state, c)) { diff --git a/lib/THC/THCAtomics.cuh b/lib/THC/THCAtomics.cuh new file mode 100644 index 0000000..0586edf --- /dev/null +++ b/lib/THC/THCAtomics.cuh @@ -0,0 +1,131 @@ +#ifndef THC_ATOMICS_INC +#define THC_ATOMICS_INC + +#include "THCHalf.h" + +template <typename T, size_t n> +struct AtomicAddIntegerImpl; + +template<typename T> +struct AtomicAddIntegerImpl<T, 1> { + __device__ void operator()(T *address, T val) { + unsigned int * address_as_ui = + (unsigned int *) (address - ((size_t)address & 3)); + unsigned int old = *address_as_ui; + unsigned int shift = (((size_t)address & 3) * 8); + unsigned int sum; + unsigned int assumed; + + do { + assumed = old; + sum = val + T((old >> shift) & 0xff); + old = (old & ~(0x000000ff << shift)) | (sum << shift); + old = atomicCAS(address_as_ui, assumed, old); + } while (assumed != old); + } +}; + +template<typename T> +struct AtomicAddIntegerImpl<T, 2> { + __device__ void operator()(T *address, T val) { + unsigned int * address_as_ui = + (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int sum; + unsigned int newval; + unsigned int assumed; + + do { + assumed = old; + sum = val + (size_t)address & 2 ? T(old >> 16) : T(old & 0xffff); + newval = (size_t)address & 2 ? (old & 0xffff) | (sum << 16) : (old & 0xffff0000) | sum; + old = atomicCAS(address_as_ui, assumed, newval); + } while (assumed != old); + } +}; + +template<typename T> +struct AtomicAddIntegerImpl<T, 4> { + __device__ void operator()(T *address, T val) { + unsigned int * address_as_ui = (unsigned int *) (address); + unsigned int old = *address_as_ui; + unsigned int newval; + unsigned int assumed; + + do { + assumed = old; + newval = val + (T)old; + old = atomicCAS(address_as_ui, assumed, newval); + } while (assumed != old); + } +}; + +template<typename T> +struct AtomicAddIntegerImpl<T, 8> { + __device__ void operator()(T *address, T val) { + unsigned long long * address_as_ui = (unsigned long long *) (address); + unsigned long long old = *address_as_ui; + unsigned long long newval; + unsigned long long assumed; + + do { + assumed = old; + newval = val + (T)old; + old = atomicCAS(address_as_ui, assumed, newval); + } while (assumed != old); + } +}; + +__device__ void atomicAdd(unsigned char *address, unsigned char val) { + AtomicAddIntegerImpl<unsigned char, sizeof(unsigned char)>()(address, val); +} + +__device__ void atomicAdd(char *address, char val) { + AtomicAddIntegerImpl<char, sizeof(char)>()(address, val); +} + +__device__ void atomicAdd(short *address, short val) { + AtomicAddIntegerImpl<short, sizeof(short)>()(address, val); +} + +__device__ void atomicAdd(long *address, long val) { + AtomicAddIntegerImpl<long, sizeof(long)>()(address, val); +} + +#ifdef CUDA_HALF_TENSOR +__device__ void atomicAdd(half *address, half val) { + unsigned int * address_as_ui = + (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + + do { + assumed = old; + half hsum; + hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + hsum = THCNumerics<half>::add(hsum, val); + old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; + old = atomicCAS(address_as_ui, assumed, old); + } while (assumed != old); +} +#endif + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +// from CUDA C Programmic Guide +__device__ void atomicAdd(double *address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull; + unsigned long long int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); +} +#endif + +#endif // THC_ATOMICS_INC diff --git a/lib/THC/THCDeviceTensor-inl.cuh b/lib/THC/THCDeviceTensor-inl.cuh index a0058ad..9237d07 100644 --- a/lib/THC/THCDeviceTensor-inl.cuh +++ b/lib/THC/THCDeviceTensor-inl.cuh @@ -103,9 +103,9 @@ THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() const { template <typename T, int Dim, typename IndexT, template <typename U> class PtrTraits> -__host__ __device__ long +__host__ __device__ ptrdiff_t THCDeviceTensor<T, Dim, IndexT, PtrTraits>::numElements() const { - long size = getSize(0); + ptrdiff_t size = getSize(0); for (int i = 1; i < Dim; ++i) { size *= getSize(i); diff --git a/lib/THC/THCDeviceTensor.cuh b/lib/THC/THCDeviceTensor.cuh index 14d56c6..c6b7899 100644 --- a/lib/THC/THCDeviceTensor.cuh +++ b/lib/THC/THCDeviceTensor.cuh @@ -142,7 +142,7 @@ class THCDeviceTensor { /// Returns the total number of elements contained within our data /// (product of `getSize(i)`) - __host__ __device__ long numElements() const; + __host__ __device__ ptrdiff_t numElements() const; /// Returns the size array. __host__ __device__ __forceinline__ const IndexT* sizes() const { diff --git a/lib/THC/THCDeviceTensorUtils-inl.cuh b/lib/THC/THCDeviceTensorUtils-inl.cuh index f954772..26c1bb8 100644 --- a/lib/THC/THCDeviceTensorUtils-inl.cuh +++ b/lib/THC/THCDeviceTensorUtils-inl.cuh @@ -10,7 +10,7 @@ toDeviceTensor(THCState* state, THCudaTensor* t) { // Determine the maximum offset into the tensor achievable; `IndexT` // must be smaller than this type in order to use it. - long maxOffset = 0; + ptrdiff_t maxOffset = 0; IndexT sizes[Dim]; IndexT strides[Dim]; diff --git a/lib/THC/THCGeneral.c b/lib/THC/THCGeneral.c index 07dbf2c..5bcce19 100644 --- a/lib/THC/THCGeneral.c +++ b/lib/THC/THCGeneral.c @@ -60,8 +60,8 @@ struct THCState { void (*cutorchGCFunction)(void *data); void *cutorchGCData; - long heapSoftmax; - long heapDelta; + ptrdiff_t heapSoftmax; + ptrdiff_t heapDelta; }; THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr( @@ -640,8 +640,9 @@ void __THCublasCheck(cublasStatus_t status, const char *file, const int line) } } -static long heapSize = 0; // not thread-local -static const long heapMaxDelta = 1e6; +static ptrdiff_t heapSize = 0; // not thread-local +static const ptrdiff_t heapMaxDelta = (ptrdiff_t)1e6; +static const ptrdiff_t heapMinDelta = (ptrdiff_t)-1e6; static const double heapSoftmaxGrowthThresh = 0.8; // grow softmax if >80% max after GC static const double heapSoftmaxGrowthFactor = 1.4; // grow softmax by 40% @@ -691,8 +692,8 @@ cudaError_t THCudaFree(THCState *state, void *ptr) return allocator->free(allocator->state, ptr); } -static long applyHeapDelta(THCState *state) { - long newHeapSize = THAtomicAddLong(&heapSize, state->heapDelta) + state->heapDelta; +static ptrdiff_t applyHeapDelta(THCState *state) { + ptrdiff_t newHeapSize = THAtomicAddPtrdiff(&heapSize, state->heapDelta) + state->heapDelta; state->heapDelta = 0; return newHeapSize; } @@ -701,27 +702,27 @@ static long applyHeapDelta(THCState *state) { // When THC heap size goes above this softmax, the GC hook is triggered. // If heap size is above 80% of the softmax after GC, then the softmax is // increased. -static void maybeTriggerGC(THCState *state, long curHeapSize) { +static void maybeTriggerGC(THCState *state, ptrdiff_t curHeapSize) { if (state->cutorchGCFunction != NULL && curHeapSize > state->heapSoftmax) { (state->cutorchGCFunction)(state->cutorchGCData); // ensure heapSize is accurate before updating heapSoftmax - long newHeapSize = applyHeapDelta(state); + ptrdiff_t newHeapSize = applyHeapDelta(state); if (newHeapSize > state->heapSoftmax * heapSoftmaxGrowthThresh) { - state->heapSoftmax = state->heapSoftmax * heapSoftmaxGrowthFactor; + state->heapSoftmax = (ptrdiff_t)state->heapSoftmax * heapSoftmaxGrowthFactor; } } } -void THCHeapUpdate(THCState *state, long size) { +void THCHeapUpdate(THCState *state, ptrdiff_t size) { state->heapDelta += size; // batch updates to global heapSize to minimize thread contention - if (labs(state->heapDelta) < heapMaxDelta) { + if (state->heapDelta < heapMaxDelta && state->heapDelta > heapMinDelta) { return; } - long newHeapSize = applyHeapDelta(state); + ptrdiff_t newHeapSize = applyHeapDelta(state); if (size > 0) { maybeTriggerGC(state, newHeapSize); } diff --git a/lib/THC/THCGeneral.h.in b/lib/THC/THCGeneral.h.in index 8837f42..ce3ffc5 100644 --- a/lib/THC/THCGeneral.h.in +++ b/lib/THC/THCGeneral.h.in @@ -114,6 +114,6 @@ THC_API cudaError_t THCudaFree(THCState *state, void *ptr); THC_API void THCSetGCHandler(THCState *state, void (*torchGCHandlerFunction)(void *data), void *data ); -THC_API void THCHeapUpdate(THCState *state, long size); +THC_API void THCHeapUpdate(THCState *state, ptrdiff_t size); #endif diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu index 7777bf7..5a77293 100644 --- a/lib/THC/THCHalf.cu +++ b/lib/THC/THCHalf.cu @@ -10,7 +10,7 @@ struct __float2halfOp { __device__ half operator()(float v) { return __float2half(v); } }; -void THCFloat2Half(THCState *state, half *out, float *in, long len) { +void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len) { thrust::transform( #if CUDA_VERSION >= 7000 thrust::cuda::par.on(THCState_getCurrentStream(state)), @@ -20,7 +20,7 @@ void THCFloat2Half(THCState *state, half *out, float *in, long len) { in, in + len, out, __float2halfOp()); } -void THCHalf2Float(THCState *state, float *out, half *in, long len) { +void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len) { thrust::transform( #if CUDA_VERSION >= 7000 thrust::cuda::par.on(THCState_getCurrentStream(state)), diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index ec6d83f..795874e 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -18,8 +18,8 @@ #include <cuda_fp16.h> #include <stdint.h> -THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, long len); -THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len); +THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len); +THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len); THC_API half THC_float2half(float a); THC_API float THC_half2float(half a); diff --git a/lib/THC/THCReduce.cuh b/lib/THC/THCReduce.cuh index a7135de..7f276a2 100644 --- a/lib/THC/THCReduce.cuh +++ b/lib/THC/THCReduce.cuh @@ -123,7 +123,7 @@ inline dim3 getNoncontigReduceBlock() { return dim3(THC_NONCONTIG_REDUCE_BLOCK_SIZE); } -inline dim3 getContigReduceBlock(long numSlices, long reductionSize) { +inline dim3 getContigReduceBlock(ptrdiff_t numSlices, long reductionSize) { // If the number of slices is low but the reduction dimension size // is high, then we should increase block size for greater parallelism. // Aim for at least 32 warps per SM (assume 15 SMs; don't bother @@ -148,13 +148,13 @@ inline dim3 getContigReduceBlock(long numSlices, long reductionSize) { return dim3(numWarps * 32); } -inline bool getNoncontigReduceGrid(long elements, dim3& grid) { +inline bool getNoncontigReduceGrid(ptrdiff_t elements, dim3& grid) { // One output point per thread return THC_getGridFromTiles(THCCeilDiv(elements, - (long) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid); + (ptrdiff_t) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid); } -inline bool getContigReduceGrid(long elements, dim3& grid) { +inline bool getContigReduceGrid(ptrdiff_t elements, dim3& grid) { // One output point per block return THC_getGridFromTiles(elements, grid); } @@ -169,11 +169,11 @@ bool THC_reduceDim(THCState* state, const ReduceOp& reduceOp, typename TensorUtils<TensorType>::DataType init, int dim) { - long inElements = TensorUtils<TensorType>::getNumElements(state, in); + ptrdiff_t inElements = TensorUtils<TensorType>::getNumElements(state, in); long reductionSize = TensorUtils<TensorType>::getSize(state, in, dim); long reductionStride = TensorUtils<TensorType>::getStride(state, in, dim); - long outElements = inElements / reductionSize; + ptrdiff_t outElements = inElements / reductionSize; if (TensorUtils<TensorType>::getDims(state, out) > MAX_CUTORCH_DIMS || TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) { diff --git a/lib/THC/THCReduceAll.cuh b/lib/THC/THCReduceAll.cuh index 498fb53..a9cea84 100644 --- a/lib/THC/THCReduceAll.cuh +++ b/lib/THC/THCReduceAll.cuh @@ -124,16 +124,16 @@ kernelReduceAllPass2(int numPass1Blocks, // Perform a two-pass reduction if the tensor is large enough to // warrant it. -inline bool isTwoPassReductionSize(long elements) { +inline bool isTwoPassReductionSize(ptrdiff_t elements) { return (elements > THC_TWO_PASS_REDUCTION_SIZE); } template <typename InT, typename AccT> -inline long getTwoPassBlocks(THCState* state, long elements) { - long numBlocks = THCCeilDiv(elements, THC_REDUCE_ALL_BLOCK_SIZE); +inline ptrdiff_t getTwoPassBlocks(THCState* state, ptrdiff_t elements) { + ptrdiff_t numBlocks = THCCeilDiv(elements, (ptrdiff_t)THC_REDUCE_ALL_BLOCK_SIZE); // We can only have as many blocks as there is scratch space - long scratchSpace = + ptrdiff_t scratchSpace = THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(AccT); THAssert(scratchSpace > 0); @@ -146,14 +146,14 @@ inline long getTwoPassBlocks(THCState* state, long elements) { // Get the block/grid size that we want template <typename InT, typename AccT> -inline void getPass1ReduceBlockGrid(THCState* state, long elements, +inline void getPass1ReduceBlockGrid(THCState* state, ptrdiff_t elements, dim3& grid, dim3& block) { grid = dim3(getTwoPassBlocks<InT, AccT>(state, elements)); block = dim3(THC_REDUCE_ALL_BLOCK_SIZE); } template <typename InT, typename AccT> -inline void getPass2ReduceBlockGrid(THCState* state, long elements, +inline void getPass2ReduceBlockGrid(THCState* state, ptrdiff_t elements, dim3& grid, dim3& block) { grid = dim3(1); // We only need as many threads as there were blocks originally @@ -161,7 +161,7 @@ inline void getPass2ReduceBlockGrid(THCState* state, long elements, } template <typename InT, typename AccT> -inline void getSinglePassReduceBlockGrid(long elements, +inline void getSinglePassReduceBlockGrid(ptrdiff_t elements, dim3& grid, dim3& block) { grid = dim3(1); block = dim3(THC_REDUCE_ALL_BLOCK_SIZE); @@ -176,7 +176,7 @@ template <typename ModifyOp, int ADims> void callReduceAll(THCState* state, const TensorInfo<InT, IndexType>& in, - long totalElements, + ptrdiff_t totalElements, AccT init, const ModifyOp& modifyOp, const ReduceOp& reduceOp, @@ -229,7 +229,7 @@ bool THC_reduceAll(THCState* state, AccT init, AccT* out, int outOnDevice) { - long inElements = TensorUtils<TensorType>::getNumElements(state, in); + ptrdiff_t inElements = TensorUtils<TensorType>::getNumElements(state, in); if (TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) { return false; diff --git a/lib/THC/THCReduceApplyUtils.cu b/lib/THC/THCReduceApplyUtils.cu index b3c6f51..6d4c06e 100644 --- a/lib/THC/THCReduceApplyUtils.cu +++ b/lib/THC/THCReduceApplyUtils.cu @@ -11,7 +11,7 @@ void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg) { THArgCheck(dims <= MAX_CUTORCH_DIMS, arg, CUTORCH_DIM_WARNING); } -bool THC_getGridFromTiles(long gridTiles, dim3& grid) { +bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid) { if (gridTiles > MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE) { return false; } @@ -21,11 +21,11 @@ bool THC_getGridFromTiles(long gridTiles, dim3& grid) { long gridZ = 1; if (gridTiles > MAX_GRID_SIZE) { - gridTiles = THCCeilDiv(gridTiles, (long) MAX_GRID_SIZE); + gridTiles = THCCeilDiv(gridTiles, (ptrdiff_t) MAX_GRID_SIZE); gridY = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles; if (gridTiles > MAX_GRID_SIZE) { - gridTiles = THCCeilDiv(gridTiles, (long) MAX_GRID_SIZE); + gridTiles = THCCeilDiv(gridTiles, (ptrdiff_t) MAX_GRID_SIZE); gridZ = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles; } } diff --git a/lib/THC/THCReduceApplyUtils.cuh b/lib/THC/THCReduceApplyUtils.cuh index 06f969f..e365b3a 100644 --- a/lib/THC/THCReduceApplyUtils.cuh +++ b/lib/THC/THCReduceApplyUtils.cuh @@ -76,6 +76,6 @@ __device__ T reduceBlock(T* smem, void THCCheckTensorDims(THCState* state, THCudaTensor* tensor, int arg); // Produces a grid with at least one point per tile -THC_API bool THC_getGridFromTiles(long gridTiles, dim3& grid); +THC_API bool THC_getGridFromTiles(ptrdiff_t gridTiles, dim3& grid); #endif // THC_REDUCE_APPLY_UTILS_INC diff --git a/lib/THC/THCTensorConv.cu b/lib/THC/THCTensorConv.cu index 2543d26..71aac03 100644 --- a/lib/THC/THCTensorConv.cu +++ b/lib/THC/THCTensorConv.cu @@ -354,7 +354,7 @@ THC_API void THCudaTensor_conv2Dmv(THCState *state, THCudaTensor *output, float nOutputCols = (nInputCols - nKernelCols) / scol + 1; } - long nelem = THCudaTensor_nElement(state, output); + ptrdiff_t nelem = THCudaTensor_nElement(state, output); THCudaTensor_resize3d(state, output, nOutputPlane, nOutputRows, nOutputCols); if (beta == 0 || nelem != THCudaTensor_nElement(state, output)) { @@ -475,7 +475,7 @@ THC_API void THCudaTensor_conv2Dmm(THCState *state, THCudaTensor *output, float nOutputCols = (nInputCols - nKernelCols) / scol + 1; } - long nelem = THCudaTensor_nElement(state, output); + ptrdiff_t nelem = THCudaTensor_nElement(state, output); THCudaTensor_resize4d(state, output, nbatch, nOutputPlane, nOutputRows, nOutputCols); if (beta == 0 || nelem != THCudaTensor_nElement(state, output)) { @@ -576,7 +576,7 @@ THC_API void THCudaTensor_conv2DRevger(THCState *state, THCudaTensor *output, fl nOutputRows = nInputRows - (nKernelRows - 1) * srow; nOutputCols = nInputCols - (nKernelCols - 1) * scol; - long nelem = THCudaTensor_nElement(state, output); + ptrdiff_t nelem = THCudaTensor_nElement(state, output); THCudaTensor_resize4d(state, output, nKernelPlane, nInputPlane, nOutputRows, nOutputCols); if (nelem == 0 || beta == 0 || nelem != THCudaTensor_nElement(state, output)) { @@ -649,7 +649,7 @@ THC_API void THCudaTensor_conv2DRevgerm(THCState *state, THCudaTensor *output, f nOutputRows = nInputRows - (nKernelRows - 1) * srow; nOutputCols = nInputCols - (nKernelCols - 1) * scol; - long nelem = THCudaTensor_nElement(state, output); + ptrdiff_t nelem = THCudaTensor_nElement(state, output); THCudaTensor_resize4d(state, output, nKernelPlane, nInputPlane, nOutputRows, nOutputCols); if (nelem == 0 || beta == 0 || nelem != THCudaTensor_nElement(state, output)) { @@ -912,7 +912,7 @@ THC_API void THCudaTensor_conv2Dmap(THCState *state, THCudaTensor *output, THCud nOutputRows = (nInputRows - nKernelRows) / stride_y + 1; nOutputCols = (nInputCols - nKernelCols) / stride_x + 1; - // long nelem = THCudaTensor_nElement(state, output); + // ptrdiff_t nelem = THCudaTensor_nElement(state, output); THCudaTensor_resize3d(state, output, nOutputPlane, nOutputRows, nOutputCols); float *input_data = THCudaTensor_data(state, input); diff --git a/lib/THC/THCTensorCopy.cu b/lib/THC/THCTensorCopy.cu index d5aae4f..b0001c5 100644 --- a/lib/THC/THCTensorCopy.cu +++ b/lib/THC/THCTensorCopy.cu @@ -24,7 +24,7 @@ struct CopyOp { template <typename TensorTypeDst, typename TensorTypeSrc> void THC_copyTensor(THCState* state, TensorTypeDst* dst, TensorTypeSrc* src) { - long totalElements = TensorUtils<TensorTypeDst>::getNumElements(state, dst); + ptrdiff_t totalElements = TensorUtils<TensorTypeDst>::getNumElements(state, dst); THArgCheck(totalElements == TensorUtils<TensorTypeSrc>::getNumElements(state, src), diff --git a/lib/THC/THCTensorIndex.cu b/lib/THC/THCTensorIndex.cu index 8bf1a88..415e625 100644 --- a/lib/THC/THCTensorIndex.cu +++ b/lib/THC/THCTensorIndex.cu @@ -9,6 +9,7 @@ #include "THCReduce.cuh" #include "THCDeviceUtils.cuh" #include "THCNumerics.cuh" +#include "THCAtomics.cuh" #include <algorithm> // for std::min // We prefer this kernel to avoid reloading index points if the number @@ -96,131 +97,6 @@ __global__ void indexCopyLargeIndex(TensorInfo<T, IndexType> dst, } } -template <typename T, size_t n> -struct AtomicAddIntegerImpl; - -template<typename T> -struct AtomicAddIntegerImpl<T, 1> { - __device__ void operator()(T *address, T val) { - unsigned int * address_as_ui = - (unsigned int *) (address - ((size_t)address & 3)); - unsigned int old = *address_as_ui; - unsigned int shift = (((size_t)address & 3) * 8); - unsigned int sum; - unsigned int assumed; - - do { - assumed = old; - sum = val + T((old >> shift) & 0xff); - old = (old & ~(0x000000ff << shift)) | (sum << shift); - old = atomicCAS(address_as_ui, assumed, old); - } while (assumed != old); - } -}; - -template<typename T> -struct AtomicAddIntegerImpl<T, 2> { - __device__ void operator()(T *address, T val) { - unsigned int * address_as_ui = - (unsigned int *) ((char *)address - ((size_t)address & 2)); - unsigned int old = *address_as_ui; - unsigned int sum; - unsigned int newval; - unsigned int assumed; - - do { - assumed = old; - sum = val + (size_t)address & 2 ? T(old >> 16) : T(old & 0xffff); - newval = (size_t)address & 2 ? (old & 0xffff) | (sum << 16) : (old & 0xffff0000) | sum; - old = atomicCAS(address_as_ui, assumed, newval); - } while (assumed != old); - } -}; - -template<typename T> -struct AtomicAddIntegerImpl<T, 4> { - __device__ void operator()(T *address, T val) { - unsigned int * address_as_ui = (unsigned int *) (address); - unsigned int old = *address_as_ui; - unsigned int newval; - unsigned int assumed; - - do { - assumed = old; - newval = val + (T)old; - old = atomicCAS(address_as_ui, assumed, newval); - } while (assumed != old); - } -}; - -template<typename T> -struct AtomicAddIntegerImpl<T, 8> { - __device__ void operator()(T *address, T val) { - unsigned long long * address_as_ui = (unsigned long long *) (address); - unsigned long long old = *address_as_ui; - unsigned long long newval; - unsigned long long assumed; - - do { - assumed = old; - newval = val + (T)old; - old = atomicCAS(address_as_ui, assumed, newval); - } while (assumed != old); - } -}; - -__device__ void atomicAdd(unsigned char *address, unsigned char val) { - AtomicAddIntegerImpl<unsigned char, sizeof(unsigned char)>()(address, val); -} - -__device__ void atomicAdd(char *address, char val) { - AtomicAddIntegerImpl<char, sizeof(char)>()(address, val); -} - -__device__ void atomicAdd(short *address, short val) { - AtomicAddIntegerImpl<short, sizeof(short)>()(address, val); -} - -__device__ void atomicAdd(long *address, long val) { - AtomicAddIntegerImpl<long, sizeof(long)>()(address, val); -} - -#ifdef CUDA_HALF_TENSOR -__device__ void atomicAdd(half *address, half val) { - unsigned int * address_as_ui = - (unsigned int *) ((char *)address - ((size_t)address & 2)); - unsigned int old = *address_as_ui; - unsigned int assumed; - - do { - assumed = old; - half hsum; - hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - hsum = THCNumerics<half>::add(hsum, val); - old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; - old = atomicCAS(address_as_ui, assumed, old); - } while (assumed != old); -} -#endif - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 -// from CUDA C Programmic Guide -__device__ void atomicAdd(double *address, double val) { - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull; - unsigned long long int assumed; - - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + - __longlong_as_double(assumed))); - - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) - } while (assumed != old); -} -#endif - // We prefer this kernel to avoid reloading index points if the number // of indices is a small number. // This kernel in fact works for all choices of problem size, but if diff --git a/lib/THC/THCTensorMath2.cu b/lib/THC/THCTensorMath2.cu index afd262d..d1fe328 100644 --- a/lib/THC/THCTensorMath2.cu +++ b/lib/THC/THCTensorMath2.cu @@ -72,7 +72,7 @@ float THCudaTensor_dist(THCState *state, THCudaTensor *self, THCudaTensor *src, { THAssert(THCudaTensor_checkGPU(state, 2, self, src)); self = THCudaTensor_newContiguous(state, self); - long size = THCudaTensor_nElement(state, self); + ptrdiff_t size = THCudaTensor_nElement(state, self); src = THCudaTensor_newContiguous(state, src); thrust::device_ptr<float> self_data(THCudaTensor_data(state, self)); thrust::device_ptr<float> src_data(THCudaTensor_data(state, src)); @@ -103,4 +103,3 @@ void THCudaTensor_randn(THCState *state, THCudaTensor *r_, THLongStorage *size) THCudaTensor_resize(state, r_, size, NULL); THCudaTensor_normal(state, r_, 0, 1); } - diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu index e7d0573..2695f2d 100644 --- a/lib/THC/THCTensorMathPairwise.cu +++ b/lib/THC/THCTensorMathPairwise.cu @@ -344,21 +344,21 @@ void THCudaTensor_triu(THCState *state, THCudaTensor *self_, THCudaTensor *src_, #include "THCGenerateAllTypes.h" // Copy the kth diagonal of a matrix B to a vector A. -__global__ void THCudaTensor_copyFromDiagonal(float* a, float* b, long start, long size, long strideSum, long strideA) { - for (long linearIndex = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void THCudaTensor_copyFromDiagonal(float* a, float* b, ptrdiff_t start, ptrdiff_t size, ptrdiff_t strideSum, ptrdiff_t strideA) { + for (ptrdiff_t linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < size; linearIndex += gridDim.x * blockDim.x) { - const long bOffset = start + strideSum * linearIndex; + const ptrdiff_t bOffset = start + strideSum * linearIndex; a[strideA * linearIndex] = b[bOffset]; } } // Copy vector B to the kth diagonal of a matrix A -__global__ void THCudaTensor_copyToDiagonal(float* a, float* b, long start, long size, long strideSum, long strideB) { - for (long linearIndex = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void THCudaTensor_copyToDiagonal(float* a, float* b, ptrdiff_t start, ptrdiff_t size, ptrdiff_t strideSum, ptrdiff_t strideB) { + for (ptrdiff_t linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < size; linearIndex += gridDim.x * blockDim.x) { - const long aOffset = start + strideSum * linearIndex; + const ptrdiff_t aOffset = start + strideSum * linearIndex; a[aOffset] = b[strideB * linearIndex]; } } @@ -381,16 +381,16 @@ void THCudaTensor_diag(THCState *state, THCudaTensor *self_, THCudaTensor *src_, THCudaTensor_copyFromDiagonal<<<grid, threads, 0, THCState_getCurrentStream(state)>>> (THCudaTensor_data(state, self_), THCudaTensor_data(state, src_), start, size, stride0 + stride1, strideSelf); } else { - long totalElements = THCudaTensor_nElement(state, src_); - long size = (k > 0) ? totalElements + k : totalElements - k; + ptrdiff_t totalElements = THCudaTensor_nElement(state, src_); + ptrdiff_t size = (k > 0) ? totalElements + k : totalElements - k; long strideSrc = THCudaTensor_stride(state, src_, 0); THCudaTensor_resize2d(state, self_, size, size); THCudaTensor_zero(state, self_); long stride0 = THCudaTensor_stride(state, self_, 0); long stride1 = THCudaTensor_stride(state, self_, 1); const dim3 threads(min((long long)THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock, (long long)size)); - dim3 grid(min((long long)1024, (long long)THCCeilDiv(size, (long)threads.x))); - long start = (k >= 0 ? k * stride1 : -k * stride0); + dim3 grid(min((long long)1024, (long long)THCCeilDiv(size, (ptrdiff_t)threads.x))); + ptrdiff_t start = (k >= 0 ? k * stride1 : -k * stride0); THCudaTensor_copyToDiagonal<<<grid, threads, 0, THCState_getCurrentStream(state)>>> (THCudaTensor_data(state, self_), THCudaTensor_data(state, src_), start, totalElements, stride0 + stride1, strideSrc); } diff --git a/lib/THC/THCTensorMathReduce.cuh b/lib/THC/THCTensorMathReduce.cuh index 8e368be..db2e424 100644 --- a/lib/THC/THCTensorMathReduce.cuh +++ b/lib/THC/THCTensorMathReduce.cuh @@ -123,7 +123,7 @@ struct LogicalAny { }; template<typename Real> -__global__ void THCTensor_kernel_renorm(Real *data, const Real value, const long size, const Real maxnorm) +__global__ void THCTensor_kernel_renorm(Real *data, const Real value, const ptrdiff_t size, const Real maxnorm) { __shared__ Real buffer[32]; long tx = threadIdx.x; @@ -134,7 +134,7 @@ __global__ void THCTensor_kernel_renorm(Real *data, const Real value, const long buffer[tx] = ScalarConvert<int, Real>::to(0); // get norm of axis - for (long i=tx; i<size; i+=step) + for (ptrdiff_t i=tx; i<size; i+=step) { buffer[tx] = THCNumerics<Real>::add( buffer[tx], @@ -163,7 +163,7 @@ __global__ void THCTensor_kernel_renorm(Real *data, const Real value, const long ) ); // renormalize - for (long i=tx; i<size; i+=step) + for (ptrdiff_t i=tx; i<size; i+=step) { row[i] = THCNumerics<Real>::mul(row[i], norm); } @@ -326,7 +326,7 @@ __host__ void THCTensor_varOuterDim(THCState *state, TensorTypeK *tgt, TensorTyp unsigned ndim = TensorUtils<TensorTypeK>::getDims(state, src); // Treat all outer dimensions (i.e. dim < dimension) as one. unsigned num_orows = 1; - for (unsigned dim = 0; dim < dimension; dim++) { + for (long dim = 0; dim < dimension; dim++) { num_orows *= TensorUtils<TensorTypeK>::getSize(state, src, dim); } unsigned row_size = TensorUtils<TensorTypeK>::getSize(state, src, dimension); diff --git a/lib/THC/THCTensorRandom.cu b/lib/THC/THCTensorRandom.cu index 769a8ba..05b41b3 100644 --- a/lib/THC/THCTensorRandom.cu +++ b/lib/THC/THCTensorRandom.cu @@ -238,13 +238,13 @@ __global__ void generate_log_normal(curandStateMtgp32 *state, int size, float *r } } -#define NUM_BLOCKS min((int)THCCeilDiv(size, (long) BLOCK_SIZE), MAX_NUM_BLOCKS) +#define NUM_BLOCKS min(THCCeilDiv(size, (ptrdiff_t) BLOCK_SIZE), (ptrdiff_t) MAX_NUM_BLOCKS) THC_API void THCudaTensor_uniform(THCState* state, THCudaTensor *self_, double a, double b) { THAssert(THCudaTensor_checkGPU(state, 1, self_)); Generator* gen = THCRandom_getGenerator(state); THCudaTensor *self = THCudaTensor_newContiguous(state, self_); - long size = THCudaTensor_nElement(state, self); + ptrdiff_t size = THCudaTensor_nElement(state, self); float *data = THCudaTensor_data(state, self); generate_uniform<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>( @@ -258,7 +258,7 @@ THC_API void THCudaTensor_bernoulli(THCState* state, THCudaTensor *self_, double THAssert(THCudaTensor_checkGPU(state, 1, self_)); Generator* gen = THCRandom_getGenerator(state); THCudaTensor *self = THCudaTensor_newContiguous(state, self_); - long size = THCudaTensor_nElement(state, self); + ptrdiff_t size = THCudaTensor_nElement(state, self); float *data = THCudaTensor_data(state, self); generate_bernoulli<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>( @@ -272,7 +272,7 @@ THC_API void THCudaTensor_normal(THCState* state, THCudaTensor *self_, double me THAssert(THCudaTensor_checkGPU(state, 1, self_)); Generator* gen = THCRandom_getGenerator(state); THCudaTensor *self = THCudaTensor_newContiguous(state, self_); - long size = THCudaTensor_nElement(state, self); + ptrdiff_t size = THCudaTensor_nElement(state, self); float *data = THCudaTensor_data(state, self); generate_normal<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>( @@ -287,7 +287,7 @@ THC_API void THCudaTensor_logNormal(THCState* state, THCudaTensor *self_, double Generator* gen = THCRandom_getGenerator(state); THCudaTensor *self = THCudaTensor_newContiguous(state, self_); - long size = THCudaTensor_nElement(state, self); + ptrdiff_t size = THCudaTensor_nElement(state, self); float *data = THCudaTensor_data(state, self); generate_log_normal<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>( @@ -302,7 +302,7 @@ THC_API void THCudaTensor_geometric(THCState* state, THCudaTensor *self_, double Generator* gen = THCRandom_getGenerator(state); THCudaTensor *self = THCudaTensor_newContiguous(state, self_); - long size = THCudaTensor_nElement(state, self); + ptrdiff_t size = THCudaTensor_nElement(state, self); float *data = THCudaTensor_data(state, self); generate_geometric<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>( @@ -317,7 +317,7 @@ THC_API void THCudaTensor_exponential(THCState* state, THCudaTensor *self_, doub Generator* gen = THCRandom_getGenerator(state); THCudaTensor *self = THCudaTensor_newContiguous(state, self_); - long size = THCudaTensor_nElement(state, self); + ptrdiff_t size = THCudaTensor_nElement(state, self); float *data = THCudaTensor_data(state, self); generate_exponential<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>( @@ -332,7 +332,7 @@ THC_API void THCudaTensor_cauchy(THCState* state, THCudaTensor *self_, double me Generator* gen = THCRandom_getGenerator(state); THCudaTensor *self = THCudaTensor_newContiguous(state, self_); - long size = THCudaTensor_nElement(state, self); + ptrdiff_t size = THCudaTensor_nElement(state, self); float *data = THCudaTensor_data(state, self); generate_cauchy<<<NUM_BLOCKS, BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>( @@ -704,7 +704,7 @@ THC_API void THCudaTensor_multinomial(struct THCState *state, // Each warp in a block will generate a sample from a different // distribution concurrently. - long numBlocks = THCCeilDiv(numDist, 4L); + ptrdiff_t numBlocks = THCCeilDiv(numDist, 4L); dim3 grid(numBlocks < MAX_NUM_BLOCKS ? numBlocks : MAX_NUM_BLOCKS); for (int sample = 0; sample < n_sample; ++sample) { diff --git a/lib/THC/THCTensorSort.cu b/lib/THC/THCTensorSort.cu index f2dd0e9..589d3e9 100644 --- a/lib/THC/THCTensorSort.cu +++ b/lib/THC/THCTensorSort.cu @@ -22,9 +22,9 @@ void THCudaLongTensor_fillSliceWithIndex(THCState* state, long dims = THCudaLongTensor_nDimension(state, t); THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - long inElements = THCudaLongTensor_nElement(state, t); + ptrdiff_t inElements = THCudaLongTensor_nElement(state, t); long sliceSize = THCudaLongTensor_size(state, t, dim); - long numSlices = inElements / sliceSize; + ptrdiff_t numSlices = inElements / sliceSize; dim3 grid; if (!THC_getGridFromTiles(numSlices, grid)) { diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu index 96cd3bc..a273a72 100644 --- a/lib/THC/THCTensorTypeUtils.cu +++ b/lib/THC/THCTensorTypeUtils.cu @@ -80,7 +80,7 @@ TensorUtils<TENSOR_TYPE>::getData(THCState* state, \ return (DATA_TYPE*) TENSOR_TYPE##_data(state, t); \ } \ \ -long \ +ptrdiff_t \ TensorUtils<TENSOR_TYPE>::getNumElements(THCState* state, \ TENSOR_TYPE* t) { \ return TENSOR_TYPE##_nElement(state, t); \ @@ -185,18 +185,18 @@ TensorUtils<TENSOR_TYPE>::overlappingIndices(THCState* state, \ bool \ TensorUtils<TENSOR_TYPE>::canUse32BitIndexMath(THCState* state, \ TENSOR_TYPE* t) { \ - long elements = TensorUtils<TENSOR_TYPE>::getNumElements(state, t); \ + ptrdiff_t elements = TensorUtils<TENSOR_TYPE>::getNumElements(state, t); \ if (elements >= UINT_MAX) { \ return false; \ } \ \ - long offset = 0; \ - long linearId = elements - 1; \ + ptrdiff_t offset = 0; \ + ptrdiff_t linearId = elements - 1; \ \ for (int i = TensorUtils<TENSOR_TYPE>::getDims(state, t) - 1; i >= 0; --i) { \ - long curDimIndex = \ + ptrdiff_t curDimIndex = \ linearId % TensorUtils<TENSOR_TYPE>::getSize(state, t, i); \ - long curDimOffset = curDimIndex * \ + ptrdiff_t curDimOffset = curDimIndex * \ TensorUtils<TENSOR_TYPE>::getStride(state, t, i); \ offset += curDimOffset; \ linearId /= TensorUtils<TENSOR_TYPE>::getSize(state, t, i); \ diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh index 22a2f92..4f5d516 100644 --- a/lib/THC/THCTensorTypeUtils.cuh +++ b/lib/THC/THCTensorTypeUtils.cuh @@ -50,7 +50,7 @@ struct TensorUtils { static void resizeAs(THCState* state, TENSOR_TYPE* dst, \ TENSOR_TYPE* src); \ static DATA_TYPE* getData(THCState* state, TENSOR_TYPE* t); \ - static long getNumElements(THCState* state, TENSOR_TYPE* t); \ + static ptrdiff_t getNumElements(THCState* state, TENSOR_TYPE* t); \ static long getSize(THCState* state, TENSOR_TYPE* t, int dim); \ static long getStride(THCState* state, TENSOR_TYPE* t, int dim); \ static int getDims(THCState* state, TENSOR_TYPE* t); \ diff --git a/lib/THC/generic/THCStorage.c b/lib/THC/generic/THCStorage.c index 61ba125..ad68526 100644 --- a/lib/THC/generic/THCStorage.c +++ b/lib/THC/generic/THCStorage.c @@ -7,7 +7,7 @@ real* THCStorage_(data)(THCState *state, const THCStorage *self) return self->data; } -long THCStorage_(size)(THCState *state, const THCStorage *self) +ptrdiff_t THCStorage_(size)(THCState *state, const THCStorage *self) { return self->size; } @@ -17,13 +17,13 @@ int THCStorage_(elementSize)(THCState *state) return sizeof(real); } -void THCStorage_(set)(THCState *state, THCStorage *self, long index, real value) +void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, real value) { THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); THCudaCheck(cudaMemcpy(self->data + index, &value, sizeof(real), cudaMemcpyHostToDevice)); } -real THCStorage_(get)(THCState *state, const THCStorage *self, long index) +real THCStorage_(get)(THCState *state, const THCStorage *self, ptrdiff_t index) { THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); real value; @@ -41,7 +41,7 @@ THCStorage* THCStorage_(new)(THCState *state) return storage; } -THCStorage* THCStorage_(newWithSize)(THCState *state, long size) +THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size) { THArgCheck(size >= 0, 2, "invalid size"); @@ -103,13 +103,13 @@ THCStorage* THCStorage_(newWithSize4)(THCState *state, real data0, real data1, r return self; } -THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *fileName, long size, int isShared) +THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *fileName, ptrdiff_t size, int isShared) { THError("not available yet for THCStorage"); return NULL; } -THCStorage* THCStorage_(newWithData)(THCState *state, real *data, long size) +THCStorage* THCStorage_(newWithData)(THCState *state, real *data, ptrdiff_t size) { THCStorage *storage = (THCStorage*)THAlloc(sizeof(THCStorage)); storage->data = data; diff --git a/lib/THC/generic/THCStorage.cu b/lib/THC/generic/THCStorage.cu index a6bb29a..63bccd7 100644 --- a/lib/THC/generic/THCStorage.cu +++ b/lib/THC/generic/THCStorage.cu @@ -12,7 +12,7 @@ void THCStorage_(fill)(THCState *state, THCStorage *self, real value) self_data, self_data+self->size, value); } -void THCStorage_(resize)(THCState *state, THCStorage *self, long size) +void THCStorage_(resize)(THCState *state, THCStorage *self, ptrdiff_t size) { THArgCheck(size >= 0, 2, "invalid size"); diff --git a/lib/THC/generic/THCStorage.h b/lib/THC/generic/THCStorage.h index 194ad62..a46caad 100644 --- a/lib/THC/generic/THCStorage.h +++ b/lib/THC/generic/THCStorage.h @@ -9,7 +9,7 @@ typedef struct THCStorage { real *data; - long size; + ptrdiff_t size; int refcount; char flag; THAllocator *allocator; @@ -19,36 +19,36 @@ typedef struct THCStorage THC_API real* THCStorage_(data)(THCState *state, const THCStorage*); -THC_API long THCStorage_(size)(THCState *state, const THCStorage*); +THC_API ptrdiff_t THCStorage_(size)(THCState *state, const THCStorage*); THC_API int THCStorage_(elementSize)(THCState *state); /* slow access -- checks everything */ -THC_API void THCStorage_(set)(THCState *state, THCStorage*, long, real); -THC_API real THCStorage_(get)(THCState *state, const THCStorage*, long); +THC_API void THCStorage_(set)(THCState *state, THCStorage*, ptrdiff_t, real); +THC_API real THCStorage_(get)(THCState *state, const THCStorage*, ptrdiff_t); THC_API THCStorage* THCStorage_(new)(THCState *state); -THC_API THCStorage* THCStorage_(newWithSize)(THCState *state, long size); +THC_API THCStorage* THCStorage_(newWithSize)(THCState *state, ptrdiff_t size); THC_API THCStorage* THCStorage_(newWithSize1)(THCState *state, real); THC_API THCStorage* THCStorage_(newWithSize2)(THCState *state, real, real); THC_API THCStorage* THCStorage_(newWithSize3)(THCState *state, real, real, real); THC_API THCStorage* THCStorage_(newWithSize4)(THCState *state, real, real, real, real); -THC_API THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *filename, long size, int shared); +THC_API THCStorage* THCStorage_(newWithMapping)(THCState *state, const char *filename, ptrdiff_t size, int shared); /* takes ownership of data */ -THC_API THCStorage* THCStorage_(newWithData)(THCState *state, real *data, long size); +THC_API THCStorage* THCStorage_(newWithData)(THCState *state, real *data, ptrdiff_t size); -THC_API THCStorage* THCStorage_(newWithAllocator)(THCState *state, long size, +THC_API THCStorage* THCStorage_(newWithAllocator)(THCState *state, ptrdiff_t size, THAllocator* allocator, void *allocatorContext); THC_API THCStorage* THCStorage_(newWithDataAndAllocator)( - THCState *state, real* data, long size, THAllocator* allocator, void *allocatorContext); + THCState *state, real* data, ptrdiff_t size, THAllocator* allocator, void *allocatorContext); THC_API void THCStorage_(setFlag)(THCState *state, THCStorage *storage, const char flag); THC_API void THCStorage_(clearFlag)(THCState *state, THCStorage *storage, const char flag); THC_API void THCStorage_(retain)(THCState *state, THCStorage *storage); THC_API void THCStorage_(free)(THCState *state, THCStorage *storage); -THC_API void THCStorage_(resize)(THCState *state, THCStorage *storage, long size); +THC_API void THCStorage_(resize)(THCState *state, THCStorage *storage, ptrdiff_t size); THC_API void THCStorage_(fill)(THCState *state, THCStorage *storage, real value); THC_API int THCStorage_(getDevice)(THCState* state, const THCStorage* storage); diff --git a/lib/THC/generic/THCTensor.c b/lib/THC/generic/THCTensor.c index e18044d..f6c82b5 100644 --- a/lib/THC/generic/THCTensor.c +++ b/lib/THC/generic/THCTensor.c @@ -8,7 +8,7 @@ THCStorage *THCTensor_(storage)(THCState *state, const THCTensor *self) return self->storage; } -long THCTensor_(storageOffset)(THCState *state, const THCTensor *self) +ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self) { return self->storageOffset; } @@ -65,7 +65,7 @@ void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char flag) /**** creation methods ****/ static void THCTensor_(rawInit)(THCState *state, THCTensor *self); -static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, long storageOffset, int nDimension, long *size, long *stride); +static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride); /* Empty init */ @@ -92,7 +92,7 @@ THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor) } /* Storage init */ -THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, long storageOffset, THLongStorage *size, THLongStorage *stride) +THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, THLongStorage *size, THLongStorage *stride) { THCTensor *self = (THCTensor*)THAlloc(sizeof(THCTensor)); if(size && stride) @@ -109,20 +109,20 @@ THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage, long return self; } -THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage, long storageOffset, +THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, long size0, long stride0) { return THCTensor_(newWithStorage4d)(state, storage, storageOffset, size0, stride0, -1, -1, -1, -1, -1, -1); } -THCTensor *THCTensor_(newWithStorage2d)(THCState *state, THCStorage *storage, long storageOffset, +THCTensor *THCTensor_(newWithStorage2d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, long size0, long stride0, long size1, long stride1) { return THCTensor_(newWithStorage4d)(state, storage, storageOffset, size0, stride0, size1, stride1, -1, -1, -1, -1); } -THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage, long storageOffset, +THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, long size0, long stride0, long size1, long stride1, long size2, long stride2) @@ -130,7 +130,7 @@ THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage, lo return THCTensor_(newWithStorage4d)(state, storage, storageOffset, size0, stride0, size1, stride1, size2, stride2, -1, -1); } -THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage, long storageOffset, +THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage, ptrdiff_t storageOffset, long size0, long stride0, long size1, long stride1, long size2, long stride2, @@ -296,7 +296,7 @@ void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src) src->stride); } -void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_, THLongStorage *size_, THLongStorage *stride_) +void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_) { if(size_ && stride_) THArgCheck(size_->size == stride_->size, 5, "inconsistent size/stride sizes"); @@ -310,7 +310,7 @@ void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storag (stride_ ? stride_->data : NULL)); } -void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_, +void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, long size0_, long stride0_) { THCTensor_(setStorage4d)(state, self, storage_, storageOffset_, @@ -320,7 +320,7 @@ void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *stor -1, -1); } -void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_, +void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, long size0_, long stride0_, long size1_, long stride1_) { @@ -331,7 +331,7 @@ void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *stor -1, -1); } -void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_, +void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, long size0_, long stride0_, long size1_, long stride1_, long size2_, long stride2_) @@ -343,7 +343,7 @@ void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *stor -1, -1); } -void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_, +void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, long size0_, long stride0_, long size1_, long stride1_, long size2_, long stride2_, @@ -578,13 +578,13 @@ int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTe return 1; } -long THCTensor_(nElement)(THCState *state, const THCTensor *self) +ptrdiff_t THCTensor_(nElement)(THCState *state, const THCTensor *self) { if(self->nDimension == 0) return 0; else { - long nElement = 1; + ptrdiff_t nElement = 1; int d; for(d = 0; d < self->nDimension; d++) nElement *= self->size[d]; @@ -637,7 +637,7 @@ static void THCTensor_(rawInit)(THCState *state, THCTensor *self) self->flag = TH_TENSOR_REFCOUNTED; } -static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, long storageOffset, int nDimension, long *size, long *stride) +static void THCTensor_(rawSet)(THCState *state, THCTensor *self, THCStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride) { /* storage */ if(self->storage != storage) @@ -667,7 +667,7 @@ void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, lon { int d; int nDimension_; - long totalSize; + ptrdiff_t totalSize; int hascorrectsize = 1; nDimension_ = 0; diff --git a/lib/THC/generic/THCTensor.h b/lib/THC/generic/THCTensor.h index 175eaee..3c5291e 100644 --- a/lib/THC/generic/THCTensor.h +++ b/lib/THC/generic/THCTensor.h @@ -11,7 +11,7 @@ typedef struct THCTensor int nDimension; THCStorage *storage; - long storageOffset; + ptrdiff_t storageOffset; int refcount; char flag; @@ -21,7 +21,7 @@ typedef struct THCTensor /**** access methods ****/ THC_API THCStorage* THCTensor_(storage)(THCState *state, const THCTensor *self); -THC_API long THCTensor_(storageOffset)(THCState *state, const THCTensor *self); +THC_API ptrdiff_t THCTensor_(storageOffset)(THCState *state, const THCTensor *self); THC_API int THCTensor_(nDimension)(THCState *state, const THCTensor *self); THC_API long THCTensor_(size)(THCState *state, const THCTensor *self, int dim); THC_API long THCTensor_(stride)(THCState *state, const THCTensor *self, int dim); @@ -37,17 +37,17 @@ THC_API void THCTensor_(clearFlag)(THCState *state, THCTensor *self, const char THC_API THCTensor *THCTensor_(new)(THCState *state); THC_API THCTensor *THCTensor_(newWithTensor)(THCState *state, THCTensor *tensor); /* stride might be NULL */ -THC_API THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage_, long storageOffset_, THLongStorage *size_, THLongStorage *stride_); -THC_API THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage_, long storageOffset_, +THC_API THCTensor *THCTensor_(newWithStorage)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_); +THC_API THCTensor *THCTensor_(newWithStorage1d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, long size0_, long stride0_); -THC_API THCTensor *THCTensor_(newWithStorage2d)(THCState *state, THCStorage *storage_, long storageOffset_, +THC_API THCTensor *THCTensor_(newWithStorage2d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, long size0_, long stride0_, long size1_, long stride1_); -THC_API THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage_, long storageOffset_, +THC_API THCTensor *THCTensor_(newWithStorage3d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, long size0_, long stride0_, long size1_, long stride1_, long size2_, long stride2_); -THC_API THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage_, long storageOffset_, +THC_API THCTensor *THCTensor_(newWithStorage4d)(THCState *state, THCStorage *storage_, ptrdiff_t storageOffset_, long size0_, long stride0_, long size1_, long stride1_, long size2_, long stride2_, @@ -77,17 +77,17 @@ THC_API void THCTensor_(resize5d)(THCState *state, THCTensor *tensor, long size0 THC_API void THCTensor_(rawResize)(THCState *state, THCTensor *self, int nDimension, long *size, long *stride); THC_API void THCTensor_(set)(THCState *state, THCTensor *self, THCTensor *src); -THC_API void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_, THLongStorage *size_, THLongStorage *stride_); -THC_API void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_, +THC_API void THCTensor_(setStorage)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_); +THC_API void THCTensor_(setStorage1d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, long size0_, long stride0_); -THC_API void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_, +THC_API void THCTensor_(setStorage2d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, long size0_, long stride0_, long size1_, long stride1_); -THC_API void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_, +THC_API void THCTensor_(setStorage3d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, long size0_, long stride0_, long size1_, long stride1_, long size2_, long stride2_); -THC_API void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *storage_, long storageOffset_, +THC_API void THCTensor_(setStorage4d)(THCState *state, THCTensor *self, THCStorage *storage_, ptrdiff_t storageOffset_, long size0_, long stride0_, long size1_, long stride1_, long size2_, long stride2_, @@ -105,7 +105,7 @@ THC_API int THCTensor_(isContiguous)(THCState *state, const THCTensor *self); THC_API int THCTensor_(isSameSizeAs)(THCState *state, const THCTensor *self, const THCTensor *src); THC_API int THCTensor_(isSetTo)(THCState *state, const THCTensor *self, const THCTensor *src); THC_API int THCTensor_(isSize)(THCState *state, const THCTensor *self, const THLongStorage *dims); -THC_API long THCTensor_(nElement)(THCState *state, const THCTensor *self); +THC_API ptrdiff_t THCTensor_(nElement)(THCState *state, const THCTensor *self); THC_API void THCTensor_(retain)(THCState *state, THCTensor *self); THC_API void THCTensor_(free)(THCState *state, THCTensor *self); diff --git a/lib/THC/generic/THCTensorIndex.cu b/lib/THC/generic/THCTensorIndex.cu index e17cebf..ce4c790 100644 --- a/lib/THC/generic/THCTensorIndex.cu +++ b/lib/THC/generic/THCTensorIndex.cu @@ -26,7 +26,7 @@ void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongT dims = THCudaLongTensor_nDimension(state, indices); THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING); - long numIndices = THCudaLongTensor_nElement(state, indices); + ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices); long srcDims = THCTensor_(nDimension)(state, src); cudaStream_t stream = THCState_getCurrentStream(state); @@ -44,9 +44,9 @@ void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongT // total size of the tensor ignoring dimension `dim`; // -the number of indices we are choosing, which is the total size // of the tensor `indices`. - long srcTotalSize = THCTensor_(nElement)(state, src); + ptrdiff_t srcTotalSize = THCTensor_(nElement)(state, src); long dstCopyDimSize = THCTensor_(size)(state, dst, dim); - long sliceSize = srcTotalSize / numIndices; + ptrdiff_t sliceSize = srcTotalSize / numIndices; int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; @@ -62,11 +62,11 @@ void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongT dstInfo, srcInfo, indicesInfo, \ dstCopyDim, srcCopyDim, sliceSize, dstCopyDimSize); - dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8))); - dim3 smallIndexBlock(std::min(sliceSize, 128L)); + dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); - dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, 128L), (long)(mpc * 8))); - dim3 largeIndexBlock(std::min(srcTotalSize, 128L)); + dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(std::min(srcTotalSize, (ptrdiff_t)128)); if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, dst) && TensorUtils<THCTensor>::canUse32BitIndexMath(state, src) && @@ -154,7 +154,7 @@ void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTe dims = THCudaLongTensor_nDimension(state, indices); THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING); - long numIndices = THCudaLongTensor_nElement(state, indices); + ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices); long srcDims = THCTensor_(nDimension)(state, src); cudaStream_t stream = THCState_getCurrentStream(state); @@ -172,9 +172,9 @@ void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTe // total size of the tensor ignoring dimension `dim`; // -the number of indices we are choosing, which is the total size // of the tensor `indices`. - long srcTotalSize = THCTensor_(nElement)(state, src); + ptrdiff_t srcTotalSize = THCTensor_(nElement)(state, src); long dstAddDimSize = THCTensor_(size)(state, dst, dim); - long sliceSize = srcTotalSize / numIndices; + ptrdiff_t sliceSize = srcTotalSize / numIndices; int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; @@ -190,11 +190,11 @@ void THCTensor_(indexAdd)(THCState *state, THCTensor *dst, int dim, THCudaLongTe dstInfo, srcInfo, indicesInfo, \ dstAddDim, srcAddDim, sliceSize, dstAddDimSize); - dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8))); - dim3 smallIndexBlock(std::min(sliceSize, 128L)); + dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); - dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, 128L), (long)(mpc * 8))); - dim3 largeIndexBlock(std::min(srcTotalSize, 128L)); + dim3 largeIndexGrid(std::min(THCCeilDiv(srcTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(std::min(srcTotalSize, (ptrdiff_t)128)); if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, dst) && TensorUtils<THCTensor>::canUse32BitIndexMath(state, src) && @@ -279,7 +279,7 @@ void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongT dims = THCudaLongTensor_nDimension(state, indices); THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING); - long numIndices = THCudaLongTensor_nElement(state, indices); + ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices); long srcDims = THCTensor_(nDimension)(state, dst); cudaStream_t stream = THCState_getCurrentStream(state); @@ -296,9 +296,9 @@ void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongT // total size of the tensor ignoring dimension `dim`; // -the number of indices we are choosing, which is the total size // of the tensor `indices`. - long dstTotalSize = THCTensor_(nElement)(state, dst); + ptrdiff_t dstTotalSize = THCTensor_(nElement)(state, dst); long dstFillDimSize = THCTensor_(size)(state, dst, dim); - long sliceSize = dstTotalSize / dstFillDimSize; + ptrdiff_t sliceSize = dstTotalSize / dstFillDimSize; int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; @@ -314,11 +314,11 @@ void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongT dstInfo, indicesInfo, \ dstFillDim, sliceSize, dstFillDimSize, val); - dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8))); - dim3 smallIndexBlock(std::min(sliceSize, 128L)); + dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); - dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, 128L), (long)(mpc * 8))); - dim3 largeIndexBlock(std::min(dstTotalSize, 128L)); + dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(std::min(dstTotalSize, (ptrdiff_t)128)); if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, dst) && TensorUtils<THCudaLongTensor>::canUse32BitIndexMath(state, indices)) { @@ -396,7 +396,7 @@ void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, in dims = THCudaLongTensor_nDimension(state, indices); THArgCheck(dims <= MAX_CUTORCH_DIMS, 5, CUTORCH_DIM_WARNING); - long numIndices = THCudaLongTensor_nElement(state, indices); + ptrdiff_t numIndices = THCudaLongTensor_nElement(state, indices); long srcDims = THCTensor_(nDimension)(state, src); cudaStream_t stream = THCState_getCurrentStream(state); @@ -418,9 +418,9 @@ void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, in // total size of the tensor ignoring dimension `dim`; // -the number of indices we are choosing, which is the total size // of the tensor `indices`. - long dstTotalSize = THCTensor_(nElement)(state, dst); + ptrdiff_t dstTotalSize = THCTensor_(nElement)(state, dst); long srcSelectDimSize = THCTensor_(size)(state, src, dim); - long sliceSize = dstTotalSize / numIndices; + ptrdiff_t sliceSize = dstTotalSize / numIndices; int mpc = THCState_getCurrentDeviceProperties(state)->multiProcessorCount; @@ -436,11 +436,11 @@ void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, in dstInfo, srcInfo, indicesInfo, \ dstSelectDim, srcSelectDim, dstTotalSize, sliceSize, srcSelectDimSize); - dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, 128L), (long)(mpc * 8))); - dim3 smallIndexBlock(std::min(sliceSize, 128L)); + dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); - dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, 128L), (long)(mpc * 8))); - dim3 largeIndexBlock(std::min(dstTotalSize, 128L)); + dim3 largeIndexGrid(std::min(THCCeilDiv(dstTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(std::min(dstTotalSize, (ptrdiff_t)128)); if (TensorUtils<THCTensor>::canUse32BitIndexMath(state, dst) && TensorUtils<THCTensor>::canUse32BitIndexMath(state, src) && diff --git a/lib/THC/generic/THCTensorMasked.cu b/lib/THC/generic/THCTensorMasked.cu index e6a5704..333af29 100644 --- a/lib/THC/generic/THCTensorMasked.cu +++ b/lib/THC/generic/THCTensorMasked.cu @@ -38,16 +38,16 @@ THCTensor_(maskedCopy)(THCState* state, THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src) { THAssert(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); - long maskSize = THCudaByteTensor_nElement(state, mask); - long tensorSize = THCTensor_(nElement)(state, tensor); - long srcSize = THCTensor_(nElement)(state, src); + ptrdiff_t maskSize = THCudaByteTensor_nElement(state, mask); + ptrdiff_t tensorSize = THCTensor_(nElement)(state, tensor); + ptrdiff_t srcSize = THCTensor_(nElement)(state, src); // `mask` and `tensor` must have the same number of elements THArgCheck(maskSize == tensorSize, 2, "mask and tensor must have the same number of elements"); // Determine our output size - long totalElements = THCudaByteTensor_sumall(state, mask); + ptrdiff_t totalElements = THCudaByteTensor_sumall(state, mask); // The number of `1` elements present in the mask must be <= the // number of elements available in `src` @@ -121,7 +121,7 @@ THCTensor_(maskedSelect)(THCState* state, 2, "sizes do not match"); // Determine our output size - long totalElements = THCudaByteTensor_sumall(state, mask); + ptrdiff_t totalElements = THCudaByteTensor_sumall(state, mask); THCTensor* tensorContig = THCTensor_(newContiguous)(state, tensor); THCTensor_(resize1d)(state, tensorContig, totalElements); diff --git a/lib/THC/generic/THCTensorMath.cu b/lib/THC/generic/THCTensorMath.cu index a0e550a..557f8f5 100644 --- a/lib/THC/generic/THCTensorMath.cu +++ b/lib/THC/generic/THCTensorMath.cu @@ -59,7 +59,7 @@ THCTensor_(reshape)(THCState *state, THCTensor *r_, THCTensor *t, THLongStorage THCTensor_(copy)(state, r_, t); } -long +ptrdiff_t THCTensor_(numel)(THCState *state, THCTensor *t) { return THCTensor_(nElement)(state, t); diff --git a/lib/THC/generic/THCTensorMath.h b/lib/THC/generic/THCTensorMath.h index 5c9e66d..cfc706a 100644 --- a/lib/THC/generic/THCTensorMath.h +++ b/lib/THC/generic/THCTensorMath.h @@ -8,6 +8,6 @@ THC_API void THCTensor_(zero)(THCState *state, THCTensor *self); THC_API void THCTensor_(zeros)(THCState *state, THCTensor *r_, THLongStorage *size); THC_API void THCTensor_(ones)(THCState *state, THCTensor *r_, THLongStorage *size); THC_API void THCTensor_(reshape)(THCState *state, THCTensor *r_, THCTensor *t, THLongStorage *size); -THC_API long THCTensor_(numel)(THCState *state, THCTensor *t); +THC_API ptrdiff_t THCTensor_(numel)(THCState *state, THCTensor *t); #endif diff --git a/lib/THC/generic/THCTensorMathPointwise.cu b/lib/THC/generic/THCTensorMathPointwise.cu index 90ddfbc..2638504 100644 --- a/lib/THC/generic/THCTensorMathPointwise.cu +++ b/lib/THC/generic/THCTensorMathPointwise.cu @@ -108,7 +108,7 @@ THCTensor_(cross)(THCState *state, THCTensor *self, THCTensor *x, THCTensor *y, int i; long nd = THCTensor_(nDimension)(state, x); - long nelem = THCTensor_(nElement)(state, x); + ptrdiff_t nelem = THCTensor_(nElement)(state, x); THArgCheck(nd == THCTensor_(nDimension)(state, y), 1, "tensors must have same number of dimensions"); for (i = 0; i < nd; i++) { THArgCheck(THCTensor_(size)(state, x, i) == THCTensor_(size)(state, y, i), 1, "dimension %i of x and y does not match", i); diff --git a/lib/THC/generic/THCTensorMathReduce.cu b/lib/THC/generic/THCTensorMathReduce.cu index 502fa75..1e21d03 100644 --- a/lib/THC/generic/THCTensorMathReduce.cu +++ b/lib/THC/generic/THCTensorMathReduce.cu @@ -47,7 +47,7 @@ THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, real value, THCTensor *self_; THCTensor *src_ = THCTensor_(newTranspose)(state, src, dimension, 0); THCTensor *data = THCTensor_(newClone)(state, src_); - long size = THCTensor_(nElement)(state, data)/data->size[0]; + ptrdiff_t size = THCTensor_(nElement)(state, data)/data->size[0]; THArgCheck(dimension >= 0 && dimension < THCTensor_(nDimension)(state, src), 3, "invalid dimension"); THArgCheck(THCNumerics<real>::gt(value, ScalarConvert<int, real>::to(0)), 2, "non-positive-norm not supported"); @@ -138,7 +138,7 @@ THCTensor_(varall)(THCState *state, THCTensor *self) val = THCNumerics<accreal>::div( val, - ScalarConvert<int, accreal>::to(THCTensor_(nElement)(state, self) - 1) + ScalarConvert<ptrdiff_t, accreal>::to(THCTensor_(nElement)(state, self) - 1) ); THCudaCheck(cudaGetLastError()); diff --git a/lib/THC/generic/THCTensorScatterGather.cu b/lib/THC/generic/THCTensorScatterGather.cu index 7842d3d..c120f88 100644 --- a/lib/THC/generic/THCTensorScatterGather.cu +++ b/lib/THC/generic/THCTensorScatterGather.cu @@ -34,7 +34,7 @@ void THCTensor_(gather)(THCState* state, THCTensor *tensor, 1, CUTORCH_DIM_WARNING); - const long totalElements = THCudaLongTensor_nElement(state, index); + const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); const dim3 block = getApplyBlock(); dim3 grid; THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); @@ -126,7 +126,7 @@ void THCTensor_(scatter)(THCState* state, THCTensor *tensor, int dim, THCudaLong THArgCheck(THCTensor_(nDimension)(state, tensor) <= MAX_CUTORCH_DIMS, 1, CUTORCH_DIM_WARNING); - const long totalElements = THCudaLongTensor_nElement(state, index); + const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); const dim3 block = getApplyBlock(); dim3 grid; THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); @@ -211,7 +211,7 @@ THCTensor_(scatterFill)(THCState* state, THCTensor *tensor, THArgCheck(THCTensor_(nDimension)(state, tensor) <= MAX_CUTORCH_DIMS, 1, CUTORCH_DIM_WARNING); - const long totalElements = THCudaLongTensor_nElement(state, index); + const ptrdiff_t totalElements = THCudaLongTensor_nElement(state, index); const dim3 block = getApplyBlock(); dim3 grid; THArgCheck(getApplyGrid(state, totalElements, grid), 1, CUTORCH_DIM_WARNING); diff --git a/lib/THC/generic/THCTensorSort.cu b/lib/THC/generic/THCTensorSort.cu index 0116522..046c62b 100644 --- a/lib/THC/generic/THCTensorSort.cu +++ b/lib/THC/generic/THCTensorSort.cu @@ -18,9 +18,9 @@ THC_API void THCTensor_(sortKeyValueInplace)(THCState* state, dims = THCTensor_(nDimension)(state, key); THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - long inElements = THCTensor_(nElement)(state, key); + ptrdiff_t inElements = THCTensor_(nElement)(state, key); long keySliceSize = THCTensor_(size)(state, key, dim); - long keySlices = inElements / keySliceSize; + ptrdiff_t keySlices = inElements / keySliceSize; if (THCTensor_(nDimension)(state, key) == 0) { // Zero-dim tensor; do nothing @@ -160,7 +160,7 @@ void sortViaThrust(THCState* state, int dim, bool dir) { long nDims = THCTensor_(nDimension)(state, input); - long totalElements = THCTensor_(nElement)(state, input); + ptrdiff_t totalElements = THCTensor_(nElement)(state, input); long sliceSize = THCTensor_(size)(state, input, dim); long sliceStride = THCTensor_(stride)(state, input, dim); diff --git a/torch/generic/Storage.c b/torch/generic/Storage.c index c57c416..e5c0836 100644 --- a/torch/generic/Storage.c +++ b/torch/generic/Storage.c @@ -10,13 +10,13 @@ static int torch_Storage_(new)(lua_State *L) { const char *fileName = luaL_checkstring(L, 1); int isShared = luaT_optboolean(L, 2, 0); - long size = luaL_optlong(L, 3, 0); + ptrdiff_t size = luaL_optinteger(L, 3, 0); storage = THCStorage_(newWithMapping)(state, fileName, size, isShared); } else if(lua_type(L, 1) == LUA_TTABLE) { - long size = lua_objlen(L, 1); - long i; + ptrdiff_t size = lua_objlen(L, 1); + ptrdiff_t i; storage = THCStorage_(newWithSize)(state, size); for(i = 1; i <= size; i++) { @@ -39,11 +39,11 @@ static int torch_Storage_(new)(lua_State *L) { THCStorage *src = luaT_checkudata(L, 1, torch_Storage); real *ptr = src->data; - long offset = luaL_optlong(L, 2, 1) - 1; + ptrdiff_t offset = luaL_optinteger(L, 2, 1) - 1; if (offset < 0 || offset >= src->size) { luaL_error(L, "offset out of bounds"); } - long size = luaL_optlong(L, 3, src->size - offset); + ptrdiff_t size = luaL_optinteger(L, 3, src->size - offset); if (size < 1 || size > (src->size - offset)) { luaL_error(L, "size out of bounds"); } @@ -54,14 +54,14 @@ static int torch_Storage_(new)(lua_State *L) } else if(lua_type(L, 2) == LUA_TNUMBER) { - long size = luaL_optlong(L, 1, 0); + ptrdiff_t size = luaL_optinteger(L, 1, 0); real *ptr = (real *)luaL_optinteger(L, 2, 0); storage = THCStorage_(newWithData)(state, ptr, size); storage->flag = TH_STORAGE_REFCOUNTED; } else { - long size = luaL_optlong(L, 1, 0); + ptrdiff_t size = luaL_optinteger(L, 1, 0); storage = THCStorage_(newWithSize)(state, size); } luaT_pushudata(L, storage, torch_Storage); @@ -85,7 +85,7 @@ static int torch_Storage_(free)(lua_State *L) static int torch_Storage_(resize)(lua_State *L) { THCStorage *storage = luaT_checkudata(L, 1, torch_Storage); - long size = luaL_checklong(L, 2); + ptrdiff_t size = luaL_checkinteger(L, 2); /* int keepContent = luaT_optboolean(L, 3, 0); */ THCStorage_(resize)(cutorch_getstate(L), storage, size);/*, keepContent); */ lua_settop(L, 1); @@ -141,7 +141,7 @@ static int torch_Storage_(elementSize)(lua_State *L) static int torch_Storage_(__len__)(lua_State *L) { THCStorage *storage = luaT_checkudata(L, 1, torch_Storage); - lua_pushnumber(L, storage->size); + lua_pushinteger(L, storage->size); return 1; } @@ -150,7 +150,7 @@ static int torch_Storage_(__newindex__)(lua_State *L) if(lua_isnumber(L, 2)) { THCStorage *storage = luaT_checkudata(L, 1, torch_Storage); - long index = luaL_checklong(L, 2) - 1; + ptrdiff_t index = luaL_checkinteger(L, 2) - 1; double number = luaL_checknumber(L, 3); #ifdef THC_REAL_IS_HALF @@ -172,7 +172,7 @@ static int torch_Storage_(__index__)(lua_State *L) if(lua_isnumber(L, 2)) { THCStorage *storage = luaT_checkudata(L, 1, torch_Storage); - long index = luaL_checklong(L, 2) - 1; + ptrdiff_t index = luaL_checkinteger(L, 2) - 1; real v = THCStorage_(get)(cutorch_getstate(L), storage, index); #ifdef THC_REAL_IS_HALF @@ -196,7 +196,7 @@ static int torch_Storage_(totable)(lua_State *L) { THCState *state = cutorch_getstate(L); THCStorage *storage = luaT_checkudata(L, 1, torch_Storage); - long i; + ptrdiff_t i; /* Copy storage from device to host. */ #ifndef THC_REAL_IS_HALF @@ -235,6 +235,9 @@ static int torch_Storage_(write)(lua_State *L) THCStorage *storage = luaT_checkudata(L, 1, torch_Storage); THFile *file = luaT_checkudata(L, 2, "torch.File"); +#ifdef _MSC_VER + THAssert(storage->size < LONG_MAX); +#endif THFile_writeLongScalar(file, storage->size); THFile_writeRealRaw(file, storage->data, storage->size); diff --git a/torch/generic/Tensor.c b/torch/generic/Tensor.c index b3c85fe..d7dcd53 100644 --- a/torch/generic/Tensor.c +++ b/torch/generic/Tensor.c @@ -5,7 +5,7 @@ #include "THCHalf.h" static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index, int allowNone, int allowTensor, int allowStorage, int allowStride, - THCStorage **storage_, long *storageOffset_, THLongStorage **size_, THLongStorage **stride_); + THCStorage **storage_, ptrdiff_t *storageOffset_, THLongStorage **size_, THLongStorage **stride_); static void torch_Tensor_(c_readSizeStride)(lua_State *L, int index, int allowStride, THLongStorage **size_, THLongStorage **stride_); @@ -75,7 +75,7 @@ static int torch_Tensor_(storage)(lua_State *L) static int torch_Tensor_(storageOffset)(lua_State *L) { THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor); - lua_pushnumber(L, tensor->storageOffset+1); + lua_pushinteger(L, tensor->storageOffset+1); return 1; } @@ -83,14 +83,14 @@ static int torch_Tensor_(new)(lua_State *L) { THCState *state = cutorch_getstate(L); THCTensor *tensor; - long storageOffset; + ptrdiff_t storageOffset; THLongStorage *size, *stride; if(lua_type(L, 1) == LUA_TTABLE) { - long i, j; + ptrdiff_t i, j; THLongStorage *counter; - long si = 0; + ptrdiff_t si = 0; int dimension = 0; int is_finished = 0; @@ -221,7 +221,7 @@ static int torch_Tensor_(set)(lua_State *L) { THCTensor *self = luaT_checkudata(L, 1, torch_Tensor); THCStorage *storage; - long storageOffset; + ptrdiff_t storageOffset; THLongStorage *size, *stride; torch_Tensor_(c_readTensorStorageSizeStride)(L, 2, 1, 1, 1, 1, @@ -691,7 +691,7 @@ static int torch_Tensor_(isSameSizeAs)(lua_State *L) static int torch_Tensor_(nElement)(lua_State *L) { THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor); - lua_pushnumber(L, THCTensor_(nElement)(cutorch_getstate(L), tensor)); + lua_pushinteger(L, THCTensor_(nElement)(cutorch_getstate(L), tensor)); return 1; } @@ -801,7 +801,7 @@ static int torch_Tensor_(__newindex__)(lua_State *L) } else if((idx = luaT_toudata(L, 2, "torch.LongStorage"))) { - long index = THCTensor_(storageOffset)(state, tensor); + ptrdiff_t index = THCTensor_(storageOffset)(state, tensor); #ifdef THC_REAL_IS_HALF real value = THC_float2half((float) luaL_checknumber(L,3)); @@ -809,7 +809,7 @@ static int torch_Tensor_(__newindex__)(lua_State *L) real value = (real)luaL_checknumber(L,3); #endif - int dim; + ptrdiff_t dim; luaL_argcheck(L, idx->size == tensor->nDimension, 2, "invalid size"); @@ -1014,7 +1014,7 @@ static int torch_Tensor_(__index__)(lua_State *L) if(lua_isnumber(L, 2)) { - long index = luaL_checklong(L,2)-1; + ptrdiff_t index = luaL_checkinteger(L,2)-1; luaL_argcheck(L, tensor->nDimension > 0, 1, "empty tensor"); if (index < 0) index = tensor->size[0] + index + 1; @@ -1045,8 +1045,8 @@ static int torch_Tensor_(__index__)(lua_State *L) } else if((idx = luaT_toudata(L, 2, "torch.LongStorage"))) { - long index = THCTensor_(storageOffset)(state, tensor); - int dim; + ptrdiff_t index = THCTensor_(storageOffset)(state, tensor); + ptrdiff_t dim; luaL_argcheck(L, idx->size == tensor->nDimension, 2, "invalid size"); @@ -1258,7 +1258,7 @@ static void torch_Tensor_(c_readSizeStride)(lua_State *L, int index, int allowSt } static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index, int allowNone, int allowTensor, int allowStorage, int allowStride, - THCStorage **storage_, long *storageOffset_, THLongStorage **size_, THLongStorage **stride_) + THCStorage **storage_, ptrdiff_t *storageOffset_, THLongStorage **size_, THLongStorage **stride_) { THCState *state = cutorch_getstate(L); THCTensor *src = NULL; @@ -1293,7 +1293,7 @@ static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index } else { - *storageOffset_ = luaL_checklong(L, index+1)-1; + *storageOffset_ = luaL_checkinteger(L, index+1)-1; torch_Tensor_(c_readSizeStride)(L, index+2, allowStride, size_, stride_); } return; @@ -1335,6 +1335,9 @@ static int torch_Tensor_(write)(lua_State *L) THFile_writeIntScalar(file, tensor->nDimension); THFile_writeLongRaw(file, tensor->size, tensor->nDimension); THFile_writeLongRaw(file, tensor->stride, tensor->nDimension); +#ifdef _MSC_VER + THAssert(tensor->storageOffset+1 < LONG_MAX); +#endif THFile_writeLongScalar(file, tensor->storageOffset+1); /* to respect Lua convention */ lua_getfield(L, 2, "writeObject"); /* the method */ |