diff options
Diffstat (limited to 'lib/THC/THCApply.cuh')
-rw-r--r-- | lib/THC/THCApply.cuh | 18 |
1 files changed, 9 insertions, 9 deletions
diff --git a/lib/THC/THCApply.cuh b/lib/THC/THCApply.cuh index a47e303..e49a153 100644 --- a/lib/THC/THCApply.cuh +++ b/lib/THC/THCApply.cuh @@ -109,16 +109,16 @@ inline bool getApplyGrid(THCState* state, ptrdiff_t totalElements, dim3& grid) { return false; } - // Assume a reasonable number of SMs if no state is available - int numSM = - state ? THCState_getCurrentDeviceProperties(state)->multiProcessorCount : 15; - - // 16 warps per block * 4 per SM gives 64 warps per SM at maximum, - // which seems to be a good sweetspot for latency hiding - grid = dim3(min((long long) THCCeilDiv(totalElements, - (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK), - 4LL * numSM)); + if(THCState_getCurrentDeviceProperties(state)->major < 3){ + grid = dim3(min((long long) THCCeilDiv(totalElements, + (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK), (long long) 64*1024-1)); + return true; + } + + grid = dim3((long long) THCCeilDiv(totalElements, + (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK) ); return true; + } template <typename TensorTypeA, |