diff options
author | Christian Sarofeen <csarofeen@nvidia.com> | 2017-07-21 19:50:14 +0300 |
---|---|---|
committer | Soumith Chintala <soumith@gmail.com> | 2017-08-27 05:23:50 +0300 |
commit | d891ff361afe33e118bc9798539453f8eb0000db (patch) | |
tree | f1cb3877c9f16938441b2ab9e5d8692f38a904bc | |
parent | 34cb2621266422b4e30c8bfb4ff7743a657799ce (diff) |
Allowing larger grids for THCApply shows improved performance.
-rw-r--r-- | lib/THC/THCApply.cuh | 18 |
1 file changed, 9 insertions, 9 deletions
diff --git a/lib/THC/THCApply.cuh b/lib/THC/THCApply.cuh index a47e303..e49a153 100644 --- a/lib/THC/THCApply.cuh +++ b/lib/THC/THCApply.cuh @@ -109,16 +109,16 @@ inline bool getApplyGrid(THCState* state, ptrdiff_t totalElements, dim3& grid) { return false; } - // Assume a reasonable number of SMs if no state is available - int numSM = - state ? THCState_getCurrentDeviceProperties(state)->multiProcessorCount : 15; - - // 16 warps per block * 4 per SM gives 64 warps per SM at maximum, - // which seems to be a good sweetspot for latency hiding - grid = dim3(min((long long) THCCeilDiv(totalElements, - (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK), - 4LL * numSM)); + if(THCState_getCurrentDeviceProperties(state)->major < 3){ + grid = dim3(min((long long) THCCeilDiv(totalElements, + (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK), (long long) 64*1024-1)); + return true; + } + + grid = dim3((long long) THCCeilDiv(totalElements, + (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK) ); return true; + } template <typename TensorTypeA, |