Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/torch/cutorch.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Christian Sarofeen <csarofeen@nvidia.com>, 2017-07-21 19:50:14 +0300
committer: Soumith Chintala <soumith@gmail.com>, 2017-08-27 05:23:50 +0300
commit: d891ff361afe33e118bc9798539453f8eb0000db (patch)
tree: f1cb3877c9f16938441b2ab9e5d8692f38a904bc
parent: 34cb2621266422b4e30c8bfb4ff7743a657799ce (diff)
Allowing larger grids for THCApply shows improved performance.
-rw-r--r-- lib/THC/THCApply.cuh 18
1 file changed, 9 insertions, 9 deletions
diff --git a/lib/THC/THCApply.cuh b/lib/THC/THCApply.cuh
index a47e303..e49a153 100644
--- a/lib/THC/THCApply.cuh
+++ b/lib/THC/THCApply.cuh
@@ -109,16 +109,16 @@ inline bool getApplyGrid(THCState* state, ptrdiff_t totalElements, dim3& grid) {
return false;
}
- // Assume a reasonable number of SMs if no state is available
- int numSM =
- state ? THCState_getCurrentDeviceProperties(state)->multiProcessorCount : 15;
-
- // 16 warps per block * 4 per SM gives 64 warps per SM at maximum,
- // which seems to be a good sweetspot for latency hiding
- grid = dim3(min((long long) THCCeilDiv(totalElements,
- (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK),
- 4LL * numSM));
+ if(THCState_getCurrentDeviceProperties(state)->major < 3){
+ grid = dim3(min((long long) THCCeilDiv(totalElements,
+ (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK), (long long) 64*1024-1));
+ return true;
+ }
+
+ grid = dim3((long long) THCCeilDiv(totalElements,
+ (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK) );
return true;
+
}
template <typename TensorTypeA,